diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index db2f4c843f32..a96440a29d7e 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -225,7 +225,7 @@ Flax), PyTorch, and/or TensorFlow. | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | +| CvT | ❌ | ❌ | ✅ | ✅ | ❌ | | Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | | Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | | Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/cvt.mdx b/docs/source/en/model_doc/cvt.mdx index 84be7e39a550..a46ae68d58f8 100644 --- a/docs/source/en/model_doc/cvt.mdx +++ b/docs/source/en/model_doc/cvt.mdx @@ -51,3 +51,14 @@ This model was contributed by [anugunj](https://huggingface.co/anugunj). The ori [[autodoc]] CvtForImageClassification - forward + +## TFCvtModel + +[[autodoc]] TFCvtModel + - call + +## TFCvtForImageClassification + +[[autodoc]] TFCvtForImageClassification + - call + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8a982969fc1e..cf937f2a162b 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2357,6 +2357,14 @@ "TFCTRLPreTrainedModel", ] ) + _import_structure["models.cvt"].extend( + [ + "TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFCvtForImageClassification", + "TFCvtModel", + "TFCvtPreTrainedModel", + ] + ) _import_structure["models.data2vec"].extend( [ "TFData2VecVisionForImageClassification", @@ -5022,6 +5030,12 @@ TFCTRLModel, TFCTRLPreTrainedModel, ) + from .models.cvt import ( + TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFCvtForImageClassification, + TFCvtModel, + TFCvtPreTrainedModel, + ) from .models.data2vec import ( TFData2VecVisionForImageClassification, TFData2VecVisionForSemanticSegmentation, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index db462aa62186..95fe3f8aaaec 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -39,6 +39,7 @@ ("convbert", "TFConvBertModel"), ("convnext", "TFConvNextModel"), ("ctrl", "TFCTRLModel"), + ("cvt", "TFCvtModel"), ("data2vec-vision", "TFData2VecVisionModel"), ("deberta", "TFDebertaModel"), ("deberta-v2", "TFDebertaV2Model"), @@ -184,6 +185,7 @@ [ # Model for Image-classsification ("convnext", "TFConvNextForImageClassification"), + ("cvt", "TFCvtForImageClassification"), ("data2vec-vision", "TFData2VecVisionForImageClassification"), ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")), ("mobilevit", "TFMobileViTForImageClassification"), diff --git a/src/transformers/models/cvt/__init__.py b/src/transformers/models/cvt/__init__.py index 36a6f69824ef..66b18f334411 100644 --- a/src/transformers/models/cvt/__init__.py +++ b/src/transformers/models/cvt/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available _import_structure = {"configuration_cvt": ["CVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CvtConfig"]} @@ -36,6 +36,18 @@ "CvtPreTrainedModel", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_cvt"] = [ + "TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFCvtForImageClassification", + "TFCvtModel", + "TFCvtPreTrainedModel", + ] if TYPE_CHECKING: from .configuration_cvt import CVT_PRETRAINED_CONFIG_ARCHIVE_MAP, CvtConfig @@ -53,6 +65,20 @@ CvtPreTrainedModel, ) + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_cvt import ( + TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFCvtForImageClassification, + TFCvtModel, + TFCvtPreTrainedModel, + ) + + else: import sys diff --git a/src/transformers/models/cvt/modeling_tf_cvt.py b/src/transformers/models/cvt/modeling_tf_cvt.py new file mode 100644 index 000000000000..448bfd230288 --- /dev/null +++ b/src/transformers/models/cvt/modeling_tf_cvt.py @@ -0,0 +1,948 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 Cvt model.""" + + +import collections.abc +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import tensorflow as tf + +from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_cvt import CvtConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "CvtConfig" + +TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/cvt-13", + "microsoft/cvt-13-384", + "microsoft/cvt-13-384-22k", + "microsoft/cvt-21", + "microsoft/cvt-21-384", + "microsoft/cvt-21-384-22k", + # See all Cvt models at https://huggingface.co/models?filter=cvt +] + + +@dataclass +class TFBaseModelOutputWithCLSToken(ModelOutput): + """ + Base class for model's outputs. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`): + Classification token at the output of the last layer of the model. 
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + """ + + last_hidden_state: tf.Tensor = None + cls_token_value: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + + +class TFCvtDropPath(tf.keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_prob: float, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prob + + def call(self, x: tf.Tensor, training=None): + if self.drop_prob == 0.0 or not training: + return x + keep_prob = 1 - self.drop_prob + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + + +class TFCvtEmbeddings(tf.keras.layers.Layer): + """Construct the Convolutional Token Embeddings.""" + + def __init__( + self, + config: CvtConfig, + patch_size: int, + embed_dim: int, + stride: int, + padding: int, + dropout_rate: float, + **kwargs + ): + super().__init__(**kwargs) + self.convolution_embeddings = TFCvtConvEmbeddings( + config, + patch_size=patch_size, + embed_dim=embed_dim, + stride=stride, + padding=padding, + name="convolution_embeddings", + ) + self.dropout = tf.keras.layers.Dropout(dropout_rate) + + def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution_embeddings(pixel_values) + hidden_state = self.dropout(hidden_state, training=training) + return hidden_state + + +class TFCvtConvEmbeddings(tf.keras.layers.Layer): + """Image to Convolution Embeddings. 
This convolutional operation aims to model local spatial contexts.""" + + def __init__(self, config: CvtConfig, patch_size: int, embed_dim: int, stride: int, padding: int, **kwargs): + super().__init__(**kwargs) + self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) + self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + self.projection = tf.keras.layers.Conv2D( + filters=embed_dim, + kernel_size=patch_size, + strides=stride, + padding="valid", + data_format="channels_last", + kernel_initializer=get_initializer(config.initializer_range), + name="projection", + ) + # Using the same default epsilon as PyTorch + self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization") + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + if isinstance(pixel_values, dict): + pixel_values = pixel_values["pixel_values"] + + pixel_values = self.projection(self.padding(pixel_values)) + + # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels" + batch_size, height, width, num_channels = shape_list(pixel_values) + hidden_size = height * width + pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels)) + pixel_values = self.normalization(pixel_values) + + # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" + pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels)) + return pixel_values + + +class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): + """Convolutional projection layer.""" + + def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs): + super().__init__(**kwargs) + self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) + self.convolution = tf.keras.layers.Conv2D( + filters=embed_dim, + kernel_size=kernel_size, + kernel_initializer=get_initializer(config.initializer_range), + padding="valid", + strides=stride, + use_bias=False, + name="convolution", + groups=embed_dim, + ) + # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum) + self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution(self.padding(hidden_state)) + hidden_state = self.normalization(hidden_state, training=training) + return hidden_state + + +class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer): + """Linear projection layer used to flatten tokens into 1D.""" + + def call(self, hidden_state: tf.Tensor) -> tf.Tensor: + # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels" + batch_size, height, width, num_channels = shape_list(hidden_state) + hidden_size = height * width + hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels)) + return hidden_state + + +class TFCvtSelfAttentionProjection(tf.keras.layers.Layer): + """Convolutional Projection for Attention.""" + + def __init__( + self, + config: CvtConfig, + embed_dim: int, + kernel_size: int, + stride: int, + padding: int, + projection_method: str = "dw_bn", + **kwargs + ): + super().__init__(**kwargs) + if projection_method == "dw_bn": + self.convolution_projection = TFCvtSelfAttentionConvProjection( + config, embed_dim, kernel_size, stride, padding, name="convolution_projection" + ) + self.linear_projection = 
TFCvtSelfAttentionLinearProjection() + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution_projection(hidden_state, training=training) + hidden_state = self.linear_projection(hidden_state) + return hidden_state + + +class TFCvtSelfAttention(tf.keras.layers.Layer): + """ + Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for + query, key, and value embeddings. + """ + + def __init__( + self, + config: CvtConfig, + num_heads: int, + embed_dim: int, + kernel_size: int, + stride_q: int, + stride_kv: int, + padding_q: int, + padding_kv: int, + qkv_projection_method: str, + qkv_bias: bool, + attention_drop_rate: float, + with_cls_token: bool = True, + **kwargs + ): + super().__init__(**kwargs) + self.scale = embed_dim**-0.5 + self.with_cls_token = with_cls_token + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.convolution_projection_query = TFCvtSelfAttentionProjection( + config, + embed_dim, + kernel_size, + stride_q, + padding_q, + projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method, + name="convolution_projection_query", + ) + self.convolution_projection_key = TFCvtSelfAttentionProjection( + config, + embed_dim, + kernel_size, + stride_kv, + padding_kv, + projection_method=qkv_projection_method, + name="convolution_projection_key", + ) + self.convolution_projection_value = TFCvtSelfAttentionProjection( + config, + embed_dim, + kernel_size, + stride_kv, + padding_kv, + projection_method=qkv_projection_method, + name="convolution_projection_value", + ) + + self.projection_query = tf.keras.layers.Dense( + units=embed_dim, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=qkv_bias, + bias_initializer="zeros", + name="projection_query", + ) + self.projection_key = tf.keras.layers.Dense( + units=embed_dim, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=qkv_bias, + bias_initializer="zeros", + name="projection_key", + ) + self.projection_value = tf.keras.layers.Dense( + units=embed_dim, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=qkv_bias, + bias_initializer="zeros", + name="projection_value", + ) + self.dropout = tf.keras.layers.Dropout(attention_drop_rate) + + def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor: + batch_size, hidden_size, _ = shape_list(hidden_state) + head_dim = self.embed_dim // self.num_heads + hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim)) + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3)) + return hidden_state + + def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: + if self.with_cls_token: + cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1) + + # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" + batch_size, hidden_size, num_channels = shape_list(hidden_state) + hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels)) + + key = self.convolution_projection_key(hidden_state, training=training) + query = self.convolution_projection_query(hidden_state, training=training) + value = self.convolution_projection_value(hidden_state, training=training) + + if self.with_cls_token: + query = tf.concat((cls_token, query), axis=1) + key = tf.concat((cls_token, key), axis=1) + value = 
tf.concat((cls_token, value), axis=1) + + head_dim = self.embed_dim // self.num_heads + + query = self.rearrange_for_multi_head_attention(self.projection_query(query)) + key = self.rearrange_for_multi_head_attention(self.projection_key(key)) + value = self.rearrange_for_multi_head_attention(self.projection_value(value)) + + attention_score = tf.matmul(query, key, transpose_b=True) * self.scale + attention_probs = stable_softmax(logits=attention_score, axis=-1) + attention_probs = self.dropout(attention_probs, training=training) + + context = tf.matmul(attention_probs, value) + # "batch_size, num_heads, hidden_size, head_dim -> batch_size, hidden_size, (num_heads*head_dim)" + _, _, hidden_size, _ = shape_list(context) + context = tf.transpose(context, perm=(0, 2, 1, 3)) + context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim)) + return context + + +class TFCvtSelfOutput(tf.keras.layers.Layer): + """Output of the Attention layer .""" + + def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = tf.keras.layers.Dropout(drop_rate) + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.dense(inputs=hidden_state) + hidden_state = self.dropout(inputs=hidden_state, training=training) + return hidden_state + + +class TFCvtAttention(tf.keras.layers.Layer): + """Attention layer. First chunk of the convolutional transformer block.""" + + def __init__( + self, + config: CvtConfig, + num_heads: int, + embed_dim: int, + kernel_size: int, + stride_q: int, + stride_kv: int, + padding_q: int, + padding_kv: int, + qkv_projection_method: str, + qkv_bias: bool, + attention_drop_rate: float, + drop_rate: float, + with_cls_token: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.attention = TFCvtSelfAttention( + config, + num_heads, + embed_dim, + kernel_size, + stride_q, + stride_kv, + padding_q, + padding_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + with_cls_token, + name="attention", + ) + self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False): + self_output = self.attention(hidden_state, height, width, training=training) + attention_output = self.dense_output(self_output, training=training) + return attention_output + + +class TFCvtIntermediate(tf.keras.layers.Layer): + """Intermediate dense layer. Second chunk of the convolutional transformer block.""" + + def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + units=int(embed_dim * mlp_ratio), + kernel_initializer=get_initializer(config.initializer_range), + activation="gelu", + name="dense", + ) + + def call(self, hidden_state: tf.Tensor) -> tf.Tensor: + hidden_state = self.dense(hidden_state) + return hidden_state + + +class TFCvtOutput(tf.keras.layers.Layer): + """ + Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection. 
+ """ + + def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: int, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = tf.keras.layers.Dropout(drop_rate) + + def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.dense(inputs=hidden_state) + hidden_state = self.dropout(inputs=hidden_state, training=training) + hidden_state = hidden_state + input_tensor + return hidden_state + + +class TFCvtLayer(tf.keras.layers.Layer): + """ + Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It + consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the + `Block` class in the original implementation. + """ + + def __init__( + self, + config: CvtConfig, + num_heads: int, + embed_dim: int, + kernel_size: int, + stride_q: int, + stride_kv: int, + padding_q: int, + padding_kv: int, + qkv_projection_method: str, + qkv_bias: bool, + attention_drop_rate: float, + drop_rate: float, + mlp_ratio: float, + drop_path_rate: float, + with_cls_token: bool = True, + **kwargs + ): + super().__init__(**kwargs) + self.attention = TFCvtAttention( + config, + num_heads, + embed_dim, + kernel_size, + stride_q, + stride_kv, + padding_q, + padding_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + drop_rate, + with_cls_token, + name="attention", + ) + self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate") + self.dense_output = TFCvtOutput(config, embed_dim, drop_rate, name="output") + # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour. + self.drop_path = ( + TFCvtDropPath(drop_path_rate, name="drop_path") + if drop_path_rate > 0.0 + else tf.keras.layers.Activation("linear", name="drop_path") + ) + # Using the same default epsilon as PyTorch + self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before") + self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after") + + def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: + # in Cvt, layernorm is applied before self-attention + attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training) + attention_output = self.drop_path(attention_output, training=training) + + # first residual connection + hidden_state = attention_output + hidden_state + + # in Cvt, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_state) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.dense_output(layer_output, hidden_state) + layer_output = self.drop_path(layer_output, training=training) + return layer_output + + +class TFCvtStage(tf.keras.layers.Layer): + """ + Cvt stage (encoder block). Each stage has 2 parts : + - (1) A Convolutional Token Embedding layer + - (2) A Convolutional Transformer Block (layer). + The classification token is added only in the last stage. + + Args: + config ([`CvtConfig`]): Model configuration class. + stage (`int`): Stage number. 
+ """ + + def __init__(self, config: CvtConfig, stage: int, **kwargs): + super().__init__(**kwargs) + self.config = config + self.stage = stage + if self.config.cls_token[self.stage]: + self.cls_token = self.add_weight( + shape=(1, 1, self.config.embed_dim[-1]), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="cvt.encoder.stages.2.cls_token", + ) + + self.embedding = TFCvtEmbeddings( + self.config, + patch_size=config.patch_sizes[self.stage], + stride=config.patch_stride[self.stage], + embed_dim=config.embed_dim[self.stage], + padding=config.patch_padding[self.stage], + dropout_rate=config.drop_rate[self.stage], + name="embedding", + ) + + drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage]) + drop_path_rates = [x.numpy().item() for x in drop_path_rates] + self.layers = [ + TFCvtLayer( + config, + num_heads=config.num_heads[self.stage], + embed_dim=config.embed_dim[self.stage], + kernel_size=config.kernel_qkv[self.stage], + stride_q=config.stride_q[self.stage], + stride_kv=config.stride_kv[self.stage], + padding_q=config.padding_q[self.stage], + padding_kv=config.padding_kv[self.stage], + qkv_projection_method=config.qkv_projection_method[self.stage], + qkv_bias=config.qkv_bias[self.stage], + attention_drop_rate=config.attention_drop_rate[self.stage], + drop_rate=config.drop_rate[self.stage], + mlp_ratio=config.mlp_ratio[self.stage], + drop_path_rate=drop_path_rates[self.stage], + with_cls_token=config.cls_token[self.stage], + name=f"layers.{j}", + ) + for j in range(config.depth[self.stage]) + ] + + def call(self, hidden_state: tf.Tensor, training: bool = False): + cls_token = None + hidden_state = self.embedding(hidden_state, training) + + # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels" + batch_size, height, width, num_channels = shape_list(hidden_state) + hidden_size = height * width + hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels)) + + if self.config.cls_token[self.stage]: + cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0) + hidden_state = tf.concat((cls_token, hidden_state), axis=1) + + for layer in self.layers: + layer_outputs = layer(hidden_state, height, width, training=training) + hidden_state = layer_outputs + + if self.config.cls_token[self.stage]: + cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1) + + # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" + hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels)) + return hidden_state, cls_token + + +class TFCvtEncoder(tf.keras.layers.Layer): + """ + Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers + (depth) being 1, 2 and 10. + + Args: + config ([`CvtConfig`]): Model configuration class. 
+ """ + + config_class = CvtConfig + + def __init__(self, config: CvtConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.stages = [ + TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth)) + ] + + def call( + self, + pixel_values: TFModelInputType, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + hidden_state = pixel_values + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width) + # as input format. So change the input format to (batch_size, height, width, num_channels). + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1)) + + cls_token = None + for _, (stage_module) in enumerate(self.stages): + hidden_state, cls_token = stage_module(hidden_state, training=training) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + # Change back to (batch_size, num_channels, height, width) format to have uniformity in the modules + hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2)) + if output_hidden_states: + all_hidden_states = tuple([tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states]) + + if not return_dict: + return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) + + return TFBaseModelOutputWithCLSToken( + last_hidden_state=hidden_state, + cls_token_value=cls_token, + hidden_states=all_hidden_states, + ) + + +@keras_serializable +class TFCvtMainLayer(tf.keras.layers.Layer): + """Construct the Cvt model.""" + + config_class = CvtConfig + + def __init__(self, config: CvtConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.encoder = TFCvtEncoder(config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + encoder_outputs = self.encoder( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return TFBaseModelOutputWithCLSToken( + last_hidden_state=sequence_output, + cls_token_value=encoder_outputs.cls_token_value, + hidden_states=encoder_outputs.hidden_states, + ) + + +class TFCvtPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CvtConfig + base_model_prefix = "cvt" + main_input_name = "pixel_values" + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. 
+        """
+        VISION_DUMMY_INPUTS = tf.random.uniform(shape=(3, self.config.num_channels, 224, 224), dtype=tf.float32)
+        return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)}
+
+    @tf.function(
+        input_signature=[
+            {
+                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+            }
+        ]
+    )
+    def serving(self, inputs):
+        """
+        Method used for serving the model.
+
+        Args:
+            inputs (`Dict[str, tf.Tensor]`):
+                The input of the saved model as a dictionary of tensors.
+        """
+        output = self.call(inputs)
+        return self.serving_output(output)
+
+
+TFCVT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    <Tip>
+
+    TF 2.0 models accept two formats as inputs:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all the
+    tensors in the first argument of the model call function: `model(inputs)`.
+
+    </Tip>
+
+    Args:
+        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TFCVT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
+            [`AutoFeatureExtractor.__call__`] for details.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can only be used in eager mode; in graph mode, the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode; in graph mode, the value will always be set to `True`.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+""" + + +@add_start_docstrings( + "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.", + TFCVT_START_DOCSTRING, +) +class TFCvtModel(TFCvtPreTrainedModel): + def __init__(self, config: CvtConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.cvt = TFCvtMainLayer(config, name="cvt") + + @unpack_inputs + @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoFeatureExtractor, TFCvtModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/cvt-13") + >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + outputs = self.cvt( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return (outputs[0],) + outputs[1:] + + return TFBaseModelOutputWithCLSToken( + last_hidden_state=outputs.last_hidden_state, + cls_token_value=outputs.cls_token_value, + hidden_states=outputs.hidden_states, + ) + + def serving_output(self, output: TFBaseModelOutputWithCLSToken) -> TFBaseModelOutputWithCLSToken: + return TFBaseModelOutputWithCLSToken( + last_hidden_state=output.last_hidden_state, + cls_token_value=output.cls_token_value, + hidden_states=output.hidden_states, + ) + + +@add_start_docstrings( + """ + Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + """, + TFCVT_START_DOCSTRING, +) +class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: CvtConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.cvt = TFCvtMainLayer(config, name="cvt") + # Using same default epsilon as in the original implementation. 
+ self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm") + + # Classifier head + self.classifier = tf.keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=True, + bias_initializer="zeros", + name="classifier", + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoFeatureExtractor, TFCvtForImageClassification + >>> import tensorflow as tf + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/cvt-13") + >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] + >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) + ```""" + + outputs = self.cvt( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + cls_token = outputs[1] + if self.config.cls_token[-1]: + sequence_output = self.layernorm(cls_token) + else: + # rearrange "batch_size, num_channels, height, width -> batch_size, (height*width), num_channels" + batch_size, num_channels, height, width = shape_list(sequence_output) + sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width)) + sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1)) + sequence_output = self.layernorm(sequence_output) + + sequence_output_mean = tf.reduce_mean(sequence_output, axis=1) + logits = self.classifier(sequence_output_mean) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + def serving_output(self, output: TFImageClassifierOutputWithNoAttention) -> TFImageClassifierOutputWithNoAttention: + return TFImageClassifierOutputWithNoAttention(logits=output.logits, hidden_states=output.hidden_states) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 7cec699498c9..dbfe14c7f633 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ 
b/src/transformers/utils/dummy_tf_objects.py @@ -786,6 +786,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFCvtForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFCvtModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFCvtPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + class TFData2VecVisionForImageClassification(metaclass=DummyObject): _backends = ["tf"] diff --git a/tests/models/cvt/test_modeling_tf_cvt.py b/tests/models/cvt/test_modeling_tf_cvt.py new file mode 100644 index 000000000000..9e261a5f25be --- /dev/null +++ b/tests/models/cvt/test_modeling_tf_cvt.py @@ -0,0 +1,271 @@ +""" Testing suite for the Tensorflow CvT model. """ + + +import inspect +import unittest +from math import floor + +import numpy as np + +from transformers import CvtConfig +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import cached_property, is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFCvtForImageClassification, TFCvtModel + from transformers.models.cvt.modeling_tf_cvt import TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoFeatureExtractor + + +class TFCvtConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "embed_dim")) + self.parent.assertTrue(hasattr(config, "num_heads")) + + +class TFCvtModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + embed_dim=[16, 48, 96], + num_heads=[1, 3, 6], + depth=[1, 2, 10], + patch_sizes=[7, 3, 3], + patch_stride=[4, 2, 2], + patch_padding=[2, 1, 1], + stride_kv=[2, 2, 2], + cls_token=[False, False, True], + attention_drop_rate=[0.0, 0.0, 0.0], + initializer_range=0.02, + layer_norm_eps=1e-12, + is_training=True, + use_labels=True, + num_labels=2, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_sizes = patch_sizes + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.is_training = is_training + self.use_labels = use_labels + self.num_labels = num_labels + self.num_channels = num_channels + self.embed_dim = embed_dim + self.num_heads = num_heads + self.stride_kv = stride_kv + self.depth = depth + self.cls_token = cls_token + self.attention_drop_rate = attention_drop_rate + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + # create a random int32 tensor of given shape + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return CvtConfig( + image_size=self.image_size, + num_labels=self.num_labels, + 
num_channels=self.num_channels, + embed_dim=self.embed_dim, + num_heads=self.num_heads, + patch_sizes=self.patch_sizes, + patch_padding=self.patch_padding, + patch_stride=self.patch_stride, + stride_kv=self.stride_kv, + depth=self.depth, + cls_token=self.cls_token, + attention_drop_rate=self.attention_drop_rate, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFCvtModel(config=config) + result = model(pixel_values, training=False) + image_size = (self.image_size, self.image_size) + height, width = image_size[0], image_size[1] + for i in range(len(self.depth)): + height = floor(((height + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) + width = floor(((width + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dim[-1], height, width)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = TFCvtForImageClassification(config) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFCvtModelTest(TFModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Cvt + does not use input_ids, inputs_embeds, attention_mask and seq_length. + """ + + all_model_classes = (TFCvtModel, TFCvtForImageClassification) if is_tf_available() else () + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCvtModelTester(self) + self.config_tester = TFCvtConfigTester(self, config_class=CvtConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + @unittest.skip(reason="Cvt does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="Cvt does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Cvt does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + def test_dataset_conversion(self): + super().test_dataset_conversion() + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + def test_keras_fit(self): + super().test_keras_fit() + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for 
model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.hidden_states + + expected_num_layers = len(self.model_tester.depth) + self.assertEqual(len(hidden_states), expected_num_layers) + + # verify the first hidden states (first block) + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [ + self.model_tester.embed_dim[0], + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFCvtModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class TFCvtModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return AutoFeatureExtractor.from_pretrained(TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0]) + + @slow + def test_inference_image_classification_head(self): + model = TFCvtForImageClassification.from_pretrained(TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0]) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = tf.constant([0.9285, 0.9015, -0.3150]) + self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4))
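
Below is a minimal, self-contained usage sketch of the TF classes this diff adds, mirroring the docstring examples in `modeling_tf_cvt.py`. It assumes a `transformers` build that already includes this patch, plus TensorFlow, Pillow, and `requests`; the checkpoint name and image URL are the ones used in the diff's own examples.

```python
# Illustrative only: exercises the new TFCvtForImageClassification head end to end.
import requests
import tensorflow as tf
from PIL import Image

from transformers import AutoFeatureExtractor, TFCvtForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/cvt-13")
model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

# The feature extractor returns pixel_values of shape (batch_size, num_channels, height, width),
# which TFCvtEncoder transposes internally to channels-last for the Conv2D layers.
inputs = feature_extractor(images=image, return_tensors="tf")
outputs = model(**inputs)

# One logit per ImageNet class; argmax gives the predicted label index.
predicted_class_idx = int(tf.math.argmax(outputs.logits, axis=-1)[0])
print("Predicted class:", model.config.id2label[predicted_class_idx])
```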