From 4bd737f49d213b96ffe410361a1195962aa1b2de Mon Sep 17 00:00:00 2001 From: steven Date: Wed, 26 Mar 2025 12:17:26 +0100 Subject: [PATCH 1/6] (draft) --- .../models/rf_detr/configuration_rf_detr.py | 145 + .../models/rf_detr/modeling_rf_detr.py | 3758 +++++++++++++++++ .../models/rf_detr/modular_rf_detr.py | 543 +++ 3 files changed, 4446 insertions(+) create mode 100644 src/transformers/models/rf_detr/configuration_rf_detr.py create mode 100644 src/transformers/models/rf_detr/modeling_rf_detr.py create mode 100644 src/transformers/models/rf_detr/modular_rf_detr.py diff --git a/src/transformers/models/rf_detr/configuration_rf_detr.py b/src/transformers/models/rf_detr/configuration_rf_detr.py new file mode 100644 index 000000000000..0d4275597772 --- /dev/null +++ b/src/transformers/models/rf_detr/configuration_rf_detr.py @@ -0,0 +1,145 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 + +from ...configuration_utils import PretrainedConfig +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +class RFDetrConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RFDetrModel`]. It is used to instantiate an + RFDetr model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv2 with Registers + [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. 
+ patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of register tokens to use. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. 
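+        num_windows (`int`, *optional*, defaults to 1):
+            Number of windows along each spatial dimension used for windowed self-attention in the backbone. With the
+            default value of 1, no windowing is applied and every block attends over the full set of patches.
+        window_block_indexes (`List[int]`, *optional*):
+            Indexes of the encoder blocks that restrict self-attention to local windows when `num_windows` is greater
+            than 1; blocks that are not listed attend over all patches.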
+ + Example: + + ```python + >>> from transformers import RFDetrConfig, RFDetrModel + + >>> # Initializing a RFDetr base style configuration + >>> configuration = RFDetrConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = RFDetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rf_detr" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + num_windows=1, + window_block_indexes=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.num_windows = num_windows + self.window_block_indexes = window_block_indexes diff --git a/src/transformers/models/rf_detr/modeling_rf_detr.py b/src/transformers/models/rf_detr/modeling_rf_detr.py new file mode 100644 index 000000000000..e03e7b511d5e --- /dev/null +++ b/src/transformers/models/rf_detr/modeling_rf_detr.py @@ -0,0 +1,3758 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +import copy +import math +import os +import warnings +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_ninja_available, + is_timm_available, + is_torch_cuda_available, + is_torchdynamo_compiling, + logging, + replace_return_docstrings, + requires_backends, + torch_int, +) +from ...utils.backbone_utils import BackboneMixin, load_backbone +from .configuration_rf_detr import RFConfig, RFDetrConfig + + +if is_timm_available(): + from timm import create_model + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "RFDetrConfig" + + +class RFDetrPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class RFDetrEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, register tokens, position and patch embeddings. 
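+
+    When `config.num_windows` is greater than 1, the patch grid is split into `num_windows x num_windows`
+    non-overlapping windows after the position embeddings are added, and the CLS and register tokens are replicated
+    once per window.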
+ """ + + def __init__(self, config: RFDetrConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = RFDetrPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), 
dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        if self.config.num_windows > 1:
+            # reshape for windows
+            num_h_patches = height // self.config.patch_size
+            num_w_patches = width // self.config.patch_size
+            cls_token_with_pos_embed = embeddings[:, :1]
+            pixel_tokens_with_pos_embed = embeddings[:, 1:]
+            pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view(
+                batch_size, num_h_patches, num_w_patches, -1
+            )
+            num_w_patches_per_window = num_w_patches // self.config.num_windows
+            num_h_patches_per_window = num_h_patches // self.config.num_windows
+            num_windows = self.config.num_windows
+            windowed_pixel_tokens = pixel_tokens_with_pos_embed.view(
+                batch_size, num_windows, num_h_patches_per_window, num_windows, num_w_patches_per_window, -1
+            )
+            windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5)
+            windowed_pixel_tokens = windowed_pixel_tokens.reshape(
+                batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1
+            )
+            windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1)
+            embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1)
+
+        # add register tokens
+        embeddings = (
+            torch.cat(
+                (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+            )
+            if self.config.num_register_tokens > 0
+            else embeddings
+        )
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    # Take the dot product between "query" and "key" to get the raw attention scores.
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+    # Normalize the attention scores to probabilities.
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+    # This is actually dropping out entire tokens to attend to, which might
+    # seem a bit unusual, but is taken from the original Transformer paper.
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    # Mask heads if we want to
+    if attention_mask is not None:
+        attn_weights = attn_weights * attention_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class RFDetrSelfAttention(nn.Module):
+    def __init__(self, config: RFDetrConfig) -> None:
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class RFDetrSelfOutput(nn.Module): + """ + The residual connection is defined in RFDetrLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
+ """ + + def __init__(self, config: RFDetrConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class RFDetrAttention(nn.Module): + def __init__(self, config: RFDetrConfig) -> None: + super().__init__() + self.attention = RFDetrSelfAttention(config) + self.output = RFDetrSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class RFDetrLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
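+
+    Example (illustrative):
+
+    ```python
+    >>> import torch
+
+    >>> hidden_states = torch.ones(4, 16)
+    >>> output = drop_path(hidden_states, drop_prob=0.25, training=True)
+    >>> # every sample (row) is either zeroed out entirely or rescaled by 1 / 0.75,
+    >>> # so the expected value of the output matches the input
+    ```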
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class RFDetrDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class RFDetrMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class RFDetrSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class RFDetrLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = RFDetrAttention(config) + self.layer_scale1 = RFDetrLayerScale(config) + self.drop_path = RFDetrDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = RFDetrSwiGLUFFN(config) + else: + self.mlp = RFDetrMLP(config) + self.layer_scale2 = RFDetrLayerScale(config) + + self.num_windows = config.num_windows + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + run_full_attention: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + assert head_mask is None, "head_mask is not supported for windowed attention" + assert not output_attentions, "output_attentions is not supported for windowed attention" + shortcut = hidden_states + if run_full_attention: + # reshape x to remove windows + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, 
C) + + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + if run_full_attention: + # reshape x to add windows back + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) + attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + shortcut + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +MultiScaleDeformableAttention = None + + +class MultiScaleDeformableAttentionFunction(Function): + @staticmethod + def forward( + context, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + context.im2col_step = im2col_step + output = MultiScaleDeformableAttention.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + context.im2col_step, + ) + context.save_for_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + ) + return output + + @staticmethod + @once_differentiable + def backward(context, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = context.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + context.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +class RFLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
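+
+    Row and column indices (up to 50 each) are embedded separately and concatenated along the channel dimension, so
+    the returned position map has `2 * embedding_dim` channels and the spatial size of the input feature map.
+
+    Shape example (illustrative):
+
+    ```python
+    >>> import torch
+
+    >>> position_embedding = RFLearnedPositionEmbedding(embedding_dim=128)
+    >>> feature_map = torch.zeros(2, 256, 32, 32)
+    >>> list(position_embedding(feature_map).shape)
+    [2, 256, 32, 32]
+    ```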
+ """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + global MultiScaleDeformableAttention + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "r_f" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + MultiScaleDeformableAttention = load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + +def multi_scale_deformable_attention( + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class RFMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. 
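+
+    Instead of attending to every spatial position, each query predicts a small set of sampling offsets and attention
+    weights per head and per feature level; the value features are bilinearly sampled at those locations and
+    aggregated, so each query only attends to `n_heads * n_levels * n_points` locations rather than the full feature
+    maps.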
+ """ + + def __init__(self, config: RFConfig, num_heads: int, n_points: int): + super().__init__() + + kernel_loaded = MultiScaleDeformableAttention is not None + if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded: + try: + load_cuda_kernels() + except Exception as e: + logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in RFMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." + ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + total_elements = sum(height * width for height, width in spatial_shapes_list) + if total_elements != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + 
sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + if self.disable_custom_kernels or MultiScaleDeformableAttention is None or is_torchdynamo_compiling(): + # PyTorch implementation + output = multi_scale_deformable_attention( + value, spatial_shapes_list, sampling_locations, attention_weights + ) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention( + value, spatial_shapes_list, sampling_locations, attention_weights + ) + output = self.output_proj(output) + + return output, attention_weights + + +class RFEncoderLayer(nn.Module): + def __init__(self, config: RFConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = RFMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. 
+ hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class RFPreTrainedModel(PreTrainedModel): + config_class = RFConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [r"RFConvEncoder", r"RFEncoderLayer", r"RFDecoderLayer"] + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, RFLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, RFMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + 
nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + +class RFDetrEncoder(RFPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`RFDetrEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: RFDetrConfig + """ + + def __init__(self, config: RFConfig): + super().__init__(config) + self.gradient_checkpointing = False + + self.dropout = config.dropout + self.layers = nn.ModuleList([RFEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + spatial_shapes_tuple = tuple(spatial_shapes_list) + reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + position_embeddings, + reference_points, + spatial_shapes, + spatial_shapes_list, + level_start_index, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +R_F_DETR_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`RFDetrConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +R_F_DETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BitImageProcessor.preprocess`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    """
+    RFDetr backbone, to be used with frameworks like DETR and MaskFormer.
+    """,
+    R_F_DETR_START_DOCSTRING,
+)
+class RFDetrBackbone(RFDetrPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+        self.embeddings = RFDetrEmbeddings(config)
+        self.encoder = RFDetrEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.num_register_tokens = config.num_register_tokens
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> RFDetrPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    @add_start_docstrings_to_model_forward(R_F_DETR_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+        ... 
) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + B, HW, C = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) + hidden_state = hidden_state.view( + B // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + C, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +class RFDetrLayerNorm(nn.Module): + """ + A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the + channel dimension for inputs that have shape (batch_size, channels, height, width). 
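+    It is equivalent to permuting the input to channels-last, applying `torch.nn.LayerNorm` over the channel
+    dimension, and permuting back, but it operates directly on channels-first tensors.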
+ https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 + """ + + def __init__(self, normalized_shape, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.normalized_shape = (normalized_shape,) + + def forward(self, x): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class ConvX(nn.Module): + """Conv-bn module""" + + def __init__(self, in_planes, out_planes, kernel=3, stride=1, groups=1, dilation=1, act="relu"): + super(ConvX, self).__init__() + self.conv = nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel, + stride=stride, + padding=kernel // 2, + groups=groups, + dilation=dilation, + bias=False, + ) + self.bn = nn.BatchNorm2d(out_planes) + self.act = ACT2FN[act] + + def forward(self, x): + """forward""" + out = self.act(self.bn(self.conv(x))) + return out + + +class Bottleneck(nn.Module): + """Standard bottleneck.""" + + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, act="silu"): + """ch_in, ch_out, shortcut, groups, kernels, expand""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = ConvX(c1, c_, k[0], 1, act=act) + self.cv2 = ConvX(c_, c2, k[1], 1, groups=g, act=act) + self.add = shortcut and c1 == c2 + + def forward(self, x): + """'forward()' applies the YOLOv5 FPN to input data.""" + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class RFDetrC2f(nn.Module): + """Faster Implementation of CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, act="silu"): + """ch_in, ch_out, number, shortcut, groups, expansion""" + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = ConvX(c1, 2 * self.c, 1, 1, act=act) + self.cv2 = ConvX((2 + n) * self.c, c2, 1, act=act) # optional act=FReLU(c2) + self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=(3, 3), e=1.0, act=act) for _ in range(n)) + + def forward(self, x): + """Forward pass using split() instead of chunk().""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + + +class RFDetrMultiScaleProjector(nn.Module): + """ + This module implements MultiScaleProjector in :paper:`lwdetr`. + It creates pyramid features built on top of the input feature map. + """ + + def __init__( + self, + in_channels, + out_channels, + scale_factors, + num_blocks=3, + layer_norm=False, + rms_norm=False, + survival_prob=1.0, + force_drop_last_n_features=0, + ): + """ + Args: + net (Backbone): module representing the subnetwork backbone. + Must be a subclass of :class:`Backbone`. + out_channels (int): number of channels in the output feature maps. + scale_factors (list[float]): list of scaling factors to upsample or downsample + the input features for creating pyramid features. 
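+            num_blocks (int): number of bottleneck blocks in each fusion stage.
+            survival_prob (float): probability of keeping the deeper input feature maps during training; when it is
+                smaller than 1.0, later feature maps are randomly zeroed out with increasing probability.
+            force_drop_last_n_features (int): number of trailing input feature maps that are always replaced with
+                zeros before fusion.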
+ """ + super().__init__() + + self.scale_factors = scale_factors + self.survival_prob = survival_prob + self.force_drop_last_n_features = force_drop_last_n_features + + stages_sampling = [] + stages = [] + # use_bias = norm == "" + use_bias = False + self.use_extra_pool = False + for scale in scale_factors: + stages_sampling.append([]) + for in_dim in in_channels: + out_dim = in_dim + layers = [] + + # if in_dim > 512: + # layers.append(ConvX(in_dim, in_dim // 2, kernel=1)) + # in_dim = in_dim // 2 + + if scale == 4.0: + layers.extend( + [ + nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), + RFDetrLayerNorm(in_dim // 2), + nn.GELU(), + nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), + ] + ) + out_dim = in_dim // 4 + elif scale == 2.0: + # a hack to reduce the FLOPs and Params when the dimention of output feature is too large + # if in_dim > 512: + # layers = [ + # ConvX(in_dim, in_dim // 2, kernel=1), + # nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), + # ] + # out_dim = in_dim // 4 + # else: + layers.extend( + [ + nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), + ] + ) + out_dim = in_dim // 2 + elif scale == 1.0: + pass + elif scale == 0.5: + layers.extend( + [ + ConvX(in_dim, in_dim, 3, 2, layer_norm=layer_norm), + ] + ) + elif scale == 0.25: + self.use_extra_pool = True + continue + else: + raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) + layers = nn.Sequential(*layers) + stages_sampling[-1].append(layers) + stages_sampling[-1] = nn.ModuleList(stages_sampling[-1]) + + in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) + layers = [ + RFDetrC2f(in_dim, out_channels, num_blocks, layer_norm=layer_norm), + RFDetrLayerNorm(out_channels), + ] + layers = nn.Sequential(*layers) + stages.append(layers) + + self.stages_sampling = nn.ModuleList(stages_sampling) + self.stages = nn.ModuleList(stages) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: + mapping from feature map name to pyramid feature map tensor + in high to low resolution order. Returned feature names follow the FPN + convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. + """ + num_features = len(x) + if self.survival_prob < 1.0 and self.training: + final_drop_prob = 1 - self.survival_prob + drop_p = np.random.uniform() + for i in range(1, num_features): + critical_drop_prob = i * (final_drop_prob / (num_features - 1)) + if drop_p < critical_drop_prob: + x[i][:] = 0 + elif self.force_drop_last_n_features > 0: + for i in range(self.force_drop_last_n_features): + # don't do it inplace to ensure the compiler can optimize out the backbone layers + x[-(i + 1)] = torch.zeros_like(x[-(i + 1)]) + + results = [] + # x list of len(out_features_indexes) + for i, stage in enumerate(self.stages): + feat_fuse = [] + for j, stage_sampling in enumerate(self.stages_sampling[i]): + feat_fuse.append(stage_sampling(x[j])) + if len(feat_fuse) > 1: + feat_fuse = torch.cat(feat_fuse, dim=1) + else: + feat_fuse = feat_fuse[0] + results.append(stage(feat_fuse)) + if self.use_extra_pool: + results.append(F.max_pool2d(results[-1], kernel_size=1, stride=2, padding=0)) + return results + + +class RFMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. 
+
+    Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, target_len, embed_dim = hidden_states.size()
+        # add position embeddings to the hidden states before projecting to queries and keys;
+        # values are always projected from the original hidden states (without position embeddings)
+        hidden_states_original = hidden_states
+        if position_embeddings is not None:
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        # get queries, keys and values
+        query_states = self.q_proj(hidden_states) * self.scaling
+        key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
+        value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        source_len = key_states.size(1)
+
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, target_len, source_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+                    f" {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class RFDetrDecoderLayer(nn.Module): + def __init__(self, config: RFConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = RFMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = RFMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
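+
+        The layer applies self-attention, multi-scale deformable cross-attention and a feed-forward network in
+        sequence, each followed by dropout, a residual connection and layer normalization.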
+ """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +@dataclass +class RFDecoderOutput(ModelOutput): + """ + Base class for outputs of the RFDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. 
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class RFDecoderLayer(nn.Module): + def __init__(self, config: RFConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = RFMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = RFMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class RFDetrDecoder(RFPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`RFDetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Deformable DETR: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: RFDetrConfig + """ + + def __init__(self, config: RFConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([RFDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. 
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+                Reference points in range `[0, 1]`, top-left (0, 0), bottom-right (1, 1), including padding area.
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indexes for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
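+
+        When `bbox_embed` is set (iterative bounding box refinement), each decoder layer refines the reference
+        points roughly as `sigmoid(bbox_embed[idx](hidden_states) + inverse_sigmoid(reference_points))`, and the
+        refined points are detached before being passed to the next layer.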
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif reference_points.shape[-1] == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Reference points' last dimension must be of size 2") + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + position_embeddings, + reference_points_input, + spatial_shapes, + spatial_shapes_list, + level_start_index, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return RFDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + 
intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class RFDetrPreTrainedModel(PreTrainedModel): + config_class = RFConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [r"RFDetrConvEncoder", r"RFDetrEncoderLayer", r"RFDetrDecoderLayer"] + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, RFLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, RFMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + +@dataclass +class RFModelOutput(ModelOutput): + """ + Base class for outputs of the Deformable DETR encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). 
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. 
+ """ + + init_reference_points: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + + +class RFFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `RFFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = RFFrozenBatchNorm2d(module.num_features) + + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class RFConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by RFFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
+ requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +class RFConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +class RFSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
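+
+    Example (illustrative sketch; the output shape depends only on `embedding_dim` and the mask resolution):
+
+    ```python
+    >>> pos_embed = RFSinePositionEmbedding(embedding_dim=128, normalize=True)
+    >>> pixel_values = torch.zeros(1, 256, 32, 32)
+    >>> pixel_mask = torch.ones(1, 32, 32, dtype=torch.long)
+    >>> list(pos_embed(pixel_values, pixel_mask).shape)
+    [1, 256, 32, 32]
+    ```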
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype) + x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class RFEncoder(RFPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`RFEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: RFConfig + """ + + def __init__(self, config: RFConfig): + super().__init__(config) + self.gradient_checkpointing = False + + self.dropout = config.dropout + self.layers = nn.ModuleList([RFEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
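+
+        Note that the reference points used by the deformable attention layers are not passed in: they are computed
+        internally from `spatial_shapes` and `valid_ratios` via `get_reference_points`.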
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + spatial_shapes_tuple = tuple(spatial_shapes_list) + reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + position_embeddings, + reference_points, + spatial_shapes, + spatial_shapes_list, + level_start_index, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class RFDecoder(RFPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`RFDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Deformable DETR: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: RFConfig + """ + + def __init__(self, config: RFConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([RFDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. 
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+                Reference points in range `[0, 1]`, top-left (0, 0), bottom-right (1, 1), including padding area.
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indexes for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif reference_points.shape[-1] == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Reference points' last dimension must be of size 2") + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + position_embeddings, + reference_points_input, + spatial_shapes, + spatial_shapes_list, + level_start_index, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return RFDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + 
intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = RFSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = RFLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +R_F_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`RFConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +R_F_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`RFImageProcessor.__call__`] + for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + """ + The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. + """, + R_F_START_DOCSTRING, +) +class RFDetrModel(RFPreTrainedModel): + def __init__(self, config: RFConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = RFConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = RFConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = RFEncoder(config) + self.decoder = RFDecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask, dtype=torch.float32): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.to(dtype) / height + valid_ratio_width = valid_width.to(dtype) / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> 
batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=enc_output.dtype, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=enc_output.dtype, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(R_F_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], RFModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, RFDetrModel + >>> from PIL import Image + >>> import requests + + >>> url = 
"http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = RFDetrModel.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to( + torch.bool + )[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes_list = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes_list.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m, 
dtype=source_flatten.dtype) for m in masks], 1) + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=source_flatten, + attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes_list + ) + + # hack implementation for two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + 
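# With return_dict=False, callers index the tuple above positionally (outputs[0] is init_reference_points,
+        # outputs[2] the stacked intermediate hidden states, outputs[3] the intermediate reference points), as done
+        # in RFDetrForObjectDetection.forward below; the dataclass returned here exposes the same tensors by name.
+        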
return RFModelOutput(
+            init_reference_points=init_reference_points,
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord_logits=enc_outputs_coord_logits,
+        )
+
+
+@dataclass
+class RFObjectDetectionOutput(ModelOutput):
+    """
+    Output type of [`RFDetrForObjectDetection`].
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (`Dict`, *optional*):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use [`~RFProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
+        auxiliary_outputs (`list[Dict]`, *optional*):
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+            `pred_boxes`) for each decoder layer.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+            plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+            average in the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
+            4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
+            in the self-attention heads.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+            foreground and background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+@add_start_docstrings(
+    """
+    The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
+    hidden-states without any specific head on top.
+ """, + R_F_START_DOCSTRING, +) +class RFModel(RFPreTrainedModel): + def __init__(self, config: RFConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = RFConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = RFConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = RFEncoder(config) + self.decoder = RFDecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask, dtype=torch.float32): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.to(dtype) / height + valid_ratio_width = valid_width.to(dtype) / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. 
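+
+        Each position of the flattened multi-scale feature map is turned into a coarse box proposal centered on
+        that position, with width/height `0.05 * 2**level`; in the two-stage variant these proposals are refined by
+        the detection head and the top-scoring ones become the initial decoder reference points.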
+ + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=enc_output.dtype, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=enc_output.dtype, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(R_F_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], RFModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, RFModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = RFModel.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = 
image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to( + torch.bool + )[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes_list = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes_list.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1) + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + 
inputs_embeds=source_flatten, + attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes_list + ) + + # hack implementation for two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return RFModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + 
decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +class RFMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +@add_start_docstrings( + """ + Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + R_F_START_DOCSTRING, +) +class RFDetrForObjectDetection(RFPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: RFConfig): + super().__init__(config) + + # Deformable DETR encoder-decoder model + self.model = RFModel(config) + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = RFMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + for box_embed in self.bbox_embed: + nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + # Initialize weights and apply final processing + self.post_init() + + 
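# Note on the heads built above: with `with_box_refine=True` each decoder layer gets its own deep-copied
+    # class_embed / bbox_embed (the clones are flagged via `_tied_weights_keys`), and with `two_stage=True` the extra
+    # last head scores the encoder proposals; without box refinement every entry of the ModuleList points to the same
+    # module instance, so all levels share one detection head.
+
+    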
@add_start_docstrings_to_model_forward(R_F_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[List[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], RFObjectDetectionOutput]: + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = RFDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... 
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = RFObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs diff --git a/src/transformers/models/rf_detr/modular_rf_detr.py b/src/transformers/models/rf_detr/modular_rf_detr.py new file mode 100644 index 000000000000..30de1bdcd4a5 --- /dev/null +++ 
b/src/transformers/models/rf_detr/modular_rf_detr.py
@@ -0,0 +1,543 @@
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ...activations import ACT2FN
+from ...configuration_utils import PretrainedConfig
+from ...modeling_outputs import BackboneOutput, BaseModelOutput
+from ..auto import CONFIG_MAPPING
+from ..deformable_detr.modeling_deformable_detr import (
+    DeformableDetrDecoder,
+    DeformableDetrDecoderLayer,
+    DeformableDetrEncoder,
+    DeformableDetrForObjectDetection,
+    DeformableDetrModel,
+    DeformableDetrPreTrainedModel,
+)
+from ..dinov2_with_registers.configuration_dinov2_with_registers import Dinov2WithRegistersConfig
+from ..dinov2_with_registers.modeling_dinov2_with_registers import (
+    Dinov2WithRegistersBackbone,
+    Dinov2WithRegistersEmbeddings,
+    Dinov2WithRegistersEncoder,
+    Dinov2WithRegistersLayer,
+)
+from ..vitdet.modeling_vitdet import VitDetLayerNorm
+
+
+class RFDetrConfig(PretrainedConfig):
+    model_type = "rf_detr"
+    sub_configs = {"backbone_config": Dinov2WithRegistersConfig}
+
+    def __init__(
+        self,
+        backbone_config=None,
+        num_windows: int = 4,
+        window_block_indexes=None,
+        out_feature_indexes: List[int] = [2, 5, 8, 11],
+        scale_factors: Optional[List[float]] = None,
+        layer_norm: bool = False,
+        rms_norm: bool = False,
+        **kwargs,
+    ):
+        self.out_feature_indexes = out_feature_indexes
+
+        if isinstance(backbone_config, dict):
+            backbone_config["out_indices"] = out_feature_indexes
+            backbone_config["model_type"] = (
+                backbone_config["model_type"] if "model_type" in backbone_config else "dinov2_with_registers"
+            )
+            backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config)
+        elif backbone_config is None:
+            backbone_config = CONFIG_MAPPING["dinov2_with_registers"](out_indices=out_feature_indexes)
+        self.backbone_config = backbone_config
+        self.backbone_config.num_windows = num_windows
+        self.backbone_config.window_block_indexes = (
+            list(range(backbone_config.num_hidden_layers)) if window_block_indexes is None else window_block_indexes
+        )
+
+        self.scale_factors = [1.0] if scale_factors is None else scale_factors
+        assert len(self.scale_factors) > 0, "scale_factors must be a list of at least one element"
+        assert sorted(self.scale_factors) == self.scale_factors, "scale_factors must be sorted"
+        assert all(scale in [2.0, 1.0, 0.5, 0.25] for scale in self.scale_factors), (
+            "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]"
+        )
+
+        self.layer_norm = layer_norm
+        self.rms_norm = rms_norm
+        super().__init__(**kwargs)
+
+
+class RFDetrEmbeddings(Dinov2WithRegistersEmbeddings):
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+        if bool_masked_pos is not None:
+            embeddings = torch.where(
+                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+            )
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        if self.config.num_windows > 1:
+            # reshape for windows
+            num_h_patches = height // self.config.patch_size
+            num_w_patches = width // self.config.patch_size
+            cls_token_with_pos_embed = embeddings[:, :1]
+            pixel_tokens_with_pos_embed = embeddings[:, 1:]
+            pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view(
+                batch_size, num_h_patches, num_w_patches, -1
+            )
+            num_w_patches_per_window = num_w_patches // self.config.num_windows
+            num_h_patches_per_window = num_h_patches // self.config.num_windows
+            num_windows = self.config.num_windows
+            windowed_pixel_tokens = pixel_tokens_with_pos_embed.view(
+                batch_size, num_windows, num_h_patches_per_window, num_windows, num_w_patches_per_window, -1
+            )
+            windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5)
+            windowed_pixel_tokens = windowed_pixel_tokens.reshape(
+                batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1
+            )
+            windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1)
+            embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1)
+
+        # add register tokens
+        embeddings = (
+            torch.cat(
+                (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+            )
+            if self.config.num_register_tokens > 0
+            else embeddings
+        )
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class RFDetrLayer(Dinov2WithRegistersLayer):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_windows = config.num_windows
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        run_full_attention: bool = False,
+    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+        assert head_mask is None, "head_mask is not supported for windowed attention"
+        assert not output_attentions, "output_attentions is not supported for windowed attention"
+        shortcut = hidden_states
+        if run_full_attention:
+            # reshape x to remove windows
+            B, HW, C = hidden_states.shape
+            num_windows_squared = self.num_windows**2
+            hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C)
+
+        self_attention_outputs = self.attention(
+            self.norm1(hidden_states),  # in Dinov2WithRegisters, layernorm is applied before self-attention
+            head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+
+        if run_full_attention:
+            # reshape x to add windows back
+            B, HW, C = hidden_states.shape
+            num_windows_squared = self.num_windows**2
+            # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C)
+            attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C)
+
+        attention_output = self.layer_scale1(attention_output)
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        # first residual connection
+        hidden_states = self.drop_path(attention_output) + shortcut
+
+        # in Dinov2WithRegisters, layernorm is also applied after self-attention
+        layer_output = self.norm2(hidden_states)
+        layer_output = self.mlp(layer_output)
+        layer_output = self.layer_scale2(layer_output)
+
+        # second residual connection
+        layer_output = self.drop_path(layer_output) + hidden_states
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+class RFDetrEncoder(Dinov2WithRegistersEncoder):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        
output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if i > int(self.config.out_features[-1][5:]): # TODO check this + # early stop if we have reached the last output feature + break + + run_full_attention = i not in self.config.window_block_indexes + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + run_full_attention, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, run_full_attention) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class RFDetrBackbone(Dinov2WithRegistersBackbone): + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + Returns: + + Examples: + + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... 
) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + B, HW, C = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) + hidden_state = hidden_state.view( + B // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + C, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +class RFDetrLayerNorm(VitDetLayerNorm): + pass + + +class ConvX(nn.Module): + """Conv-bn module""" + + def __init__(self, in_planes, out_planes, kernel=3, stride=1, groups=1, dilation=1, act="relu"): + super(ConvX, self).__init__() + self.conv = nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel, + stride=stride, + padding=kernel // 2, + groups=groups, + dilation=dilation, + bias=False, + ) + self.bn = nn.BatchNorm2d(out_planes) + self.act = ACT2FN[act] + + def forward(self, x): + """forward""" + out = self.act(self.bn(self.conv(x))) + return out + + +class Bottleneck(nn.Module): + """Standard bottleneck.""" + + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, act="silu"): + """ch_in, ch_out, shortcut, groups, kernels, expand""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = ConvX(c1, c_, k[0], 1, act=act) + self.cv2 = ConvX(c_, c2, k[1], 1, groups=g, act=act) + self.add = shortcut and c1 == c2 + + def forward(self, x): + """'forward()' applies the 
YOLOv5 FPN to input data.""" + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class RFDetrC2f(nn.Module): + """Faster Implementation of CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, act="silu"): + """ch_in, ch_out, number, shortcut, groups, expansion""" + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = ConvX(c1, 2 * self.c, 1, 1, act=act) + self.cv2 = ConvX((2 + n) * self.c, c2, 1, act=act) # optional act=FReLU(c2) + self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=(3, 3), e=1.0, act=act) for _ in range(n)) + + def forward(self, x): + """Forward pass using split() instead of chunk().""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + + +class RFDetrMultiScaleProjector(nn.Module): + """ + This module implements MultiScaleProjector in :paper:`lwdetr`. + It creates pyramid features built on top of the input feature map. + """ + + def __init__( + self, + config: RFDetrConfig, + in_channels, + out_channels, + scale_factors, + num_blocks=3, + ): + """ + Args: + net (Backbone): module representing the subnetwork backbone. + Must be a subclass of :class:`Backbone`. + out_channels (int): number of channels in the output feature maps. + scale_factors (list[float]): list of scaling factors to upsample or downsample + the input features for creating pyramid features. + """ + super().__init__() + + self.scale_factors = config.scale_factors + in_channels = [config.backbone_config.hidden_size] * len(config.out_feature_indexes) + + stages_sampling = [] + stages = [] + + self.use_extra_pool = False + for scale in scale_factors: + stages_sampling.append([]) + for in_dim in in_channels: + layers = [] + + # if in_dim > 512: + # layers.append(ConvX(in_dim, in_dim // 2, kernel=1)) + # in_dim = in_dim // 2 + + if scale == 4.0: + layers.extend( + [ + nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), + RFDetrLayerNorm(in_dim // 2), + nn.GELU(), + nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), + ] + ) + elif scale == 2.0: + # a hack to reduce the FLOPs and Params when the dimention of output feature is too large + # if in_dim > 512: + # layers = [ + # ConvX(in_dim, in_dim // 2, kernel=1), + # nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), + # ] + # out_dim = in_dim // 4 + # else: + layers.extend( + [ + nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), + ] + ) + elif scale == 1.0: + pass + elif scale == 0.5: + layers.extend( + [ + ConvX(in_dim, in_dim, 3, 2, layer_norm=config.layer_norm), + ] + ) + elif scale == 0.25: + self.use_extra_pool = True + continue + else: + raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) + layers = nn.Sequential(*layers) + stages_sampling[-1].append(layers) + stages_sampling[-1] = nn.ModuleList(stages_sampling[-1]) + + in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) + layers = [ + RFDetrC2f(in_dim, out_channels, num_blocks, layer_norm=config.layer_norm), + RFDetrLayerNorm(out_channels), + ] + layers = nn.Sequential(*layers) + stages.append(layers) + + self.stages_sampling = nn.ModuleList(stages_sampling) + self.stages = nn.ModuleList(stages) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. 
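+                In this model the projector actually receives a list of backbone feature maps (one per entry in
+                `config.out_feature_indexes`), each of shape (N, C, H, W).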
+ Returns: + dict[str->Tensor]: + mapping from feature map name to pyramid feature map tensor + in high to low resolution order. Returned feature names follow the FPN + convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. + """ + results = [] + # x list of len(out_features_indexes) + for i, stage in enumerate(self.stages): + feat_fuse = [] + for j, stage_sampling in enumerate(self.stages_sampling[i]): + feat_fuse.append(stage_sampling(x[j])) + if len(feat_fuse) > 1: + feat_fuse = torch.cat(feat_fuse, dim=1) + else: + feat_fuse = feat_fuse[0] + results.append(stage(feat_fuse)) + if self.use_extra_pool: + results.append(F.max_pool2d(results[-1], kernel_size=1, stride=2, padding=0)) + return results + + +class RFDetrDecoderLayer(DeformableDetrDecoderLayer): + pass + + +class RFDetrDecoder(DeformableDetrDecoder): + pass + + +class RFDetrPreTrainedModel(DeformableDetrPreTrainedModel): + pass + + +class RFDetrDecoder(DeformableDetrDecoder): + pass + + +class RFDetrEncoder(DeformableDetrEncoder): + pass + + +class RFDetrModel(DeformableDetrModel): + pass + + +class RFDetrForObjectDetection(DeformableDetrForObjectDetection): + pass From 17a0e328e7f4aa7f180c5d21e6a33ea47fb5ae25 Mon Sep 17 00:00:00 2001 From: steven Date: Wed, 26 Mar 2025 12:17:26 +0100 Subject: [PATCH 2/6] (draft) two modeling files --- src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 5 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/rf_detr/__init__.py | 31 + .../models/rf_detr/configuration_rf_detr.py | 384 +- ...iguration_rf_detr_dinov2_with_registers.py | 145 + .../models/rf_detr/modeling_rf_detr.py | 4396 +++++------------ .../modeling_rf_detr_dinov2_with_registers.py | 850 ++++ .../models/rf_detr/modular_rf_detr.py | 543 -- src/transformers/models/rf_detr/run_rfdetr.py | 18 + 11 files changed, 2701 insertions(+), 3693 deletions(-) create mode 100644 src/transformers/models/rf_detr/__init__.py create mode 100644 src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py create mode 100644 src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py delete mode 100644 src/transformers/models/rf_detr/modular_rf_detr.py create mode 100644 src/transformers/models/rf_detr/run_rfdetr.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e8da536747d4..c0ad2f21a733 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -762,6 +762,7 @@ "RoFormerConfig", "RoFormerTokenizer", ], + "models.rf_detr": ["RFDetrConfig", "RFDetrDinov2WithRegistersConfig"], "models.rt_detr": ["RTDetrConfig", "RTDetrResNetConfig"], "models.rt_detr_v2": ["RTDetrV2Config"], "models.rwkv": ["RwkvConfig"], @@ -3536,6 +3537,15 @@ "load_tf_weights_in_roformer", ] ) + _import_structure["models.rf_detr"].extend( + [ + "RFDetrForObjectDetection", + "RFDetrModel", + "RFDetrPreTrainedModel", + "RFDetrDinov2WithRegistersBackbone", + "RFDetrDinov2WithRegistersPreTrainedModel", + ] + ) _import_structure["models.rt_detr"].extend( [ "RTDetrForObjectDetection", @@ -5987,6 +5997,7 @@ from .models.regnet import RegNetConfig from .models.rembert import RemBertConfig from .models.resnet import ResNetConfig + from .models.rf_detr import RFDetrConfig, RFDetrDinov2WithRegistersConfig from .models.roberta import ( RobertaConfig, RobertaTokenizer, @@ -8316,6 +8327,13 @@ ResNetModel, ResNetPreTrainedModel, ) + from .models.rf_detr import ( + 
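# Illustrative sketch (not lines from this patch): the fuse-then-refine pattern of
# RFDetrMultiScaleProjector.forward with plain tensor ops. Two already-resampled backbone levels are
# concatenated on the channel axis and a 1x1 conv stands in for the C2f + LayerNorm stage; all sizes
# here are assumptions for the demo.
import torch
from torch import nn

level_a = torch.randn(1, 64, 16, 16)  # e.g. hidden states tapped at out_feature_indexes[0]
level_b = torch.randn(1, 64, 16, 16)  # e.g. hidden states tapped at out_feature_indexes[1]

fused = torch.cat([level_a, level_b], dim=1)            # (1, 128, 16, 16): channel-wise fusion
refine = nn.Conv2d(fused.shape[1], 256, kernel_size=1)  # stand-in for RFDetrC2f + RFDetrLayerNorm
pyramid_level = refine(fused)                           # (1, 256, 16, 16): one projector output

# when use_extra_pool is set (scale factor 0.25), one more level is appended by pooling the last output
extra_level = nn.functional.max_pool2d(pyramid_level, kernel_size=1, stride=2, padding=0)
print(pyramid_level.shape, extra_level.shape)  # (1, 256, 16, 16) and (1, 256, 8, 8)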
RFDetrDinov2WithRegistersBackbone, + RFDetrDinov2WithRegistersPreTrainedModel, + RFDetrForObjectDetection, + RFDetrModel, + RFDetrPreTrainedModel, + ) from .models.roberta import ( RobertaForCausalLM, RobertaForMaskedLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 49ce48dd6c04..d3c044778c46 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -235,6 +235,7 @@ regnet, rembert, resnet, + rf_detr, roberta, roberta_prelayernorm, roc_bert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c7ef472882ba..152f35e95ab5 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -261,6 +261,8 @@ ("rembert", "RemBertConfig"), ("resnet", "ResNetConfig"), ("retribert", "RetriBertConfig"), + ("rf_detr_dinov2_with_registers", "RFDetrDinov2WithRegistersConfig"), + ("rf_detr", "RFDetrConfig"), ("roberta", "RobertaConfig"), ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), ("roc_bert", "RoCBertConfig"), @@ -615,6 +617,8 @@ ("rembert", "RemBERT"), ("resnet", "ResNet"), ("retribert", "RetriBERT"), + ("rf_detr", "RF-DETR"), + ("rf_detr_dinov2_with_registers", "RF-DETR-DINOv2 with Registers"), ("roberta", "RoBERTa"), ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"), ("roc_bert", "RoCBert"), @@ -766,6 +770,7 @@ ("smolvlm_vision", "smolvlm"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", "rt_detr"), + ("rf_detr_dinov2_with_registers", "rf_detr"), ("granitevision", "llava_next"), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 05a415741413..72f06cb2d1cb 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -238,6 +238,7 @@ ("rembert", "RemBertModel"), ("resnet", "ResNetModel"), ("retribert", "RetriBertModel"), + ("rf_detr", "RFDetrModel"), ("roberta", "RobertaModel"), ("roberta-prelayernorm", "RobertaPreLayerNormModel"), ("roc_bert", "RoCBertModel"), @@ -920,6 +921,7 @@ ("deformable_detr", "DeformableDetrForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("rf_detr", "RFDetrForObjectDetection"), ("rt_detr", "RTDetrForObjectDetection"), ("rt_detr_v2", "RTDetrV2ForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), @@ -1440,6 +1442,7 @@ ("nat", "NatBackbone"), ("pvt_v2", "PvtV2Backbone"), ("resnet", "ResNetBackbone"), + ("rf_detr_dinov2_with_registers", "RFDetrDinov2WithRegistersBackbone"), ("rt_detr_resnet", "RTDetrResNetBackbone"), ("swin", "SwinBackbone"), ("swinv2", "Swinv2Backbone"), diff --git a/src/transformers/models/rf_detr/__init__.py b/src/transformers/models/rf_detr/__init__.py new file mode 100644 index 000000000000..46dba76871ff --- /dev/null +++ b/src/transformers/models/rf_detr/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
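# Illustrative sketch (not lines from this patch): what the auto-class registrations above are meant
# to enable once the draft builds. No released checkpoint is assumed, so everything is instantiated
# from a fresh, randomly initialized config rather than via from_pretrained.
from transformers import AutoConfig, AutoModelForObjectDetection

config = AutoConfig.for_model("rf_detr")                 # resolves to RFDetrConfig via CONFIG_MAPPING_NAMES
model = AutoModelForObjectDetection.from_config(config)  # resolves to RFDetrForObjectDetection
print(type(config).__name__, type(model).__name__)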
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_rf_detr import * + from .configuration_rf_detr_dinov2_with_registers import * + from .modeling_rf_detr import * + from .modeling_rf_detr_dinov2_with_registers import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/rf_detr/configuration_rf_detr.py b/src/transformers/models/rf_detr/configuration_rf_detr.py index 0d4275597772..dc62390c7762 100644 --- a/src/transformers/models/rf_detr/configuration_rf_detr.py +++ b/src/transformers/models/rf_detr/configuration_rf_detr.py @@ -1,86 +1,153 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr.py. -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_rf_detr.py file directly. One of our CI enforces this. -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Deformable DETR model configuration""" + +from typing import List from ...configuration_utils import PretrainedConfig -from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) -class RFDetrConfig(BackboneConfigMixin, PretrainedConfig): +class RFDetrConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`RFDetrModel`]. It is used to instantiate an - RFDetr model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the DINOv2 with Registers - [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture. + This is the configuration class to store the configuration of a [`RFDetrModel`]. It is used to instantiate + a Deformable DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Deformable DETR + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`RFDetrModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. + max_position_embeddings (``, *optional*, defaults to 1024): + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer encoder. - mlp_ratio (`int`, *optional*, defaults to 4): - Ratio of the hidden size of the MLPs relative to the `hidden_size`. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + is_encoder_decoder (``, *optional*, defaults to `True`): + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + `"relu"`, `"silu"` and `"gelu_new"` are supported. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + dropout (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. 
- patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. - layerscale_value (`float`, *optional*, defaults to 1.0): - Initial value to use for layer scale. - drop_path_rate (`float`, *optional*, defaults to 0.0): - Stochastic depth rate per sample (when applied in the main path of residual layers). - use_swiglu_ffn (`bool`, *optional*, defaults to `False`): - Whether to use the SwiGLU feedforward neural network. - num_register_tokens (`int`, *optional*, defaults to 4): - Number of register tokens to use. - out_features (`List[str]`, *optional*): - If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. - (depending on how many stages the model has). If unset and `out_indices` is set, will default to the - corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. - out_indices (`List[int]`, *optional*): - If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how - many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. - If unset and `out_features` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. - apply_layernorm (`bool`, *optional*, defaults to `True`): - Whether to apply layer normalization to the feature maps in case the model is used as backbone. - reshape_hidden_states (`bool`, *optional*, defaults to `True`): - Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in - case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, - seq_len, hidden_size)`. - - Example: + init_xavier_std (`float`, *optional*, defaults to 1.0): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + return_intermediate (``, *optional*, defaults to `True`): + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. 
+ encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Deformable DETR, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
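# Illustrative sketch (not lines from this patch): the coupling between `two_stage` and
# `with_box_refine` that __init__ enforces further down. The first call is a valid combination, the
# second raises; everything else is left at its defaults.
from transformers import RFDetrConfig

RFDetrConfig(two_stage=True, with_box_refine=True)  # two-stage requires iterative box refinement
try:
    RFDetrConfig(two_stage=True, with_box_refine=False)
except ValueError as err:
    print(err)  # If two_stage is True, with_box_refine must be True.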
+ out_feature_indexes (`List`, *optional*, defaults to `[2, 5, 8, 11]`): + scale_factors (`List`, *optional*, defaults to `[1.0]`): + layer_norm (`bool`, *optional*, defaults to `False`): + rms_norm (`bool`, *optional*, defaults to `False`): + projector_out_channels (`int`, *optional*, defaults to 256): + projector_num_blocks (`int`, *optional*, defaults to 3): + + Examples: ```python >>> from transformers import RFDetrConfig, RFDetrModel - >>> # Initializing a RFDetr base style configuration + >>> # Initializing a Deformable DETR SenseTime/deformable-detr style configuration >>> configuration = RFDetrConfig() - >>> # Initializing a model (with random weights) from the base style configuration + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration >>> model = RFDetrModel(configuration) >>> # Accessing the model configuration @@ -88,58 +155,157 @@ class RFDetrConfig(BackboneConfigMixin, PretrainedConfig): ```""" model_type = "rf_detr" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } def __init__( self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=224, - patch_size=16, + use_timm_backbone=True, + backbone_config=None, num_channels=3, - qkv_bias=True, - layerscale_value=1.0, - drop_path_rate=0.0, - use_swiglu_ffn=False, - num_register_tokens=4, - out_features=None, - out_indices=None, - apply_layernorm=True, - reshape_hidden_states=True, - num_windows=1, - window_block_indexes=None, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=1024, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + return_intermediate=True, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + backbone_kwargs=None, + dilation=False, + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=False, + two_stage_num_proposals=300, + with_box_refine=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + out_feature_indexes: List[int] = [2, 5, 8, 11], + scale_factors: List[float] = [1.0], + layer_norm: bool = False, + rms_norm: bool = False, + projector_out_channels: int = 256, + projector_num_blocks: int = 3, # TODO rename **kwargs, ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. 
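# Illustrative sketch (not lines from this patch): a quick reading of the RF-DETR-specific arguments
# above. Each layer index in `out_feature_indexes` taps one backbone level (all at the backbone's
# patch stride), and each entry in `scale_factors` asks the projector for one pyramid level relative
# to that stride. The input resolution and the chosen factors below are assumptions for the demo.
image_size, patch_size = 512, 16     # assumed square input; patch_size matches the ViT backbone default
out_feature_indexes = [2, 5, 8, 11]  # transformer layers whose hidden states feed the projector
scale_factors = [0.5, 1.0, 2.0]      # request half, same and double the backbone resolution

backbone_grid = image_size // patch_size  # 32 patches per side
pyramid_grids = [int(backbone_grid * factor) for factor in scale_factors]
print(len(out_feature_indexes), "backbone levels fused into", len(scale_factors), "pyramid levels")
print("pyramid grid sizes per side:", pyramid_grids)  # [16, 32, 64]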
+ if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): + if backbone_config is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.use_swiglu_ffn = use_swiglu_ffn - self.num_register_tokens = num_register_tokens - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.backbone_kwargs = backbone_kwargs + self.dilation = dilation + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + + self.scale_factors = [1.0] if scale_factors is None else scale_factors + assert len(self.scale_factors) > 0, "scale_factors must be a list of at least one element" + assert 
sorted(self.scale_factors) == self.scale_factors, "scale_factors must be sorted" + assert all(scale in [2.0, 1.0, 0.5, 0.25] for scale in self.scale_factors), ( + "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]" ) - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states - self.num_windows = num_windows - self.window_block_indexes = window_block_indexes + + self.layer_norm = layer_norm + self.rms_norm = rms_norm + self.projector_out_channels = projector_out_channels + self.projector_num_blocks = projector_num_blocks + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +__all__ = ["RFDetrConfig"] diff --git a/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..6af39627f87b --- /dev/null +++ b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py @@ -0,0 +1,145 @@ +from ...configuration_utils import PretrainedConfig +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +class RFDetrDinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig): + r""" + TODO + This is the configuration class to store the configuration of a [`RFDetrDinov2WithRegistersModel`]. It is used to instantiate an + RFDetrDinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv2 with Registers + [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. 
+ num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of register tokens to use. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. 
+ + Example: + + ```python + >>> from transformers import RFDetrDinov2WithRegistersConfig, RFDetrDinov2WithRegistersModel + + >>> # Initializing a RFDetrDinov2WithRegisters base style configuration + >>> configuration = RFDetrDinov2WithRegistersConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = RFDetrDinov2WithRegistersModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rf_detr_dinov2_with_registers" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + num_windows: int = 4, + window_block_indexes=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + + self.num_windows = num_windows + self.window_block_indexes = ( + list(range(self.num_hidden_layers)) if window_block_indexes is None else window_block_indexes + ) + + +__all__ = ["RFDetrDinov2WithRegistersConfig"] diff --git a/src/transformers/models/rf_detr/modeling_rf_detr.py b/src/transformers/models/rf_detr/modeling_rf_detr.py index e03e7b511d5e..88ebbc663a8d 100644 --- a/src/transformers/models/rf_detr/modeling_rf_detr.py +++ b/src/transformers/models/rf_detr/modeling_rf_detr.py @@ -1,19 +1,11 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr.py. -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_rf_detr.py file directly. One of our CI enforces this. 
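# Illustrative sketch (not lines from this patch): how `num_windows` and `window_block_indexes`
# interact. By default every block is windowed; the split below, which leaves the tapped blocks
# global, is an assumed example rather than a value read from a released RF-DETR checkpoint.
from transformers.models.rf_detr import RFDetrDinov2WithRegistersConfig

config = RFDetrDinov2WithRegistersConfig(
    num_hidden_layers=12,
    num_windows=4,                                   # the patch grid is split into a 4 x 4 grid of windows
    window_block_indexes=[0, 1, 3, 4, 6, 7, 9, 10],  # these blocks attend within their window only
    out_indices=[2, 5, 8, 11],                       # the remaining, global blocks feed the projector
)
global_blocks = [i for i in range(config.num_hidden_layers) if i not in config.window_block_indexes]
print(global_blocks)  # [2, 5, 8, 11]
# note: the patch grid (height // patch_size and width // patch_size) must be divisible by
# num_windows for the windowing reshape in the embeddings and backbone to hold.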
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -import collections.abc import copy import math import os import warnings from dataclasses import dataclass from pathlib import Path -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union -import numpy as np import torch import torch.nn.functional as F from torch import Tensor, nn @@ -22,9 +14,9 @@ from ...activations import ACT2FN from ...modeling_attn_mask_utils import _prepare_4d_attention_mask -from ...modeling_outputs import BackboneOutput, BaseModelOutput -from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid from ...utils import ( ModelOutput, add_start_docstrings, @@ -36,481 +28,53 @@ logging, replace_return_docstrings, requires_backends, - torch_int, ) -from ...utils.backbone_utils import BackboneMixin, load_backbone -from .configuration_rf_detr import RFConfig, RFDetrConfig - - -if is_timm_available(): - from timm import create_model +from ...utils.backbone_utils import load_backbone +from .configuration_rf_detr import RFDetrConfig logger = logging.get_logger(__name__) -# General docstring -_CONFIG_FOR_DOC = "RFDetrConfig" - - -class RFDetrPatchEmbeddings(nn.Module): - """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. - """ - - def __init__(self, config): - super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.hidden_size - - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) - - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - num_channels = pixel_values.shape[1] - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." - f" Expected {self.num_channels} but got {num_channels}." - ) - embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) - return embeddings - - -class RFDetrEmbeddings(nn.Module): - """ - Construct the CLS token, mask token, register tokens, position and patch embeddings. 
- """ - - def __init__(self, config: RFDetrConfig) -> None: - super().__init__() - - self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) - self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) - self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) - self.patch_embeddings = RFDetrPatchEmbeddings(config) - num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.patch_size = config.patch_size - self.config = config - - def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: - """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility - with the original implementation. - - Adapted from: - - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py - - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py - """ - num_patches = embeddings.shape[1] - 1 - num_positions = self.position_embeddings.shape[1] - 1 - - # Skip interpolation for matching dimensions (unless tracing) - if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings - - # Handle class token and patch embeddings separately - class_pos_embed = self.position_embeddings[:, 0] - patch_pos_embed = self.position_embeddings[:, 1:] - dim = embeddings.shape[-1] - - # Calculate new dimensions - height = height // self.config.patch_size - width = width // self.config.patch_size - - # Reshape for interpolation - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) - patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) - - # Store original dtype for restoration after interpolation - target_dtype = patch_pos_embed.dtype - - # Interpolate at float32 precision - patch_pos_embed = nn.functional.interpolate( - patch_pos_embed.to(dtype=torch.float32), - size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor - mode="bicubic", - align_corners=False, - antialias=True, - ).to(dtype=target_dtype) - - # Validate output dimensions if not tracing - if not torch.jit.is_tracing(): - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") - - # Reshape back to original format - patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - - # Combine class and patch embeddings - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) - - def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: - batch_size, _, height, width = pixel_values.shape - target_dtype = self.patch_embeddings.projection.weight.dtype - embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) - - if bool_masked_pos is not None: - embeddings = torch.where( - bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings - ) - - # add the [CLS] token to the embedded patch tokens - cls_tokens = self.cls_token.expand(batch_size, -1, -1) - embeddings = torch.cat((cls_tokens, embeddings), 
dim=1) - - # add positional encoding to each token - embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) - - if self.config.num_windows > 1: - # reshape for windows - num_h_patches = height // self.config.patch_size - num_w_patches = width // self.config.patch_size - cls_token_with_pos_embed = embeddings[:, :1] - pixel_tokens_with_pos_embed = embeddings[:, 1:] - pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( - batch_size, num_h_patches, num_w_patches, -1 - ) - num_w_patches_per_window = num_w_patches // self.config.num_windows - num_h_patches_per_window = num_h_patches // self.config.num_windows - num_windows = self.config.num_windows - windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( - batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 - ) - windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) - windowed_pixel_tokens = windowed_pixel_tokens.reshape( - batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1 - ) - windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) - embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) - - # add register tokens - embeddings = ( - torch.cat( - (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 - ) - if self.config.num_register_tokens > 0 - else embeddings - ) - - embeddings = self.dropout(embeddings) - - return embeddings - - -def eager_attention_forward( - module: nn.Module, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attention_mask: Optional[torch.Tensor], - scaling: float, - dropout: float = 0.0, - **kwargs, -): - # Take the dot product between "query" and "key" to get the raw attention scores. - attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling - - # Normalize the attention scores to probabilities. - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - - # Mask heads if we want to - if attention_mask is not None: - attn_weights = attn_weights * attention_mask - - attn_output = torch.matmul(attn_weights, value) - attn_output = attn_output.transpose(1, 2).contiguous() - - return attn_output, attn_weights - - -class RFDetrSelfAttention(nn.Module): - def __init__(self, config: RFDetrConfig) -> None: - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size {config.hidden_size} is not a multiple of the number of attention " - f"heads {config.num_attention_heads}." 
- ) - - self.config = config - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - self.dropout_prob = config.attention_probs_dropout_prob - self.scaling = self.attention_head_size**-0.5 - self.is_causal = False - - self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - - def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - query_layer = self.transpose_for_scores(self.query(hidden_states)) - - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - - context_layer, attention_probs = attention_interface( - self, - query_layer, - key_layer, - value_layer, - head_mask, - is_causal=self.is_causal, - scaling=self.scaling, - dropout=0.0 if not self.training else self.dropout_prob, - ) - - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.reshape(new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs - - -class RFDetrSelfOutput(nn.Module): - """ - The residual connection is defined in RFDetrLayer instead of here (as is the case with other models), due to the - layernorm applied before each block. 
- """ - - def __init__(self, config: RFDetrConfig) -> None: - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - - return hidden_states - - -class RFDetrAttention(nn.Module): - def __init__(self, config: RFDetrConfig) -> None: - super().__init__() - self.attention = RFDetrSelfAttention(config) - self.output = RFDetrSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads: Set[int]) -> None: - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.attention.query = prune_linear_layer(self.attention.query, index) - self.attention.key = prune_linear_layer(self.attention.key, index) - self.attention.value = prune_linear_layer(self.attention.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) - self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - self_outputs = self.attention(hidden_states, head_mask, output_attentions) - - attention_output = self.output(self_outputs[0], hidden_states) - - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -class RFDetrLayerScale(nn.Module): - def __init__(self, config) -> None: - super().__init__() - self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) - - def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: - return hidden_state * self.lambda1 - - -def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: - """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, - however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the - layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the - argument. 
- """ - if drop_prob == 0.0 or not training: - return input - keep_prob = 1 - drop_prob - shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) - random_tensor.floor_() # binarize - output = input.div(keep_prob) * random_tensor - return output - - -class RFDetrDropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob: Optional[float] = None) -> None: - super().__init__() - self.drop_prob = drop_prob - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return drop_path(hidden_states, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return "p={}".format(self.drop_prob) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.load_cuda_kernels +def load_cuda_kernels(): + from torch.utils.cpp_extension import load -class RFDetrMLP(nn.Module): - def __init__(self, config) -> None: - super().__init__() - in_features = out_features = config.hidden_size - hidden_features = int(config.hidden_size * config.mlp_ratio) - self.fc1 = nn.Linear(in_features, hidden_features, bias=True) - if isinstance(config.hidden_act, str): - self.activation = ACT2FN[config.hidden_act] - else: - self.activation = config.hidden_act - self.fc2 = nn.Linear(hidden_features, out_features, bias=True) - - def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: - hidden_state = self.fc1(hidden_state) - hidden_state = self.activation(hidden_state) - hidden_state = self.fc2(hidden_state) - return hidden_state - - -class RFDetrSwiGLUFFN(nn.Module): - def __init__(self, config) -> None: - super().__init__() - in_features = out_features = config.hidden_size - hidden_features = int(config.hidden_size * config.mlp_ratio) - hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 - - self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) - self.weights_out = nn.Linear(hidden_features, out_features, bias=True) - - def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: - hidden_state = self.weights_in(hidden_state) - x1, x2 = hidden_state.chunk(2, dim=-1) - hidden = nn.functional.silu(x1) * x2 - return self.weights_out(hidden) - - -class RFDetrLayer(nn.Module): - """This corresponds to the Block class in the original implementation.""" - - def __init__(self, config) -> None: - super().__init__() - - self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.attention = RFDetrAttention(config) - self.layer_scale1 = RFDetrLayerScale(config) - self.drop_path = RFDetrDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() - - self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - if config.use_swiglu_ffn: - self.mlp = RFDetrSwiGLUFFN(config) - else: - self.mlp = RFDetrMLP(config) - self.layer_scale2 = RFDetrLayerScale(config) - - self.num_windows = config.num_windows - - def forward( - self, - hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - run_full_attention: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - assert head_mask is None, "head_mask is not supported for windowed attention" - assert not output_attentions, "output_attentions is not supported for windowed attention" - shortcut = hidden_states - if run_full_attention: - # reshape x to remove windows - B, 
HW, C = hidden_states.shape - num_windows_squared = self.num_windows**2 - hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) - - self_attention_outputs = self.attention( - self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention - head_mask, - output_attentions=output_attentions, - ) - attention_output = self_attention_outputs[0] - - if run_full_attention: - # reshape x to add windows back - B, HW, C = hidden_states.shape - num_windows_squared = self.num_windows**2 - # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) - attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) - - attention_output = self.layer_scale1(attention_output) - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - # first residual connection - hidden_states = self.drop_path(attention_output) + shortcut - - # in Dinov2WithRegisters, layernorm is also applied after self-attention - layer_output = self.norm2(hidden_states) - layer_output = self.mlp(layer_output) - layer_output = self.layer_scale2(layer_output) + global MultiScaleDeformableAttention - # second residual connection - layer_output = self.drop_path(layer_output) + hidden_states + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deformable_detr" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] - outputs = (layer_output,) + outputs + MultiScaleDeformableAttention = load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) - return outputs +if is_timm_available(): + from timm import create_model -MultiScaleDeformableAttention = None +_CONFIG_FOR_DOC = "RFDetrConfig" +_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" # TODO +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction class MultiScaleDeformableAttentionFunction(Function): @staticmethod def forward( @@ -559,1545 +123,129 @@ def backward(context, grad_output): return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None -class RFLearnedPositionEmbedding(nn.Module): +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->RFDetr +class RFDetrDecoderOutput(ModelOutput): """ - This module learns positional embeddings up to a fixed maximum size. + Base class for outputs of the RFDetrDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). 
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. """ - def __init__(self, embedding_dim=256): - super().__init__() - self.row_embeddings = nn.Embedding(50, embedding_dim) - self.column_embeddings = nn.Embedding(50, embedding_dim) + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - def forward(self, pixel_values, pixel_mask=None): - height, width = pixel_values.shape[-2:] - width_values = torch.arange(width, device=pixel_values.device) - height_values = torch.arange(height, device=pixel_values.device) - x_emb = self.column_embeddings(width_values) - y_emb = self.row_embeddings(height_values) - pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) - pos = pos.permute(2, 0, 1) - pos = pos.unsqueeze(0) - pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) - return pos +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->RFDetr +class RFDetrModelOutput(ModelOutput): + """ + Base class for outputs of the Deformable DETR encoder-decoder model. -def load_cuda_kernels(): - from torch.utils.cpp_extension import load + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). 
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. 
+ """ - global MultiScaleDeformableAttention - - root = Path(__file__).resolve().parent.parent.parent / "kernels" / "r_f" - src_files = [ - root / filename - for filename in [ - "vision.cpp", - os.path.join("cpu", "ms_deform_attn_cpu.cpp"), - os.path.join("cuda", "ms_deform_attn_cuda.cu"), - ] - ] - - MultiScaleDeformableAttention = load( - "MultiScaleDeformableAttention", - src_files, - with_cuda=True, - extra_include_paths=[str(root)], - extra_cflags=["-DWITH_CUDA=1"], - extra_cuda_cflags=[ - "-DCUDA_HAS_FP16=1", - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - ], - ) - - -def multi_scale_deformable_attention( - value: Tensor, - value_spatial_shapes: Union[Tensor, List[Tuple]], - sampling_locations: Tensor, - attention_weights: Tensor, -) -> Tensor: - batch_size, _, num_heads, hidden_dim = value.shape - _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for level_id, (height, width) in enumerate(value_spatial_shapes): - # batch_size, height*width, num_heads, hidden_dim - # -> batch_size, height*width, num_heads*hidden_dim - # -> batch_size, num_heads*hidden_dim, height*width - # -> batch_size*num_heads, hidden_dim, height, width - value_l_ = ( - value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) - ) - # batch_size, num_queries, num_heads, num_points, 2 - # -> batch_size, num_heads, num_queries, num_points, 2 - # -> batch_size*num_heads, num_queries, num_points, 2 - sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) - # batch_size*num_heads, hidden_dim, num_queries, num_points - sampling_value_l_ = nn.functional.grid_sample( - value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False - ) - sampling_value_list.append(sampling_value_l_) - # (batch_size, num_queries, num_heads, num_levels, num_points) - # -> (batch_size, num_heads, num_queries, num_levels, num_points) - # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) - attention_weights = attention_weights.transpose(1, 2).reshape( - batch_size * num_heads, 1, num_queries, num_levels * num_points - ) - output = ( - (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) - .sum(-1) - .view(batch_size, num_heads * hidden_dim, num_queries) - ) - return output.transpose(1, 2).contiguous() - - -class RFMultiscaleDeformableAttention(nn.Module): - """ - Multiscale deformable attention as proposed in Deformable DETR. 
- """ - - def __init__(self, config: RFConfig, num_heads: int, n_points: int): - super().__init__() - - kernel_loaded = MultiScaleDeformableAttention is not None - if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded: - try: - load_cuda_kernels() - except Exception as e: - logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") - - if config.d_model % num_heads != 0: - raise ValueError( - f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" - ) - dim_per_head = config.d_model // num_heads - # check if dim_per_head is power of 2 - if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): - warnings.warn( - "You'd better set embed_dim (d_model) in RFMultiscaleDeformableAttention to make the" - " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" - " implementation." - ) - - self.im2col_step = 64 - - self.d_model = config.d_model - self.n_levels = config.num_feature_levels - self.n_heads = num_heads - self.n_points = n_points - - self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) - self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) - self.value_proj = nn.Linear(config.d_model, config.d_model) - self.output_proj = nn.Linear(config.d_model, config.d_model) - - self.disable_custom_kernels = config.disable_custom_kernels - - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): - return tensor if position_embeddings is None else tensor + position_embeddings - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states=None, - encoder_attention_mask=None, - position_embeddings: Optional[torch.Tensor] = None, - reference_points=None, - spatial_shapes=None, - spatial_shapes_list=None, - level_start_index=None, - output_attentions: bool = False, - ): - # add position embeddings to the hidden states before projecting to queries and keys - if position_embeddings is not None: - hidden_states = self.with_pos_embed(hidden_states, position_embeddings) - - batch_size, num_queries, _ = hidden_states.shape - batch_size, sequence_length, _ = encoder_hidden_states.shape - total_elements = sum(height * width for height, width in spatial_shapes_list) - if total_elements != sequence_length: - raise ValueError( - "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" - ) - - value = self.value_proj(encoder_hidden_states) - if attention_mask is not None: - # we invert the attention_mask - value = value.masked_fill(~attention_mask[..., None], float(0)) - value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) - sampling_offsets = self.sampling_offsets(hidden_states).view( - batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 - ) - attention_weights = self.attention_weights(hidden_states).view( - batch_size, num_queries, self.n_heads, self.n_levels * self.n_points - ) - attention_weights = F.softmax(attention_weights, -1).view( - batch_size, num_queries, self.n_heads, self.n_levels, self.n_points - ) - # batch_size, num_queries, n_heads, n_levels, n_points, 2 - num_coordinates = reference_points.shape[-1] - if num_coordinates == 2: - offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) - sampling_locations = ( - reference_points[:, :, None, :, None, :] - + 
sampling_offsets / offset_normalizer[None, None, None, :, None, :] - ) - elif num_coordinates == 4: - sampling_locations = ( - reference_points[:, :, None, :, None, :2] - + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 - ) - else: - raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - - if self.disable_custom_kernels or MultiScaleDeformableAttention is None or is_torchdynamo_compiling(): - # PyTorch implementation - output = multi_scale_deformable_attention( - value, spatial_shapes_list, sampling_locations, attention_weights - ) - else: - try: - # custom kernel - output = MultiScaleDeformableAttentionFunction.apply( - value, - spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - self.im2col_step, - ) - except Exception: - # PyTorch implementation - output = multi_scale_deformable_attention( - value, spatial_shapes_list, sampling_locations, attention_weights - ) - output = self.output_proj(output) - - return output, attention_weights - - -class RFEncoderLayer(nn.Module): - def __init__(self, config: RFConfig): - super().__init__() - self.embed_dim = config.d_model - self.self_attn = RFMultiscaleDeformableAttention( - config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points - ) - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) - self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - position_embeddings: torch.Tensor = None, - reference_points=None, - spatial_shapes=None, - spatial_shapes_list=None, - level_start_index=None, - output_attentions: bool = False, - ): - """ - Args: - hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Input to the layer. - attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): - Attention mask. - position_embeddings (`torch.FloatTensor`, *optional*): - Position embeddings, to be added to `hidden_states`. - reference_points (`torch.FloatTensor`, *optional*): - Reference points. - spatial_shapes (`torch.LongTensor`, *optional*): - Spatial shapes of the backbone feature maps. - level_start_index (`torch.LongTensor`, *optional*): - Level start index. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - residual = hidden_states - - # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. 
- hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - output_attentions=output_attentions, - ) - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - if self.training: - if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class RFPreTrainedModel(PreTrainedModel): - config_class = RFConfig - base_model_prefix = "model" - main_input_name = "pixel_values" - supports_gradient_checkpointing = True - _no_split_modules = [r"RFConvEncoder", r"RFEncoderLayer", r"RFDecoderLayer"] - - def _init_weights(self, module): - std = self.config.init_std - - if isinstance(module, RFLearnedPositionEmbedding): - nn.init.uniform_(module.row_embeddings.weight) - nn.init.uniform_(module.column_embeddings.weight) - elif isinstance(module, RFMultiscaleDeformableAttention): - nn.init.constant_(module.sampling_offsets.weight.data, 0.0) - default_dtype = torch.get_default_dtype() - thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( - 2.0 * math.pi / module.n_heads - ) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = ( - (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) - .view(module.n_heads, 1, 1, 2) - .repeat(1, module.n_levels, module.n_points, 1) - ) - for i in range(module.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - nn.init.constant_(module.attention_weights.weight.data, 0.0) - nn.init.constant_(module.attention_weights.bias.data, 0.0) - nn.init.xavier_uniform_(module.value_proj.weight.data) - nn.init.constant_(module.value_proj.bias.data, 0.0) - nn.init.xavier_uniform_(module.output_proj.weight.data) - nn.init.constant_(module.output_proj.bias.data, 0.0) - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - if hasattr(module, "reference_points") and not self.config.two_stage: - nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) - 
nn.init.constant_(module.reference_points.bias.data, 0.0) - if hasattr(module, "level_embed"): - nn.init.normal_(module.level_embed) - - -class RFDetrEncoder(RFPreTrainedModel): - """ - Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a - [`RFDetrEncoderLayer`]. - - The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. - - Args: - config: RFDetrConfig - """ - - def __init__(self, config: RFConfig): - super().__init__(config) - self.gradient_checkpointing = False - - self.dropout = config.dropout - self.layers = nn.ModuleList([RFEncoderLayer(config) for _ in range(config.encoder_layers)]) - - # Initialize weights and apply final processing - self.post_init() - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, device): - """ - Get reference points for each feature map. Used in decoder. - - Args: - spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): - Spatial shapes of each feature map. - valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): - Valid ratios of each feature map. - device (`torch.device`): - Device on which to create the tensors. - Returns: - `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` - """ - reference_points_list = [] - for level, (height, width) in enumerate(spatial_shapes): - ref_y, ref_x = meshgrid( - torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), - torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device), - indexing="ij", - ) - # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36 - ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) - ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) - ref = torch.stack((ref_x, ref_y), -1) - reference_points_list.append(ref) - reference_points = torch.cat(reference_points_list, 1) - reference_points = reference_points[:, :, None] * valid_ratios[:, None] - return reference_points - - def forward( - self, - inputs_embeds=None, - attention_mask=None, - position_embeddings=None, - spatial_shapes=None, - spatial_shapes_list=None, - level_start_index=None, - valid_ratios=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - - 1 for pixel features that are real (i.e. **not masked**), - - 0 for pixel features that are padding (i.e. **masked**). - [What are attention masks?](../glossary#attention-mask) - position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Position embeddings that are added to the queries and keys in each self-attention layer. - spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): - Spatial shapes of each feature map. - level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): - Starting index of each feature map. - valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): - Ratio of valid area in each feature level. 
- output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - hidden_states = inputs_embeds - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - spatial_shapes_tuple = tuple(spatial_shapes_list) - reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - for i, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - position_embeddings, - reference_points, - spatial_shapes, - spatial_shapes_list, - level_start_index, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -R_F_DETR_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`RFDetrConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -R_F_DETR_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`BitImageProcessor.preprocess`] for details. - - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. 
-
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    """
-    RFDetr backbone, to be used with frameworks like DETR and MaskFormer.
-    """,
-    R_F_DETR_START_DOCSTRING,
-)
-class RFDetrBackbone(RFDetrPreTrainedModel, BackboneMixin):
-    def __init__(self, config):
-        super().__init__(config)
-        super()._init_backbone(config)
-        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
-        self.embeddings = RFDetrEmbeddings(config)
-        self.encoder = RFDetrEncoder(config)
-
-        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-
-        self.num_register_tokens = config.num_register_tokens
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self) -> RFDetrPatchEmbeddings:
-        return self.embeddings.patch_embeddings
-
-    @add_start_docstrings_to_model_forward(R_F_DETR_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        pixel_values: torch.Tensor,
-        output_hidden_states: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> BackboneOutput:
-        """
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from transformers import AutoImageProcessor, AutoBackbone
-        >>> import torch
-        >>> from PIL import Image
-        >>> import requests
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
-        >>> model = AutoBackbone.from_pretrained(
-        ...     "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
-        ... 
) - - >>> inputs = processor(image, return_tensors="pt") - - >>> outputs = model(**inputs) - >>> feature_maps = outputs.feature_maps - >>> list(feature_maps[-1].shape) - [1, 768, 16, 16] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - - embedding_output = self.embeddings(pixel_values) - - outputs = self.encoder( - embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict - ) - - hidden_states = outputs.hidden_states if return_dict else outputs[1] - - feature_maps = () - for stage, hidden_state in zip(self.stage_names, hidden_states): - if stage in self.out_features: - if self.config.apply_layernorm: - hidden_state = self.layernorm(hidden_state) - if self.config.reshape_hidden_states: - hidden_state = hidden_state[:, self.num_register_tokens + 1 :] - # this was actually a bug in the original implementation that we copied here, - # cause normally the order is height, width - batch_size, _, height, width = pixel_values.shape - patch_size = self.config.patch_size - - num_h_patches = height // patch_size - num_w_patches = width // patch_size - - if self.config.num_windows > 1: - # undo windowing - num_windows_squared = self.config.num_windows**2 - B, HW, C = hidden_state.shape - num_h_patches_per_window = num_h_patches // self.config.num_windows - num_w_patches_per_window = num_w_patches // self.config.num_windows - hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) - hidden_state = hidden_state.view( - B // num_windows_squared, - self.config.num_windows, - self.config.num_windows, - num_h_patches_per_window, - num_w_patches_per_window, - C, - ) - hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) - - hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) - hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() - - feature_maps += (hidden_state,) - - if not return_dict: - if output_hidden_states: - output = (feature_maps,) + outputs[1:] - else: - output = (feature_maps,) + outputs[2:] - return output - - return BackboneOutput( - feature_maps=feature_maps, - hidden_states=outputs.hidden_states if output_hidden_states else None, - attentions=outputs.attentions if output_attentions else None, - ) - - -class RFDetrLayerNorm(nn.Module): - """ - A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the - channel dimension for inputs that have shape (batch_size, channels, height, width). 
- https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 - """ - - def __init__(self, normalized_shape, eps=1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(normalized_shape)) - self.bias = nn.Parameter(torch.zeros(normalized_shape)) - self.eps = eps - self.normalized_shape = (normalized_shape,) - - def forward(self, x): - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.eps) - x = self.weight[:, None, None] * x + self.bias[:, None, None] - return x - - -class ConvX(nn.Module): - """Conv-bn module""" - - def __init__(self, in_planes, out_planes, kernel=3, stride=1, groups=1, dilation=1, act="relu"): - super(ConvX, self).__init__() - self.conv = nn.Conv2d( - in_planes, - out_planes, - kernel_size=kernel, - stride=stride, - padding=kernel // 2, - groups=groups, - dilation=dilation, - bias=False, - ) - self.bn = nn.BatchNorm2d(out_planes) - self.act = ACT2FN[act] - - def forward(self, x): - """forward""" - out = self.act(self.bn(self.conv(x))) - return out - - -class Bottleneck(nn.Module): - """Standard bottleneck.""" - - def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, act="silu"): - """ch_in, ch_out, shortcut, groups, kernels, expand""" - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = ConvX(c1, c_, k[0], 1, act=act) - self.cv2 = ConvX(c_, c2, k[1], 1, groups=g, act=act) - self.add = shortcut and c1 == c2 - - def forward(self, x): - """'forward()' applies the YOLOv5 FPN to input data.""" - return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) - - -class RFDetrC2f(nn.Module): - """Faster Implementation of CSP Bottleneck with 2 convolutions.""" - - def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, act="silu"): - """ch_in, ch_out, number, shortcut, groups, expansion""" - super().__init__() - self.c = int(c2 * e) # hidden channels - self.cv1 = ConvX(c1, 2 * self.c, 1, 1, act=act) - self.cv2 = ConvX((2 + n) * self.c, c2, 1, act=act) # optional act=FReLU(c2) - self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=(3, 3), e=1.0, act=act) for _ in range(n)) - - def forward(self, x): - """Forward pass using split() instead of chunk().""" - y = list(self.cv1(x).split((self.c, self.c), 1)) - y.extend(m(y[-1]) for m in self.m) - return self.cv2(torch.cat(y, 1)) - - -class RFDetrMultiScaleProjector(nn.Module): - """ - This module implements MultiScaleProjector in :paper:`lwdetr`. - It creates pyramid features built on top of the input feature map. - """ - - def __init__( - self, - in_channels, - out_channels, - scale_factors, - num_blocks=3, - layer_norm=False, - rms_norm=False, - survival_prob=1.0, - force_drop_last_n_features=0, - ): - """ - Args: - net (Backbone): module representing the subnetwork backbone. - Must be a subclass of :class:`Backbone`. - out_channels (int): number of channels in the output feature maps. - scale_factors (list[float]): list of scaling factors to upsample or downsample - the input features for creating pyramid features. 
- """ - super().__init__() - - self.scale_factors = scale_factors - self.survival_prob = survival_prob - self.force_drop_last_n_features = force_drop_last_n_features - - stages_sampling = [] - stages = [] - # use_bias = norm == "" - use_bias = False - self.use_extra_pool = False - for scale in scale_factors: - stages_sampling.append([]) - for in_dim in in_channels: - out_dim = in_dim - layers = [] - - # if in_dim > 512: - # layers.append(ConvX(in_dim, in_dim // 2, kernel=1)) - # in_dim = in_dim // 2 - - if scale == 4.0: - layers.extend( - [ - nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), - RFDetrLayerNorm(in_dim // 2), - nn.GELU(), - nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), - ] - ) - out_dim = in_dim // 4 - elif scale == 2.0: - # a hack to reduce the FLOPs and Params when the dimention of output feature is too large - # if in_dim > 512: - # layers = [ - # ConvX(in_dim, in_dim // 2, kernel=1), - # nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), - # ] - # out_dim = in_dim // 4 - # else: - layers.extend( - [ - nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), - ] - ) - out_dim = in_dim // 2 - elif scale == 1.0: - pass - elif scale == 0.5: - layers.extend( - [ - ConvX(in_dim, in_dim, 3, 2, layer_norm=layer_norm), - ] - ) - elif scale == 0.25: - self.use_extra_pool = True - continue - else: - raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) - layers = nn.Sequential(*layers) - stages_sampling[-1].append(layers) - stages_sampling[-1] = nn.ModuleList(stages_sampling[-1]) - - in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) - layers = [ - RFDetrC2f(in_dim, out_channels, num_blocks, layer_norm=layer_norm), - RFDetrLayerNorm(out_channels), - ] - layers = nn.Sequential(*layers) - stages.append(layers) - - self.stages_sampling = nn.ModuleList(stages_sampling) - self.stages = nn.ModuleList(stages) - - def forward(self, x): - """ - Args: - x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. - Returns: - dict[str->Tensor]: - mapping from feature map name to pyramid feature map tensor - in high to low resolution order. Returned feature names follow the FPN - convention: "p", where stage has stride = 2 ** stage e.g., - ["p2", "p3", ..., "p6"]. - """ - num_features = len(x) - if self.survival_prob < 1.0 and self.training: - final_drop_prob = 1 - self.survival_prob - drop_p = np.random.uniform() - for i in range(1, num_features): - critical_drop_prob = i * (final_drop_prob / (num_features - 1)) - if drop_p < critical_drop_prob: - x[i][:] = 0 - elif self.force_drop_last_n_features > 0: - for i in range(self.force_drop_last_n_features): - # don't do it inplace to ensure the compiler can optimize out the backbone layers - x[-(i + 1)] = torch.zeros_like(x[-(i + 1)]) - - results = [] - # x list of len(out_features_indexes) - for i, stage in enumerate(self.stages): - feat_fuse = [] - for j, stage_sampling in enumerate(self.stages_sampling[i]): - feat_fuse.append(stage_sampling(x[j])) - if len(feat_fuse) > 1: - feat_fuse = torch.cat(feat_fuse, dim=1) - else: - feat_fuse = feat_fuse[0] - results.append(stage(feat_fuse)) - if self.use_extra_pool: - results.append(F.max_pool2d(results[-1], kernel_size=1, stride=2, padding=0)) - return results - - -class RFMultiheadAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. 
- - Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - bias: bool = True, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - if self.head_dim * num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {num_heads})." - ) - self.scaling = self.head_dim**-0.5 - - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): - return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): - return tensor if position_embeddings is None else tensor + position_embeddings - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, target_len, embed_dim = hidden_states.size() - # add position embeddings to the hidden states before projecting to queries and keys - if position_embeddings is not None: - hidden_states_original = hidden_states - hidden_states = self.with_pos_embed(hidden_states, position_embeddings) - - # get queries, keys and values - query_states = self.q_proj(hidden_states) * self.scaling - key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) - value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - - proj_shape = (batch_size * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - source_len = key_states.size(1) - - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): - raise ValueError( - f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" - f" {attn_weights.size()}" - ) - - # expand attention_mask - if attention_mask is not None: - # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] - attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, target_len, source_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" - f" {attention_mask.size()}" - ) - attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask - attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) - attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, target_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - - -class RFDetrDecoderLayer(nn.Module): - def __init__(self, config: RFConfig): - super().__init__() - self.embed_dim = config.d_model - - # self-attention - self.self_attn = RFMultiheadAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - ) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # cross-attention - self.encoder_attn = RFMultiscaleDeformableAttention( - config, - num_heads=config.decoder_attention_heads, - n_points=config.decoder_n_points, - ) - self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # feedforward neural networks - self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) - self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: Optional[torch.Tensor] = None, - reference_points=None, - spatial_shapes=None, - spatial_shapes_list=None, - level_start_index=None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ): - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(seq_len, batch, embed_dim)`. - position_embeddings (`torch.FloatTensor`, *optional*): - Position embeddings that are added to the queries and keys in the self-attention layer. - reference_points (`torch.FloatTensor`, *optional*): - Reference points. - spatial_shapes (`torch.LongTensor`, *optional*): - Spatial shapes. - level_start_index (`torch.LongTensor`, *optional*): - Level start index. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative - values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ - residual = hidden_states - - # Self Attention - hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - output_attentions=output_attentions, - ) - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - second_residual = hidden_states - - # Cross-Attention - cross_attn_weights = None - hidden_states, cross_attn_weights = self.encoder_attn( - hidden_states=hidden_states, - attention_mask=encoder_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - output_attentions=output_attentions, - ) - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = second_residual + hidden_states - - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # Fully Connected - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - return outputs - - -@dataclass -class RFDecoderOutput(ModelOutput): - """ - Base class for outputs of the RFDecoder. This class adds two attributes to - BaseModelOutputWithCrossAttentions, namely: - - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) - - a stacked tensor of intermediate reference points. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): - Stacked intermediate hidden states (output of each layer of the decoder). - intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): - Stacked intermediate reference points (reference points of each layer of the decoder). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer - plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention heads. 
- cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, - used to compute the weighted average in the cross-attention heads. - """ - - last_hidden_state: torch.FloatTensor = None - intermediate_hidden_states: torch.FloatTensor = None - intermediate_reference_points: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -class RFDecoderLayer(nn.Module): - def __init__(self, config: RFConfig): - super().__init__() - self.embed_dim = config.d_model - - # self-attention - self.self_attn = RFMultiheadAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - ) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # cross-attention - self.encoder_attn = RFMultiscaleDeformableAttention( - config, - num_heads=config.decoder_attention_heads, - n_points=config.decoder_n_points, - ) - self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # feedforward neural networks - self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) - self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: Optional[torch.Tensor] = None, - reference_points=None, - spatial_shapes=None, - spatial_shapes_list=None, - level_start_index=None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ): - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(seq_len, batch, embed_dim)`. - position_embeddings (`torch.FloatTensor`, *optional*): - Position embeddings that are added to the queries and keys in the self-attention layer. - reference_points (`torch.FloatTensor`, *optional*): - Reference points. - spatial_shapes (`torch.LongTensor`, *optional*): - Spatial shapes. - level_start_index (`torch.LongTensor`, *optional*): - Level start index. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative - values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ - residual = hidden_states - - # Self Attention - hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - output_attentions=output_attentions, - ) - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - second_residual = hidden_states - - # Cross-Attention - cross_attn_weights = None - hidden_states, cross_attn_weights = self.encoder_attn( - hidden_states=hidden_states, - attention_mask=encoder_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - output_attentions=output_attentions, - ) - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = second_residual + hidden_states - - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # Fully Connected - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - return outputs - - -def inverse_sigmoid(x, eps=1e-5): - x = x.clamp(min=0, max=1) - x1 = x.clamp(min=eps) - x2 = (1 - x).clamp(min=eps) - return torch.log(x1 / x2) - - -class RFDetrDecoder(RFPreTrainedModel): - """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`RFDetrDecoderLayer`]. - - The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - - Some tweaks for Deformable DETR: - - - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - - it also returns a stack of intermediate outputs and reference points from all decoding layers. - - Args: - config: RFDetrConfig - """ - - def __init__(self, config: RFConfig): - super().__init__(config) - - self.dropout = config.dropout - self.layers = nn.ModuleList([RFDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.gradient_checkpointing = False - - # hack implementation for iterative bounding box refinement and two-stage Deformable DETR - self.bbox_embed = None - self.class_embed = None - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - position_embeddings=None, - reference_points=None, - spatial_shapes=None, - spatial_shapes_list=None, - level_start_index=None, - valid_ratios=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): - The query embeddings that are passed into the decoder. 
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-                of the decoder.
-            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
-                in `[0, 1]`:
-                - 1 for pixels that are real (i.e. **not masked**),
-                - 0 for pixels that are padding (i.e. **masked**).
-            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
-                Position embeddings that are added to the queries and keys in each self-attention layer.
-            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
-                Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
-            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
-                Spatial shapes of the feature maps.
-            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
-                Indexes for the start of each feature level. In range `[0, sequence_length]`.
-            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
-                Ratio of valid area in each feature level.
-
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if inputs_embeds is not None: - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - intermediate = () - intermediate_reference_points = () - - for idx, decoder_layer in enumerate(self.layers): - num_coordinates = reference_points.shape[-1] - if num_coordinates == 4: - reference_points_input = ( - reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] - ) - elif reference_points.shape[-1] == 2: - reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] - else: - raise ValueError("Reference points' last dimension must be of size 2") - - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - position_embeddings, - reference_points_input, - spatial_shapes, - spatial_shapes_list, - level_start_index, - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - position_embeddings=position_embeddings, - encoder_hidden_states=encoder_hidden_states, - reference_points=reference_points_input, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - # hack implementation for iterative bounding box refinement - if self.bbox_embed is not None: - tmp = self.bbox_embed[idx](hidden_states) - num_coordinates = reference_points.shape[-1] - if num_coordinates == 4: - new_reference_points = tmp + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - elif num_coordinates == 2: - new_reference_points = tmp - new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - else: - raise ValueError( - f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" - ) - reference_points = new_reference_points.detach() - - intermediate += (hidden_states,) - intermediate_reference_points += (reference_points,) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) - - # Keep batch_size as first dimension - intermediate = torch.stack(intermediate, dim=1) - intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - intermediate, - intermediate_reference_points, - all_hidden_states, - all_self_attns, - all_cross_attentions, - ] - if v is not None - ) - return RFDecoderOutput( - last_hidden_state=hidden_states, - intermediate_hidden_states=intermediate, - 
intermediate_reference_points=intermediate_reference_points, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -class RFDetrPreTrainedModel(PreTrainedModel): - config_class = RFConfig - base_model_prefix = "model" - main_input_name = "pixel_values" - supports_gradient_checkpointing = True - _no_split_modules = [r"RFDetrConvEncoder", r"RFDetrEncoderLayer", r"RFDetrDecoderLayer"] - - def _init_weights(self, module): - std = self.config.init_std - - if isinstance(module, RFLearnedPositionEmbedding): - nn.init.uniform_(module.row_embeddings.weight) - nn.init.uniform_(module.column_embeddings.weight) - elif isinstance(module, RFMultiscaleDeformableAttention): - nn.init.constant_(module.sampling_offsets.weight.data, 0.0) - default_dtype = torch.get_default_dtype() - thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( - 2.0 * math.pi / module.n_heads - ) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = ( - (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) - .view(module.n_heads, 1, 1, 2) - .repeat(1, module.n_levels, module.n_points, 1) - ) - for i in range(module.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - nn.init.constant_(module.attention_weights.weight.data, 0.0) - nn.init.constant_(module.attention_weights.bias.data, 0.0) - nn.init.xavier_uniform_(module.value_proj.weight.data) - nn.init.constant_(module.value_proj.bias.data, 0.0) - nn.init.xavier_uniform_(module.output_proj.weight.data) - nn.init.constant_(module.output_proj.bias.data, 0.0) - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - if hasattr(module, "reference_points") and not self.config.two_stage: - nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) - nn.init.constant_(module.reference_points.bias.data, 0.0) - if hasattr(module, "level_embed"): - nn.init.normal_(module.level_embed) + init_reference_points: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @dataclass -class RFModelOutput(ModelOutput): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->RFDetr +class RFDetrObjectDetectionOutput(ModelOutput): """ - Base class for outputs of the Deformable DETR encoder-decoder model. + Output type of [`RFDetrForObjectDetection`]. 
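+
+    Example (an illustrative sketch only; the checkpoint id below is hypothetical, and this draft may not yet be
+    registered with `AutoImageProcessor`):
+
+    ```python
+    >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection
+    >>> from PIL import Image
+    >>> import requests
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> processor = AutoImageProcessor.from_pretrained("roboflow/rf-detr-base")  # hypothetical checkpoint id
+    >>> model = RFDetrForObjectDetection.from_pretrained("roboflow/rf-detr-base")  # hypothetical checkpoint id
+
+    >>> inputs = processor(images=image, return_tensors="pt")
+    >>> outputs = model(**inputs)  # an RFDetrObjectDetectionOutput with `logits` and `pred_boxes`
+    >>> results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=[image.size[::-1]])
+    ```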
     Args:
-        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
-            Initial reference points sent through the Transformer decoder.
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (`Dict`, *optional*):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use [`~RFDetrProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
+        auxiliary_outputs (`list[Dict]`, *optional*):
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+            `pred_boxes`) for each decoder layer.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the decoder of the model.
-        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
-            Stacked intermediate hidden states (output of each layer of the decoder).
-        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
-            Stacked intermediate reference points (reference points of each layer of the decoder).
         decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
             shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
@@ -2117,9 +265,15 @@ class RFModelOutput(ModelOutput):
             shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
             layer plus the initial embedding outputs.
         encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
+            4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
+            in the self-attention heads.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
         enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
             Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
             picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
@@ -2128,21 +282,40 @@ class RFModelOutput(ModelOutput):
             Logits of predicted bounding boxes coordinates in the first stage.
     """

-    init_reference_points: torch.FloatTensor = None
-    last_hidden_state: torch.FloatTensor = None
-    intermediate_hidden_states: torch.FloatTensor = None
-    intermediate_reference_points: torch.FloatTensor = None
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
     decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
     cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
     encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    enc_outputs_class: Optional[torch.FloatTensor] = None
-    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr._get_clones
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.inverse_sigmoid
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)


-class RFFrozenBatchNorm2d(nn.Module):
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrFrozenBatchNorm2d with DeformableDetr->RFDetr
+class RFDetrFrozenBatchNorm2d(nn.Module):
     """
     BatchNorm2d where the batch statistics and the affine parameters are fixed.

@@ -2181,513 +354,748 @@ def forward(self, x):
         return x * scale + bias


-def replace_batch_norm(model):
-    r"""
-    Recursively replace all `torch.nn.BatchNorm2d` with `RFFrozenBatchNorm2d`.
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.replace_batch_norm with DeformableDetr->RFDetr
+def replace_batch_norm(model):
+    r"""
+    Recursively replace all `torch.nn.BatchNorm2d` with `RFDetrFrozenBatchNorm2d`.
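+
+    A minimal sketch of the effect (illustrative only, on a toy module rather than a real backbone):
+
+    ```python
+    >>> from torch import nn
+
+    >>> toy = nn.Sequential(nn.Conv2d(3, 8, kernel_size=3), nn.BatchNorm2d(8), nn.ReLU())
+    >>> replace_batch_norm(toy)
+    >>> isinstance(toy[1], RFDetrFrozenBatchNorm2d)
+    True
+    ```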
+ + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = RFDetrFrozenBatchNorm2d(module.num_features) + + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->RFDetr +class RFDetrConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by RFDetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->RFDetr + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from 
transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvModel with DeformableDetr->RFDetr +class RFDetrConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->RFDetr +class RFDetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype) + x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLearnedPositionEmbedding with DeformableDetr->RFDetr +class RFDetrLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
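+    For an input feature map of shape `(batch_size, channels, height, width)` (with `height` and `width` at most
+    50, the size of the learned embedding tables), the returned tensor has shape
+    `(batch_size, 2 * embedding_dim, height, width)`: the x- and y-embeddings are concatenated along the channel
+    dimension.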
+ """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.build_position_encoding with DeformableDetr->RFDetr +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = RFDetrSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = RFDetrLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention +def multi_scale_deformable_attention( + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + - Args: - model (torch.nn.Module): - input model +# Copied from 
transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->RFDetr +class RFDetrMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. """ - for name, module in model.named_children(): - if isinstance(module, nn.BatchNorm2d): - new_module = RFFrozenBatchNorm2d(module.num_features) - if not module.weight.device == torch.device("meta"): - new_module.weight.data.copy_(module.weight) - new_module.bias.data.copy_(module.bias) - new_module.running_mean.data.copy_(module.running_mean) - new_module.running_var.data.copy_(module.running_var) + def __init__(self, config: RFDetrConfig, num_heads: int, n_points: int): + super().__init__() - model._modules[name] = new_module + kernel_loaded = MultiScaleDeformableAttention is not None + if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded: + try: + load_cuda_kernels() + except Exception as e: + logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") - if len(list(module.children())) > 0: - replace_batch_norm(module) + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in RFDetrMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." + ) + self.im2col_step = 64 -class RFConvEncoder(nn.Module): - """ - Convolutional backbone, using either the AutoBackbone API or one from the timm library. + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points - nn.BatchNorm2d layers are replaced by RFFrozenBatchNorm2d as defined above. + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) - """ + self.disable_custom_kernels = config.disable_custom_kernels - def __init__(self, config): - super().__init__() + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings - self.config = config + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) - # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API - if config.use_timm_backbone: - # We default to values which were previously hard-coded. 
This enables configurability from the config - # using backbone arguments, while keeping the default behavior the same. - requires_backends(self, ["timm"]) - kwargs = getattr(config, "backbone_kwargs", {}) - kwargs = {} if kwargs is None else kwargs.copy() - out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) - num_channels = kwargs.pop("in_chans", config.num_channels) - if config.dilation: - kwargs["output_stride"] = kwargs.get("output_stride", 16) - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=out_indices, - in_chans=num_channels, - **kwargs, + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + total_elements = sum(height * width for height, width in spatial_shapes_list) + if total_elements != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" ) - else: - backbone = load_backbone(config) - # replace batch norm by frozen batch norm - with torch.no_grad(): - replace_batch_norm(backbone) - self.model = backbone - self.intermediate_channel_sizes = ( - self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 ) - - backbone_model_type = None - if config.backbone is not None: - backbone_model_type = config.backbone - elif config.backbone_config is not None: - backbone_model_type = config.backbone_config.model_type + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) else: - raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") - - if "resnet" in backbone_model_type: - for name, parameter in self.model.named_parameters(): - if config.use_timm_backbone: - if "layer2" not in name and "layer3" not in name and "layer4" not in name: - parameter.requires_grad_(False) - else: - if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: - parameter.requires_grad_(False) + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): - # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values) if self.config.use_timm_backbone else 
self.model(pixel_values).feature_maps + if self.disable_custom_kernels or MultiScaleDeformableAttention is None or is_torchdynamo_compiling(): + # PyTorch implementation + output = multi_scale_deformable_attention( + value, spatial_shapes_list, sampling_locations, attention_weights + ) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention( + value, spatial_shapes_list, sampling_locations, attention_weights + ) + output = self.output_proj(output) - out = [] - for feature_map in features: - # downsample pixel_mask to match shape of corresponding feature_map - mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] - out.append((feature_map, mask)) - return out + return output, attention_weights -class RFConvModel(nn.Module): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->RFDetr +class RFDetrMultiheadAttention(nn.Module): """ - This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). """ - def __init__(self, conv_encoder, position_embedding): + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): super().__init__() - self.conv_encoder = conv_encoder - self.position_embedding = position_embedding + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 - def forward(self, pixel_values, pixel_mask): - # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples - out = self.conv_encoder(pixel_values, pixel_mask) - pos = [] - for feature_map, mask in out: - # position encoding - pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - return out, pos + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + source_len = key_states.size(1) -class RFSinePositionEmbedding(nn.Module): - """ - This is a more standard version of the position embedding, very similar to the one used by the Attention is all you - need paper, generalized to work on images. 
- """ + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.embedding_dim = embedding_dim - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) - def forward(self, pixel_values, pixel_mask): - if pixel_mask is None: - raise ValueError("No pixel mask provided") - y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype) - x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype) - if self.normalize: - eps = 1e-6 - y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device) - dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) - return pos + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None -class RFEncoder(RFPreTrainedModel): - """ - Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a - [`RFEncoderLayer`]. + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. 
+ attn_output = torch.bmm(attn_probs, value_states) - Args: - config: RFConfig - """ + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) - def __init__(self, config: RFConfig): - super().__init__(config) - self.gradient_checkpointing = False + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) - self.dropout = config.dropout - self.layers = nn.ModuleList([RFEncoderLayer(config) for _ in range(config.encoder_layers)]) + attn_output = self.out_proj(attn_output) - # Initialize weights and apply final processing - self.post_init() + return attn_output, attn_weights_reshaped - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, device): - """ - Get reference points for each feature map. Used in decoder. - Args: - spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): - Spatial shapes of each feature map. - valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): - Valid ratios of each feature map. - device (`torch.device`): - Device on which to create the tensors. - Returns: - `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` - """ - reference_points_list = [] - for level, (height, width) in enumerate(spatial_shapes): - ref_y, ref_x = meshgrid( - torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), - torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device), - indexing="ij", - ) - # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 - ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) - ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) - ref = torch.stack((ref_x, ref_y), -1) - reference_points_list.append(ref) - reference_points = torch.cat(reference_points_list, 1) - reference_points = reference_points[:, :, None] * valid_ratios[:, None] - return reference_points +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->RFDetr +class RFDetrEncoderLayer(nn.Module): + def __init__(self, config: RFDetrConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = RFDetrMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, - inputs_embeds=None, - attention_mask=None, - position_embeddings=None, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, spatial_shapes=None, spatial_shapes_list=None, level_start_index=None, - valid_ratios=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + output_attentions: bool = False, ): - r""" + """ Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - - 1 for pixel features that are real (i.e. **not masked**), - - 0 for pixel features that are padding (i.e. **masked**). - [What are attention masks?](../glossary#attention-mask) - position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Position embeddings that are added to the queries and keys in each self-attention layer. - spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): - Spatial shapes of each feature map. - level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): - Starting index of each feature map. - valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): - Ratio of valid area in each feature level. + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
- output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = inputs_embeds hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) - spatial_shapes_tuple = tuple(spatial_shapes_list) - reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - for i, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - position_embeddings, - reference_points, - spatial_shapes, - spatial_shapes_list, - level_start_index, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) -class RFDecoder(RFPreTrainedModel): - """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`RFDecoderLayer`]. 
+ if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + outputs = (hidden_states,) - Some tweaks for Deformable DETR: + if output_attentions: + outputs += (attn_weights,) - - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - - it also returns a stack of intermediate outputs and reference points from all decoding layers. + return outputs - Args: - config: RFConfig - """ - def __init__(self, config: RFConfig): - super().__init__(config) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->RFDetr +class RFDetrDecoderLayer(nn.Module): + def __init__(self, config: RFDetrConfig): + super().__init__() + self.embed_dim = config.d_model + # self-attention + self.self_attn = RFDetrMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) self.dropout = config.dropout - self.layers = nn.ModuleList([RFDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.gradient_checkpointing = False - - # hack implementation for iterative bounding box refinement and two-stage Deformable DETR - self.bbox_embed = None - self.class_embed = None + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout - # Initialize weights and apply final processing - self.post_init() + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = RFDetrMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - position_embeddings=None, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, reference_points=None, spatial_shapes=None, spatial_shapes_list=None, level_start_index=None, - valid_ratios=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, ): - r""" + """ Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): - The query embeddings that are passed into the decoder. - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - of the decoder. - encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected - in `[0, 1]`: - - 1 for pixels that are real (i.e. **not masked**), - - 0 for pixels that are padding (i.e. **masked**). 
- position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Position embeddings that are added to the queries and keys in each self-attention layer. - reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): - Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. - spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): - Spatial shapes of the feature maps. - level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): - Indexes for the start of each feature level. In range `[0, sequence_length]`. - valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): - Ratio of valid area in each feature level. - + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if inputs_embeds is not None: - hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - intermediate = () - intermediate_reference_points = () + second_residual = hidden_states - for idx, decoder_layer in enumerate(self.layers): - num_coordinates = reference_points.shape[-1] - if num_coordinates == 4: - reference_points_input = ( - reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] - ) - elif reference_points.shape[-1] == 2: - reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] - else: - raise ValueError("Reference points' last dimension must be of size 2") + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) - if output_hidden_states: - all_hidden_states += (hidden_states,) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - position_embeddings, - reference_points_input, - spatial_shapes, - spatial_shapes_list, - level_start_index, - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - position_embeddings=position_embeddings, - encoder_hidden_states=encoder_hidden_states, - reference_points=reference_points_input, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - ) + hidden_states = self.encoder_attn_layer_norm(hidden_states) - hidden_states = layer_outputs[0] + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = 
self.final_layer_norm(hidden_states) - # hack implementation for iterative bounding box refinement - if self.bbox_embed is not None: - tmp = self.bbox_embed[idx](hidden_states) - num_coordinates = reference_points.shape[-1] - if num_coordinates == 4: - new_reference_points = tmp + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - elif num_coordinates == 2: - new_reference_points = tmp - new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - else: - raise ValueError( - f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" - ) - reference_points = new_reference_points.detach() + outputs = (hidden_states,) - intermediate += (hidden_states,) - intermediate_reference_points += (reference_points,) + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) - if output_attentions: - all_self_attns += (layer_outputs[1],) + return outputs - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) - # Keep batch_size as first dimension - intermediate = torch.stack(intermediate, dim=1) - intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->RFDetr +class RFDetrPreTrainedModel(PreTrainedModel): + config_class = RFDetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [r"RFDetrConvEncoder", r"RFDetrEncoderLayer", r"RFDetrDecoderLayer"] - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) + def _init_weights(self, module): + std = self.config.init_std - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - intermediate, - intermediate_reference_points, - all_hidden_states, - all_self_attns, - all_cross_attentions, - ] - if v is not None + if isinstance(module, RFDetrLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, RFDetrMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads ) - return RFDecoderOutput( - last_hidden_state=hidden_states, - intermediate_hidden_states=intermediate, - intermediate_reference_points=intermediate_reference_points, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -def build_position_encoding(config): - n_steps = config.d_model // 2 - if config.position_embedding_type == "sine": - # TODO find a better way of exposing other arguments - position_embedding = RFSinePositionEmbedding(n_steps, normalize=True) - elif config.position_embedding_type == "learned": - position_embedding = RFLearnedPositionEmbedding(n_steps) - else: - raise ValueError(f"Not supported {config.position_embedding_type}") - - return position_embedding + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with 
torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) -R_F_START_DOCSTRING = r""" +RFDETR_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) @@ -2697,18 +1105,18 @@ def build_position_encoding(config): and behavior. Parameters: - config ([`RFConfig`]): + config ([`RFDetrConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -R_F_INPUTS_DOCSTRING = r""" +RFDETR_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`AutoImageProcessor`]. See [`RFImageProcessor.__call__`] + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): @@ -2742,461 +1150,362 @@ def build_position_encoding(config): """ -@add_start_docstrings( +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->RFDetr +class RFDetrEncoder(RFDetrPreTrainedModel): """ - The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw - hidden-states without any specific head on top. 
- """, - R_F_START_DOCSTRING, -) -class RFDetrModel(RFPreTrainedModel): - def __init__(self, config: RFConfig): - super().__init__(config) - - # Create backbone + positional encoding - backbone = RFConvEncoder(config) - position_embeddings = build_position_encoding(config) - self.backbone = RFConvModel(backbone, position_embeddings) - - # Create input projection layers - if config.num_feature_levels > 1: - num_backbone_outs = len(backbone.intermediate_channel_sizes) - input_proj_list = [] - for _ in range(num_backbone_outs): - in_channels = backbone.intermediate_channel_sizes[_] - input_proj_list.append( - nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=1), - nn.GroupNorm(32, config.d_model), - ) - ) - for _ in range(config.num_feature_levels - num_backbone_outs): - input_proj_list.append( - nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), - nn.GroupNorm(32, config.d_model), - ) - ) - in_channels = config.d_model - self.input_proj = nn.ModuleList(input_proj_list) - else: - self.input_proj = nn.ModuleList( - [ - nn.Sequential( - nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), - nn.GroupNorm(32, config.d_model), - ) - ] - ) + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`RFDetrEncoderLayer`]. - if not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. - self.encoder = RFEncoder(config) - self.decoder = RFDecoder(config) + Args: + config: RFDetrConfig + """ - self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + def __init__(self, config: RFDetrConfig): + super().__init__(config) + self.gradient_checkpointing = False - if config.two_stage: - self.enc_output = nn.Linear(config.d_model, config.d_model) - self.enc_output_norm = nn.LayerNorm(config.d_model) - self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) - self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) - else: - self.reference_points = nn.Linear(config.d_model, 2) + self.dropout = config.dropout + self.layers = nn.ModuleList([RFDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) + # Initialize weights and apply final processing self.post_init() - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def freeze_backbone(self): - for name, param in self.backbone.conv_encoder.model.named_parameters(): - param.requires_grad_(False) - - def unfreeze_backbone(self): - for name, param in self.backbone.conv_encoder.model.named_parameters(): - param.requires_grad_(True) - - def get_valid_ratio(self, mask, dtype=torch.float32): - """Get the valid ratio of all feature maps.""" - - _, height, width = mask.shape - valid_height = torch.sum(mask[:, :, 0], 1) - valid_width = torch.sum(mask[:, 0, :], 1) - valid_ratio_height = valid_height.to(dtype) / height - valid_ratio_width = valid_width.to(dtype) / width - valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) - return valid_ratio - - def get_proposal_pos_embed(self, proposals): - """Get the position embedding of the proposals.""" - - num_pos_feats = self.config.d_model // 2 - temperature = 10000 - scale = 2 * math.pi - - dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) - dim_t = temperature ** (2 * torch.div(dim_t, 2, 
rounding_mode="floor") / num_pos_feats) - # batch_size, num_queries, 4 - proposals = proposals.sigmoid() * scale - # batch_size, num_queries, 4, 128 - pos = proposals[:, :, :, None] / dim_t - # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 - pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) - return pos - - def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): - """Generate the encoder output proposals from encoded enc_output. + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. Args: - enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. - padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. - spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. - + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. Returns: - `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. - - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to - directly predict a bounding box. (without the need of a decoder) - - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse - sigmoid. + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` """ - batch_size = enc_output.shape[0] - proposals = [] - _cur = 0 + reference_points_list = [] for level, (height, width) in enumerate(spatial_shapes): - mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) - valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) - valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) - - grid_y, grid_x = meshgrid( - torch.linspace(0, height - 1, height, dtype=enc_output.dtype, device=enc_output.device), - torch.linspace(0, width - 1, width, dtype=enc_output.dtype, device=enc_output.device), + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device), indexing="ij", ) - grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points - scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) - grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale - width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) - proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) - proposals.append(proposal) - _cur += height * width - output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) - output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid - output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) - output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
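# Standalone sketch (not part of the patch) of the normalized reference points built by
# `RFDetrEncoder.get_reference_points` above, using two toy feature maps. Every location of
# every level gets an (x, y) coordinate in [0, 1], scaled by the valid (unpadded) fraction of
# that map; the shapes and ratios below are illustrative.
import torch

spatial_shapes = [(2, 3), (1, 2)]                      # toy multi-scale maps: 2x3 and 1x2
valid_ratios = torch.ones(1, len(spatial_shapes), 2)   # batch of 1, no padding anywhere
points = []
for level, (height, width) in enumerate(spatial_shapes):
    ref_y, ref_x = torch.meshgrid(
        torch.linspace(0.5, height - 0.5, height),
        torch.linspace(0.5, width - 0.5, width),
        indexing="ij",
    )
    ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
    ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
    points.append(torch.stack((ref_x, ref_y), -1))
reference_points = torch.cat(points, 1)                # (1, 2*3 + 1*2, 2) == (1, 8, 2)
print(reference_points.shape)                          # the encoder then broadcasts this against valid_ratios once more, one copy per level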
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + spatial_shapes_tuple = tuple(spatial_shapes_list) + reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + position_embeddings, + reference_points, + spatial_shapes, + spatial_shapes_list, + level_start_index, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) - # assign each pixel as an object query - object_query = enc_output - object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) - object_query = object_query.masked_fill(~output_proposals_valid, float(0)) - object_query = self.enc_output_norm(self.enc_output(object_query)) - return object_query, output_proposals + hidden_states = layer_outputs[0] - @add_start_docstrings_to_model_forward(R_F_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=RFModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: torch.FloatTensor, - pixel_mask: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_outputs: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], RFModelOutput]: - r""" - Returns: + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) - Examples: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) - ```python - >>> from transformers import AutoImageProcessor, RFDetrModel - >>> from PIL import Image - >>> import requests + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") - >>> model = RFDetrModel.from_pretrained("SenseTime/deformable-detr") +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->RFDetr +class RFDetrDecoder(RFDetrPreTrainedModel): + """ + Transformer decoder 
consisting of *config.decoder_layers* layers. Each layer is a [`RFDetrDecoderLayer`]. - >>> inputs = image_processor(images=image, return_tensors="pt") + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - >>> outputs = model(**inputs) + Some tweaks for Deformable DETR: - >>> last_hidden_states = outputs.last_hidden_state - >>> list(last_hidden_states.shape) - [1, 300, 256] - ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. - batch_size, num_channels, height, width = pixel_values.shape - device = pixel_values.device + Args: + config: RFDetrConfig + """ - if pixel_mask is None: - pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + def __init__(self, config: RFDetrConfig): + super().__init__(config) - # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) - # First, sent pixel_values + pixel_mask through Backbone to obtain the features - # which is a list of tuples - features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + self.dropout = config.dropout + self.layers = nn.ModuleList([RFDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False - # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) - sources = [] - masks = [] - for level, (source, mask) in enumerate(features): - sources.append(self.input_proj[level](source)) - masks.append(mask) - if mask is None: - raise ValueError("No attention mask was provided") + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None - # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage - if self.config.num_feature_levels > len(sources): - _len_sources = len(sources) - for level in range(_len_sources, self.config.num_feature_levels): - if level == _len_sources: - source = self.input_proj[level](features[-1][0]) - else: - source = self.input_proj[level](sources[-1]) - mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to( - torch.bool - )[0] - pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) - sources.append(source) - masks.append(mask) - position_embeddings_list.append(pos_l) + # Initialize weights and apply final processing + self.post_init() - # Create queries - query_embeds = None - if not self.config.two_stage: - query_embeds = self.query_position_embeddings.weight + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the 
decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each self-attention layer. + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. - # Prepare encoder inputs (by flattening) - source_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes_list = [] - for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): - batch_size, num_channels, height, width = source.shape - spatial_shape = (height, width) - spatial_shapes_list.append(spatial_shape) - source = source.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) - source_flatten.append(source) - mask_flatten.append(mask) - source_flatten = torch.cat(source_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
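# Standalone sketch (not part of the patch) of the `valid_ratios` argument documented above,
# mirroring the model's `get_valid_ratio` helper: for each image, the fraction of the padded
# feature map that is real content, ordered as (width_ratio, height_ratio). The mask below is
# made up: a 4x6 padded map whose top-left 3x4 block is real.
import torch

mask = torch.zeros(1, 4, 6, dtype=torch.bool)
mask[:, :3, :4] = True
_, height, width = mask.shape
valid_height = mask[:, :, 0].sum(1)                        # 3 real rows
valid_width = mask[:, 0, :].sum(1)                         # 4 real columns
valid_ratios = torch.stack([valid_width / width, valid_height / height], -1)
print(valid_ratios)                                        # tensor([[0.6667, 0.7500]])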
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder - # Also provide spatial_shapes, level_start_index and valid_ratios - if encoder_outputs is None: - encoder_outputs = self.encoder( - inputs_embeds=source_flatten, - attention_mask=mask_flatten, - position_embeddings=lvl_pos_embed_flatten, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) + if inputs_embeds is not None: + hidden_states = inputs_embeds - # Fifth, prepare decoder inputs - batch_size, _, num_channels = encoder_outputs[0].shape - enc_outputs_class = None - enc_outputs_coord_logits = None - if self.config.two_stage: - object_query_embedding, output_proposals = self.gen_encoder_output_proposals( - encoder_outputs[0], ~mask_flatten, spatial_shapes_list - ) + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () - # hack implementation for two-stage Deformable DETR - # apply a detection head to each pixel (A.4 in paper) - # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) - # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) - enc_outputs_coord_logits = delta_bbox + output_proposals + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif reference_points.shape[-1] == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Reference points' last dimension must be of size 2") - # only keep top scoring `config.two_stage_num_proposals` proposals - topk = self.config.two_stage_num_proposals - topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] - topk_coords_logits = torch.gather( - enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) - ) + if output_hidden_states: + all_hidden_states += (hidden_states,) - topk_coords_logits = topk_coords_logits.detach() - reference_points = topk_coords_logits.sigmoid() - init_reference_points = reference_points - pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) - query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) - else: - query_embed, target = torch.split(query_embeds, num_channels, dim=1) - query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) - target = target.unsqueeze(0).expand(batch_size, -1, -1) - reference_points = self.reference_points(query_embed).sigmoid() - init_reference_points = reference_points + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + position_embeddings, + reference_points_input, + spatial_shapes, + spatial_shapes_list, + level_start_index, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) - decoder_outputs = self.decoder( - inputs_embeds=target, - position_embeddings=query_embed, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=mask_flatten, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + hidden_states = layer_outputs[0] - if not return_dict: - enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) - tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = 
new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() - return tuple_outputs + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) - return RFModelOutput( - init_reference_points=init_reference_points, - last_hidden_state=decoder_outputs.last_hidden_state, - intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, - intermediate_reference_points=decoder_outputs.intermediate_reference_points, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - enc_outputs_class=enc_outputs_class, - enc_outputs_coord_logits=enc_outputs_coord_logits, - ) + if output_attentions: + all_self_attns += (layer_outputs[1],) + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) -@dataclass -class RFObjectDetectionOutput(ModelOutput): - """ - Output type of [`RFForObjectDetection`]. + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): - Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a - bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized - scale-invariant IoU loss. - loss_dict (`Dict`, *optional*): - A dictionary containing the individual losses. Useful for logging. - logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): - Classification logits (including no-object) for all queries. - pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): - Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These - values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~RFProcessor.post_process_object_detection`] to retrieve the - unnormalized bounding boxes. - auxiliary_outputs (`list[Dict]`, *optional*): - Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) - and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and - `pred_boxes`) for each decoder layer. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the decoder of the model. - decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer - plus the initial embedding outputs. 
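# Standalone sketch (not part of the patch) of the iterative box refinement done in the
# decoder loop above: each layer's box head predicts a delta in inverse-sigmoid (logit)
# space, which is added to the previous reference box and squashed back into [0, 1].
# `inverse_sigmoid_ref` is a local stand-in for the library's `inverse_sigmoid` helper,
# and the numbers are made up.
import torch

def inverse_sigmoid_ref(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

reference = torch.tensor([[0.50, 0.50, 0.20, 0.30]])   # (center_x, center_y, width, height) from the previous layer
delta = torch.tensor([[0.40, -0.20, 0.10, 0.00]])      # predicted by this layer's bbox_embed
refined = (delta + inverse_sigmoid_ref(reference)).sigmoid()
print(refined)                                          # approx [[0.599, 0.450, 0.216, 0.300]]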
- decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, - num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted - average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder of the model. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each - layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, - 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average - in the self-attention heads. - intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): - Stacked intermediate hidden states (output of each layer of the decoder). - intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): - Stacked intermediate reference points (reference points of each layer of the decoder). - init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): - Initial reference points sent through the Transformer decoder. - enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are - picked as region proposals in the first stage. Output of bounding box binary classification (i.e. - foreground and background). - enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): - Logits of predicted bounding boxes coordinates in the first stage. 
- """ + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) - loss: Optional[torch.FloatTensor] = None - loss_dict: Optional[Dict] = None - logits: torch.FloatTensor = None - pred_boxes: torch.FloatTensor = None - auxiliary_outputs: Optional[List[Dict]] = None - init_reference_points: Optional[torch.FloatTensor] = None - last_hidden_state: Optional[torch.FloatTensor] = None - intermediate_hidden_states: Optional[torch.FloatTensor] = None - intermediate_reference_points: Optional[torch.FloatTensor] = None - decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - enc_outputs_class: Optional = None - enc_outputs_coord_logits: Optional = None + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return RFDetrDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) @add_start_docstrings( """ - The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + The bare RF DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without any specific head on top. 
""", - R_F_START_DOCSTRING, + RFDETR_START_DOCSTRING, ) -class RFModel(RFPreTrainedModel): - def __init__(self, config: RFConfig): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMLPPredictionHead with DeformableDetr->RFDetr, DEFORMABLE_DETR->RFDETR +class RFDetrModel(RFDetrPreTrainedModel): + def __init__(self, config: RFDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = RFConvEncoder(config) + backbone = RFDetrConvEncoder(config) position_embeddings = build_position_encoding(config) - self.backbone = RFConvModel(backbone, position_embeddings) + self.backbone = RFDetrConvModel(backbone, position_embeddings) # Create input projection layers if config.num_feature_levels > 1: @@ -3232,8 +1541,8 @@ def __init__(self, config: RFConfig): if not config.two_stage: self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) - self.encoder = RFEncoder(config) - self.decoder = RFDecoder(config) + self.encoder = RFDetrEncoder(config) + self.decoder = RFDetrDecoder(config) self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) @@ -3338,8 +1647,8 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) object_query = self.enc_output_norm(self.enc_output(object_query)) return object_query, output_proposals - @add_start_docstrings_to_model_forward(R_F_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=RFModelOutput, config_class=_CONFIG_FOR_DOC) + @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFDetrModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -3351,14 +1660,14 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], RFModelOutput]: + ) -> Union[Tuple[torch.FloatTensor], RFDetrModelOutput]: r""" Returns: Examples: ```python - >>> from transformers import AutoImageProcessor, RFModel + >>> from transformers import AutoImageProcessor, RFDetrModel >>> from PIL import Image >>> import requests @@ -3366,7 +1675,7 @@ def forward( >>> image = Image.open(requests.get(url, stream=True).raw) >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") - >>> model = RFModel.from_pretrained("SenseTime/deformable-detr") + >>> model = RFDetrModel.from_pretrained("SenseTime/deformable-detr") >>> inputs = image_processor(images=image, return_tensors="pt") @@ -3526,7 +1835,7 @@ def forward( return tuple_outputs - return RFModelOutput( + return RFDetrModelOutput( init_reference_points=init_reference_points, last_hidden_state=decoder_outputs.last_hidden_state, intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, @@ -3542,217 +1851,222 @@ def forward( ) -class RFMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. 
- - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py - - """ - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) - - -@add_start_docstrings( - """ - Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on - top, for tasks such as COCO detection. - """, - R_F_START_DOCSTRING, -) -class RFDetrForObjectDetection(RFPreTrainedModel): - # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] - # We can't initialize the model on meta device as some weights are modified during the initialization - _no_split_modules = None - - def __init__(self, config: RFConfig): - super().__init__(config) - - # Deformable DETR encoder-decoder model - self.model = RFModel(config) - # Detection heads on top - self.class_embed = nn.Linear(config.d_model, config.num_labels) - self.bbox_embed = RFMLPPredictionHead( - input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 - ) - - prior_prob = 0.01 - bias_value = -math.log((1 - prior_prob) / prior_prob) - self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value - nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) - nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) - - # if two-stage, the last class_embed and bbox_embed is for region proposal generation - num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers - if config.with_box_refine: - self.class_embed = _get_clones(self.class_embed, num_pred) - self.bbox_embed = _get_clones(self.bbox_embed, num_pred) - nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) - # hack implementation for iterative bounding box refinement - self.model.decoder.bbox_embed = self.bbox_embed - else: - nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) - self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) - self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) - self.model.decoder.bbox_embed = None - if config.two_stage: - # hack implementation for two-stage - self.model.decoder.class_embed = self.class_embed - for box_embed in self.bbox_embed: - nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(R_F_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=RFObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: torch.FloatTensor, - pixel_mask: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_outputs: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[List[dict]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> 
Union[Tuple[torch.FloatTensor], RFObjectDetectionOutput]: - r""" - labels (`List[Dict]` of len `(batch_size,)`, *optional*): - Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the - following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch - respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes - in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. - - Returns: - - Examples: - - ```python - >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") - >>> model = RFDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") - - >>> inputs = image_processor(images=image, return_tensors="pt") - >>> outputs = model(**inputs) - - >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) - >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ - ... 0 - ... ] - >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): - ... box = [round(i, 2) for i in box.tolist()] - ... print( - ... f"Detected {model.config.id2label[label.item()]} with confidence " - ... f"{round(score.item(), 3)} at location {box}" - ... ) - Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] - Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] - Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # First, sent images through DETR base model to obtain encoder + decoder outputs - outputs = self.model( - pixel_values, - pixel_mask=pixel_mask, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] - init_reference = outputs.init_reference_points if return_dict else outputs[0] - inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] - - # class logits + predicted bounding boxes - outputs_classes = [] - outputs_coords = [] - - for level in range(hidden_states.shape[1]): - if level == 0: - reference = init_reference - else: - reference = inter_references[:, level - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.class_embed[level](hidden_states[:, level]) - delta_bbox = self.bbox_embed[level](hidden_states[:, level]) - if reference.shape[-1] == 4: - outputs_coord_logits = delta_bbox + reference - elif reference.shape[-1] == 2: - delta_bbox[..., :2] += reference - outputs_coord_logits = delta_bbox - else: - raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") - outputs_coord = outputs_coord_logits.sigmoid() - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - 
outputs_class = torch.stack(outputs_classes) - outputs_coord = torch.stack(outputs_coords) - - logits = outputs_class[-1] - pred_boxes = outputs_coord[-1] - - loss, loss_dict, auxiliary_outputs = None, None, None - if labels is not None: - loss, loss_dict, auxiliary_outputs = self.loss_function( - logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord - ) - if not return_dict: - if auxiliary_outputs is not None: - output = (logits, pred_boxes) + auxiliary_outputs + outputs - else: - output = (logits, pred_boxes) + outputs - tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output - - return tuple_outputs - - dict_outputs = RFObjectDetectionOutput( - loss=loss, - loss_dict=loss_dict, - logits=logits, - pred_boxes=pred_boxes, - auxiliary_outputs=auxiliary_outputs, - last_hidden_state=outputs.last_hidden_state, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - intermediate_hidden_states=outputs.intermediate_hidden_states, - intermediate_reference_points=outputs.intermediate_reference_points, - init_reference_points=outputs.init_reference_points, - enc_outputs_class=outputs.enc_outputs_class, - enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, - ) - - return dict_outputs +# # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMLPPredictionHead with DeformableDetr->RFDetr +# class RFDetrMLPPredictionHead(nn.Module): +# """ +# Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, +# height and width of a bounding box w.r.t. an image. + +# Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + +# """ + +# def __init__(self, input_dim, hidden_dim, output_dim, num_layers): +# super().__init__() +# self.num_layers = num_layers +# h = [hidden_dim] * (num_layers - 1) +# self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + +# def forward(self, x): +# for i, layer in enumerate(self.layers): +# x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) +# return x + + +# @add_start_docstrings( +# """ +# RF DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on +# top, for tasks such as COCO detection. 
+# """, +# RFDETR_START_DOCSTRING, +# ) +# # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with DeformableDetr->RFDetr +# class RFDetrForObjectDetection(RFDetrPreTrainedModel): +# # When using clones, all layers > 0 will be clones, but layer 0 *is* required +# _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] +# # We can't initialize the model on meta device as some weights are modified during the initialization +# _no_split_modules = None + +# def __init__(self, config: RFDetrConfig): +# super().__init__(config) + +# # Deformable DETR encoder-decoder model +# self.model = RFDetrModel(config) +# # Detection heads on top +# self.class_embed = nn.Linear(config.d_model, config.num_labels) +# self.bbox_embed = RFDetrMLPPredictionHead( +# input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 +# ) + +# prior_prob = 0.01 +# bias_value = -math.log((1 - prior_prob) / prior_prob) +# self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value +# nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) +# nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + +# # if two-stage, the last class_embed and bbox_embed is for region proposal generation +# num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers +# if config.with_box_refine: +# self.class_embed = _get_clones(self.class_embed, num_pred) +# self.bbox_embed = _get_clones(self.bbox_embed, num_pred) +# nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) +# # hack implementation for iterative bounding box refinement +# self.model.decoder.bbox_embed = self.bbox_embed +# else: +# nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) +# self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) +# self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) +# self.model.decoder.bbox_embed = None +# if config.two_stage: +# # hack implementation for two-stage +# self.model.decoder.class_embed = self.class_embed +# for box_embed in self.bbox_embed: +# nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + +# # Initialize weights and apply final processing +# self.post_init() + +# @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) +# @replace_return_docstrings(output_type=RFDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) +# def forward( +# self, +# pixel_values: torch.FloatTensor, +# pixel_mask: Optional[torch.LongTensor] = None, +# decoder_attention_mask: Optional[torch.FloatTensor] = None, +# encoder_outputs: Optional[torch.FloatTensor] = None, +# inputs_embeds: Optional[torch.FloatTensor] = None, +# decoder_inputs_embeds: Optional[torch.FloatTensor] = None, +# labels: Optional[List[dict]] = None, +# output_attentions: Optional[bool] = None, +# output_hidden_states: Optional[bool] = None, +# return_dict: Optional[bool] = None, +# ) -> Union[Tuple[torch.FloatTensor], RFDetrObjectDetectionOutput]: +# r""" +# labels (`List[Dict]` of len `(batch_size,)`, *optional*): +# Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the +# following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch +# respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes +# in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. 
+ +# Returns: + +# Examples: + +# ```python +# >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection +# >>> from PIL import Image +# >>> import requests + +# >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +# >>> image = Image.open(requests.get(url, stream=True).raw) + +# >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") +# >>> model = RFDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") + +# >>> inputs = image_processor(images=image, return_tensors="pt") +# >>> outputs = model(**inputs) + +# >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) +# >>> target_sizes = torch.tensor([image.size[::-1]]) +# >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ +# ... 0 +# ... ] +# >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): +# ... box = [round(i, 2) for i in box.tolist()] +# ... print( +# ... f"Detected {model.config.id2label[label.item()]} with confidence " +# ... f"{round(score.item(), 3)} at location {box}" +# ... ) +# Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] +# Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] +# Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] +# ```""" +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +# # First, sent images through DETR base model to obtain encoder + decoder outputs +# outputs = self.model( +# pixel_values, +# pixel_mask=pixel_mask, +# decoder_attention_mask=decoder_attention_mask, +# encoder_outputs=encoder_outputs, +# inputs_embeds=inputs_embeds, +# decoder_inputs_embeds=decoder_inputs_embeds, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) + +# hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] +# init_reference = outputs.init_reference_points if return_dict else outputs[0] +# inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + +# # class logits + predicted bounding boxes +# outputs_classes = [] +# outputs_coords = [] + +# for level in range(hidden_states.shape[1]): +# if level == 0: +# reference = init_reference +# else: +# reference = inter_references[:, level - 1] +# reference = inverse_sigmoid(reference) +# outputs_class = self.class_embed[level](hidden_states[:, level]) +# delta_bbox = self.bbox_embed[level](hidden_states[:, level]) +# if reference.shape[-1] == 4: +# outputs_coord_logits = delta_bbox + reference +# elif reference.shape[-1] == 2: +# delta_bbox[..., :2] += reference +# outputs_coord_logits = delta_bbox +# else: +# raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") +# outputs_coord = outputs_coord_logits.sigmoid() +# outputs_classes.append(outputs_class) +# outputs_coords.append(outputs_coord) +# outputs_class = torch.stack(outputs_classes) +# outputs_coord = torch.stack(outputs_coords) + +# logits = outputs_class[-1] +# pred_boxes = outputs_coord[-1] + +# loss, loss_dict, auxiliary_outputs = None, None, None +# if labels is not None: +# loss, loss_dict, auxiliary_outputs = self.loss_function( +# logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord +# ) +# if not return_dict: +# if auxiliary_outputs is not None: +# output = (logits, pred_boxes) + 
auxiliary_outputs + outputs +# else: +# output = (logits, pred_boxes) + outputs +# tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + +# return tuple_outputs + +# dict_outputs = RFDetrObjectDetectionOutput( +# loss=loss, +# loss_dict=loss_dict, +# logits=logits, +# pred_boxes=pred_boxes, +# auxiliary_outputs=auxiliary_outputs, +# last_hidden_state=outputs.last_hidden_state, +# decoder_hidden_states=outputs.decoder_hidden_states, +# decoder_attentions=outputs.decoder_attentions, +# cross_attentions=outputs.cross_attentions, +# encoder_last_hidden_state=outputs.encoder_last_hidden_state, +# encoder_hidden_states=outputs.encoder_hidden_states, +# encoder_attentions=outputs.encoder_attentions, +# intermediate_hidden_states=outputs.intermediate_hidden_states, +# intermediate_reference_points=outputs.intermediate_reference_points, +# init_reference_points=outputs.init_reference_points, +# enc_outputs_class=outputs.enc_outputs_class, +# enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, +# ) + +# return dict_outputs + + +__all__ = [ + "RFDetrForObjectDetection", + "RFDetrModel", + "RFDetrPreTrainedModel", +] diff --git a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..abe0b378e4b7 --- /dev/null +++ b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py @@ -0,0 +1,850 @@ +import collections +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_rf_detr_dinov2_with_registers import ( + RFDetrDinov2WithRegistersConfig, +) + + +logger = logging.get_logger(__name__) + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/dinov2_with_registers-base" # TODO + +_CONFIG_FOR_DOC = "RFDetrDinov2WithRegistersConfig" + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersPatchEmbeddings with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
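# Standalone sketch (not part of the patch) of the patch-embedding step described above: a
# Conv2d whose kernel size and stride both equal the patch size cuts the image into
# non-overlapping patches and projects each one to the hidden size. The sizes below are
# illustrative.
import torch
from torch import nn

image_size, patch_size, hidden_size = 224, 16, 768
projection = nn.Conv2d(3, hidden_size, kernel_size=patch_size, stride=patch_size)
pixel_values = torch.randn(1, 3, image_size, image_size)
patches = projection(pixel_values)                 # (1, 768, 14, 14)
embeddings = patches.flatten(2).transpose(1, 2)    # (1, 196, 768) = (batch, num_patches, hidden_size)
print(embeddings.shape)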
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class RFDetrDinov2WithRegistersEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, register tokens, position and patch embeddings. + """ + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = RFDetrDinov2WithRegistersPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + if self.config.num_windows > 1: + # reshape for windows + num_h_patches = height // self.config.patch_size + num_w_patches = width // self.config.patch_size + cls_token_with_pos_embed = embeddings[:, :1] + pixel_tokens_with_pos_embed = embeddings[:, 1:] + pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( + batch_size, num_h_patches, num_w_patches, -1 + ) + num_w_patches_per_window = num_w_patches // self.config.num_windows + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_windows = self.config.num_windows + windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( + batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 + ) + windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) + windowed_pixel_tokens = windowed_pixel_tokens.reshape( + batch_size * num_windows**2, 
num_h_patches_per_window * num_w_patches_per_window, -1 + ) + windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) + embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) + + # add register tokens + embeddings = ( + torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + if self.config.num_register_tokens > 0 + else embeddings + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersSelfAttention with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersSelfAttention(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersSelfOutput with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersSelfOutput(nn.Module): + """ + The residual connection is defined in RFDetrDinov2WithRegistersLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
+ """ + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersAttention with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersAttention(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.attention = RFDetrDinov2WithRegistersSelfAttention(config) + self.output = RFDetrDinov2WithRegistersSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersLayerScale with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersDropPath with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersMLP with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersSwiGLUFFN with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class RFDetrDinov2WithRegistersLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = RFDetrDinov2WithRegistersAttention(config) + self.layer_scale1 = RFDetrDinov2WithRegistersLayerScale(config) + self.drop_path = ( + RFDetrDinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = RFDetrDinov2WithRegistersSwiGLUFFN(config) + else: + self.mlp = RFDetrDinov2WithRegistersMLP(config) + self.layer_scale2 = 
RFDetrDinov2WithRegistersLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + run_full_attention: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + assert head_mask is None, "head_mask is not supported for windowed attention" + assert not output_attentions, "output_attentions is not supported for windowed attention" + shortcut = hidden_states + if run_full_attention: + # reshape x to remove windows + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) + + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + if run_full_attention: + # reshape x to add windows back + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) + attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + shortcut + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersEncoder with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersEncoder(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([RFDetrDinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v 
is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersPreTrainedModel with Dinov2WithRegisters->RFDetrDinov2WithRegisters +class RFDetrDinov2WithRegistersPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RFDetrDinov2WithRegistersConfig + base_model_prefix = "dinov2_with_registers" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["RFDetrDinov2WithRegistersSwiGLUFFN"] + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, RFDetrDinov2WithRegistersEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + +_EXPECTED_OUTPUT_SHAPE = [1, 257, 768] + +RFDETR_DINOV2_WITH_REGISTERS_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`Dinov2WithRegistersConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +RFDETR_DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BitImageProcessor.preprocess`] for details. + + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare RFDetrDinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top.", + RFDETR_DINOV2_WITH_REGISTERS_START_DOCSTRING, +) +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersModel with Dinov2WithRegisters->RFDetrDinov2WithRegisters, DINOV2_WITH_REGISTERS->RFDETR_DINOV2_WITH_REGISTERS +class RFDetrDinov2WithRegistersModel(RFDetrDinov2WithRegistersPreTrainedModel): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.config = config + + self.embeddings = RFDetrDinov2WithRegistersEmbeddings(config) + self.encoder = RFDetrDinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> RFDetrDinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(RFDETR_DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + head_outputs = (sequence_output, pooled_output) + 
return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +RFDETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BitImageProcessor.preprocess`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + """ + Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer. + """, + RFDETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING, +) +# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersBackbone with Dinov2WithRegisters->RFDetrDinov2WithRegisters, DINOV2_WITH_REGISTERS->RFDETR_DINOV2_WITH_REGISTERS +class RFDetrDinov2WithRegistersBackbone(RFDetrDinov2WithRegistersPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = RFDetrDinov2WithRegistersEmbeddings(config) + self.encoder = RFDetrDinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.num_register_tokens = config.num_register_tokens + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> RFDetrDinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + @add_start_docstrings_to_model_forward(RFDETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + Returns: + + Examples: + + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... 
) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +__all__ = ["RFDetrDinov2WithRegistersBackbone", "RFDetrDinov2WithRegistersPreTrainedModel"] diff --git a/src/transformers/models/rf_detr/modular_rf_detr.py b/src/transformers/models/rf_detr/modular_rf_detr.py deleted file mode 100644 index 30de1bdcd4a5..000000000000 --- a/src/transformers/models/rf_detr/modular_rf_detr.py +++ /dev/null @@ -1,543 +0,0 @@ -from numbers import Number -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - -from ...activations import ACT2FN -from ...configuration_utils import PretrainedConfig -from ...modeling_outputs import BackboneOutput, BaseModelOutput -from ..auto import CONFIG_MAPPING -from ..deformable_detr.modeling_deformable_detr import ( - DeformableDetrDecoder, - DeformableDetrDecoderLayer, - DeformableDetrEncoder, - DeformableDetrForObjectDetection, - DeformableDetrModel, - DeformableDetrPreTrainedModel, -) -from ..dinov2_with_registers.configuration_dinov2_with_registers import Dinov2WithRegistersConfig -from ..dinov2_with_registers.modeling_dinov2_with_registers import ( - Dinov2WithRegistersBackbone, - Dinov2WithRegistersEmbeddings, - Dinov2WithRegistersEncoder, - Dinov2WithRegistersLayer, -) -from ..vitdet.modeling_vitdet import VitDetLayerNorm - - -class RFDetrConfig(PretrainedConfig): - model_type = "rf_detr" - sub_configs = {"backbone_config": Dinov2WithRegistersConfig} - - def __init__( - self, - backbone_config=None, - num_windows: int = 4, - window_block_indexes=None, - out_feature_indexes: List[int] = [2, 5, 8, 11], - scale_factors: List[Number[2.0, 1.0, 0.5, 0.25]] = [1.0], - layer_norm: bool = False, - rms_norm: bool = 
False, - **kwargs, - ): - self.out_feature_indexes = out_feature_indexes - - if isinstance(backbone_config, dict): - backbone_config["out_indices"] = out_feature_indexes - backbone_config["model_type"] = ( - backbone_config["model_type"] if "model_type" in backbone_config else "dinov2_with_registers" - ) - backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config) - elif backbone_config is None: - backbone_config = CONFIG_MAPPING["dinov2_with_registers"](out_indices=out_feature_indexes) - self.backbone_config = backbone_config - self.backbone_config.num_windows = num_windows - self.backbone_config.window_block_indexes = ( - list(range(backbone_config.num_hidden_layers)) if window_block_indexes is None else window_block_indexes - ) - - self.scale_factors = [1.0] if scale_factors is None else scale_factors - assert self.scale_factors > 0, "scale_factors must be a list of at least one element" - assert sorted(self.scale_factors) == self.scale_factors, "scale_factors must be sorted" - assert all(scale in [2.0, 1.0, 0.5, 0.25] for scale in self.scale_factors), ( - "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]" - ) - - self.layer_norm = layer_norm - self.rms_norm = rms_norm - super().__init__(**kwargs) - - -class RFDetrEmbeddings(Dinov2WithRegistersEmbeddings): - def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: - batch_size, _, height, width = pixel_values.shape - target_dtype = self.patch_embeddings.projection.weight.dtype - embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) - - if bool_masked_pos is not None: - embeddings = torch.where( - bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings - ) - - # add the [CLS] token to the embedded patch tokens - cls_tokens = self.cls_token.expand(batch_size, -1, -1) - embeddings = torch.cat((cls_tokens, embeddings), dim=1) - - # add positional encoding to each token - embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) - - if self.config.num_windows > 1: - # reshape for windows - num_h_patches = height // self.config.patch_size - num_w_patches = width // self.config.patch_size - cls_token_with_pos_embed = embeddings[:, :1] - pixel_tokens_with_pos_embed = embeddings[:, 1:] - pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( - batch_size, num_h_patches, num_w_patches, -1 - ) - num_w_patches_per_window = num_w_patches // self.config.num_windows - num_h_patches_per_window = num_h_patches // self.config.num_windows - num_windows = self.config.num_windows - windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( - batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 - ) - windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) - windowed_pixel_tokens = windowed_pixel_tokens.reshape( - batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1 - ) - windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) - embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) - - # add register tokens - embeddings = ( - torch.cat( - (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 - ) - if self.config.num_register_tokens > 0 - else embeddings - ) - - embeddings = self.dropout(embeddings) - - return embeddings - - -class RFDetrLayer(Dinov2WithRegistersLayer): 
- def __init__(self, config): - super(Dinov2WithRegistersLayer).__init__(config) - - self.num_windows = config.num_windows - - def forward( - self, - hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - run_full_attention: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - assert head_mask is None, "head_mask is not supported for windowed attention" - assert not output_attentions, "output_attentions is not supported for windowed attention" - shortcut = hidden_states - if run_full_attention: - # reshape x to remove windows - B, HW, C = hidden_states.shape - num_windows_squared = self.num_windows**2 - hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) - - self_attention_outputs = self.attention( - self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention - head_mask, - output_attentions=output_attentions, - ) - attention_output = self_attention_outputs[0] - - if run_full_attention: - # reshape x to add windows back - B, HW, C = hidden_states.shape - num_windows_squared = self.num_windows**2 - # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) - attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) - - attention_output = self.layer_scale1(attention_output) - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - # first residual connection - hidden_states = self.drop_path(attention_output) + shortcut - - # in Dinov2WithRegisters, layernorm is also applied after self-attention - layer_output = self.norm2(hidden_states) - layer_output = self.mlp(layer_output) - layer_output = self.layer_scale2(layer_output) - - # second residual connection - layer_output = self.drop_path(layer_output) + hidden_states - - outputs = (layer_output,) + outputs - - return outputs - - -class RFDetrEncoder(Dinov2WithRegistersEncoder): - def forward( - self, - hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if i > int(self.config.out_features[-1][5:]): # TODO check this - # early stop if we have reached the last output feature - break - - run_full_attention = i not in self.config.window_block_indexes - - layer_head_mask = head_mask[i] if head_mask is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.__call__, - hidden_states, - layer_head_mask, - output_attentions, - run_full_attention, - ) - else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, run_full_attention) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - 
attentions=all_self_attentions, - ) - - -class RFDetrBackbone(Dinov2WithRegistersBackbone): - def forward( - self, - pixel_values: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> BackboneOutput: - """ - Returns: - - Examples: - Returns: - - Examples: - - - ```python - >>> from transformers import AutoImageProcessor, AutoBackbone - >>> import torch - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") - >>> model = AutoBackbone.from_pretrained( - ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] - ... ) - - >>> inputs = processor(image, return_tensors="pt") - - >>> outputs = model(**inputs) - >>> feature_maps = outputs.feature_maps - >>> list(feature_maps[-1].shape) - [1, 768, 16, 16] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - - embedding_output = self.embeddings(pixel_values) - - outputs = self.encoder( - embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict - ) - - hidden_states = outputs.hidden_states if return_dict else outputs[1] - - feature_maps = () - for stage, hidden_state in zip(self.stage_names, hidden_states): - if stage in self.out_features: - if self.config.apply_layernorm: - hidden_state = self.layernorm(hidden_state) - if self.config.reshape_hidden_states: - hidden_state = hidden_state[:, self.num_register_tokens + 1 :] - # this was actually a bug in the original implementation that we copied here, - # cause normally the order is height, width - batch_size, _, height, width = pixel_values.shape - patch_size = self.config.patch_size - - num_h_patches = height // patch_size - num_w_patches = width // patch_size - - if self.config.num_windows > 1: - # undo windowing - num_windows_squared = self.config.num_windows**2 - B, HW, C = hidden_state.shape - num_h_patches_per_window = num_h_patches // self.config.num_windows - num_w_patches_per_window = num_w_patches // self.config.num_windows - hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) - hidden_state = hidden_state.view( - B // num_windows_squared, - self.config.num_windows, - self.config.num_windows, - num_h_patches_per_window, - num_w_patches_per_window, - C, - ) - hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) - - hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) - hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() - - feature_maps += (hidden_state,) - - if not return_dict: - if output_hidden_states: - output = (feature_maps,) + outputs[1:] - else: - output = (feature_maps,) + outputs[2:] - return output - - return BackboneOutput( - feature_maps=feature_maps, - hidden_states=outputs.hidden_states if output_hidden_states else None, - attentions=outputs.attentions if output_attentions else None, - ) - - -class RFDetrLayerNorm(VitDetLayerNorm): - pass - - -class ConvX(nn.Module): - """Conv-bn module""" - - def __init__(self, in_planes, 
out_planes, kernel=3, stride=1, groups=1, dilation=1, act="relu"): - super(ConvX, self).__init__() - self.conv = nn.Conv2d( - in_planes, - out_planes, - kernel_size=kernel, - stride=stride, - padding=kernel // 2, - groups=groups, - dilation=dilation, - bias=False, - ) - self.bn = nn.BatchNorm2d(out_planes) - self.act = ACT2FN[act] - - def forward(self, x): - """forward""" - out = self.act(self.bn(self.conv(x))) - return out - - -class Bottleneck(nn.Module): - """Standard bottleneck.""" - - def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, act="silu"): - """ch_in, ch_out, shortcut, groups, kernels, expand""" - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = ConvX(c1, c_, k[0], 1, act=act) - self.cv2 = ConvX(c_, c2, k[1], 1, groups=g, act=act) - self.add = shortcut and c1 == c2 - - def forward(self, x): - """'forward()' applies the YOLOv5 FPN to input data.""" - return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) - - -class RFDetrC2f(nn.Module): - """Faster Implementation of CSP Bottleneck with 2 convolutions.""" - - def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, act="silu"): - """ch_in, ch_out, number, shortcut, groups, expansion""" - super().__init__() - self.c = int(c2 * e) # hidden channels - self.cv1 = ConvX(c1, 2 * self.c, 1, 1, act=act) - self.cv2 = ConvX((2 + n) * self.c, c2, 1, act=act) # optional act=FReLU(c2) - self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=(3, 3), e=1.0, act=act) for _ in range(n)) - - def forward(self, x): - """Forward pass using split() instead of chunk().""" - y = list(self.cv1(x).split((self.c, self.c), 1)) - y.extend(m(y[-1]) for m in self.m) - return self.cv2(torch.cat(y, 1)) - - -class RFDetrMultiScaleProjector(nn.Module): - """ - This module implements MultiScaleProjector in :paper:`lwdetr`. - It creates pyramid features built on top of the input feature map. - """ - - def __init__( - self, - config: RFDetrConfig, - in_channels, - out_channels, - scale_factors, - num_blocks=3, - ): - """ - Args: - net (Backbone): module representing the subnetwork backbone. - Must be a subclass of :class:`Backbone`. - out_channels (int): number of channels in the output feature maps. - scale_factors (list[float]): list of scaling factors to upsample or downsample - the input features for creating pyramid features. 
- """ - super().__init__() - - self.scale_factors = config.scale_factors - in_channels = [config.backbone_config.hidden_size] * len(config.out_feature_indexes) - - stages_sampling = [] - stages = [] - - self.use_extra_pool = False - for scale in scale_factors: - stages_sampling.append([]) - for in_dim in in_channels: - layers = [] - - # if in_dim > 512: - # layers.append(ConvX(in_dim, in_dim // 2, kernel=1)) - # in_dim = in_dim // 2 - - if scale == 4.0: - layers.extend( - [ - nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), - RFDetrLayerNorm(in_dim // 2), - nn.GELU(), - nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), - ] - ) - elif scale == 2.0: - # a hack to reduce the FLOPs and Params when the dimention of output feature is too large - # if in_dim > 512: - # layers = [ - # ConvX(in_dim, in_dim // 2, kernel=1), - # nn.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2), - # ] - # out_dim = in_dim // 4 - # else: - layers.extend( - [ - nn.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2), - ] - ) - elif scale == 1.0: - pass - elif scale == 0.5: - layers.extend( - [ - ConvX(in_dim, in_dim, 3, 2, layer_norm=config.layer_norm), - ] - ) - elif scale == 0.25: - self.use_extra_pool = True - continue - else: - raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) - layers = nn.Sequential(*layers) - stages_sampling[-1].append(layers) - stages_sampling[-1] = nn.ModuleList(stages_sampling[-1]) - - in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) - layers = [ - RFDetrC2f(in_dim, out_channels, num_blocks, layer_norm=config.layer_norm), - RFDetrLayerNorm(out_channels), - ] - layers = nn.Sequential(*layers) - stages.append(layers) - - self.stages_sampling = nn.ModuleList(stages_sampling) - self.stages = nn.ModuleList(stages) - - def forward(self, x): - """ - Args: - x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. - Returns: - dict[str->Tensor]: - mapping from feature map name to pyramid feature map tensor - in high to low resolution order. Returned feature names follow the FPN - convention: "p", where stage has stride = 2 ** stage e.g., - ["p2", "p3", ..., "p6"]. 
- """ - results = [] - # x list of len(out_features_indexes) - for i, stage in enumerate(self.stages): - feat_fuse = [] - for j, stage_sampling in enumerate(self.stages_sampling[i]): - feat_fuse.append(stage_sampling(x[j])) - if len(feat_fuse) > 1: - feat_fuse = torch.cat(feat_fuse, dim=1) - else: - feat_fuse = feat_fuse[0] - results.append(stage(feat_fuse)) - if self.use_extra_pool: - results.append(F.max_pool2d(results[-1], kernel_size=1, stride=2, padding=0)) - return results - - -class RFDetrDecoderLayer(DeformableDetrDecoderLayer): - pass - - -class RFDetrDecoder(DeformableDetrDecoder): - pass - - -class RFDetrPreTrainedModel(DeformableDetrPreTrainedModel): - pass - - -class RFDetrDecoder(DeformableDetrDecoder): - pass - - -class RFDetrEncoder(DeformableDetrEncoder): - pass - - -class RFDetrModel(DeformableDetrModel): - pass - - -class RFDetrForObjectDetection(DeformableDetrForObjectDetection): - pass diff --git a/src/transformers/models/rf_detr/run_rfdetr.py b/src/transformers/models/rf_detr/run_rfdetr.py new file mode 100644 index 000000000000..0ec581d49343 --- /dev/null +++ b/src/transformers/models/rf_detr/run_rfdetr.py @@ -0,0 +1,18 @@ +import io + +import requests +from PIL import Image + +from transformers import AutoImageProcessor, RFDetrBackbone, RFDetrConfig + + +images = ["https://media.roboflow.com/notebooks/examples/dog-2.jpeg"] + +images = [Image.open(io.BytesIO(requests.get(url).content)) for url in images] + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") +inputs = processor(images, return_tensors="pt") + +config = RFDetrConfig() +backbone = RFDetrBackbone(config=config.backbone_config) +# model = RFDetrForObjectDetection.from_config() From e9aefc81d10113f51325a4546f5755e4299be042 Mon Sep 17 00:00:00 2001 From: steven Date: Wed, 26 Mar 2025 23:55:13 +0100 Subject: [PATCH 3/6] feat: implemented first draft RFDetr architecture --- .../models/auto/configuration_auto.py | 1 - .../models/rf_detr/configuration_rf_detr.py | 116 +-- ...iguration_rf_detr_dinov2_with_registers.py | 14 +- .../models/rf_detr/modeling_rf_detr.py | 939 ++++++++++-------- .../modeling_rf_detr_dinov2_with_registers.py | 213 ++-- .../modular_rf_detr_dinov2_with_registers.py | 292 ++++++ src/transformers/utils/dummy_pt_objects.py | 28 + 7 files changed, 984 insertions(+), 619 deletions(-) create mode 100644 src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 152f35e95ab5..27945df21f45 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -261,7 +261,6 @@ ("rembert", "RemBertConfig"), ("resnet", "ResNetConfig"), ("retribert", "RetriBertConfig"), - ("rf_detr_dinov2_with_registers", "RFDetrDinov2WithRegistersConfig"), ("rf_detr", "RFDetrConfig"), ("roberta", "RobertaConfig"), ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), diff --git a/src/transformers/models/rf_detr/configuration_rf_detr.py b/src/transformers/models/rf_detr/configuration_rf_detr.py index dc62390c7762..0b333b0d5beb 100644 --- a/src/transformers/models/rf_detr/configuration_rf_detr.py +++ b/src/transformers/models/rf_detr/configuration_rf_detr.py @@ -1,25 +1,10 @@ -# coding=utf-8 -# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Deformable DETR model configuration""" - -from typing import List +from typing import List, Optional from ...configuration_utils import PretrainedConfig from ...utils import logging from ...utils.backbone_utils import verify_backbone_config_arguments from ..auto import CONFIG_MAPPING +from .configuration_rf_detr_dinov2_with_registers import RFDetrDinov2WithRegistersConfig logger = logging.get_logger(__name__) @@ -28,20 +13,18 @@ class RFDetrConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`RFDetrModel`]. It is used to instantiate - a Deformable DETR model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the Deformable DETR + an RF DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the RF DETR [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + TODO: Add more details about the architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: - use_timm_backbone (`bool`, *optional*, defaults to `True`): - Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] - API. backbone_config (`PretrainedConfig` or `dict`, *optional*): - The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which - case it will default to `ResNetConfig()`. + The configuration of the backbone model. num_channels (`int`, *optional*, defaults to 3): The number of input channels. 
num_queries (`int`, *optional*, defaults to 300): @@ -135,9 +118,10 @@ class RFDetrConfig(PretrainedConfig): out_feature_indexes (`List`, *optional*, defaults to `[2, 5, 8, 11]`): scale_factors (`List`, *optional*, defaults to `[1.0]`): layer_norm (`bool`, *optional*, defaults to `False`): - rms_norm (`bool`, *optional*, defaults to `False`): - projector_out_channels (`int`, *optional*, defaults to 256): + projector_in_channels (`int`, *optional*, defaults to 256): projector_num_blocks (`int`, *optional*, defaults to 3): + projector_survival_prob (`float`, *optional*, defaults to 1.0): + projector_force_drop_last_n_features (`int`, *optional*, defaults to 0): Examples: @@ -162,32 +146,32 @@ class RFDetrConfig(PretrainedConfig): def __init__( self, - use_timm_backbone=True, + init_std=0.02, + init_xavier_std=1.0, + # backbone + use_timm_backbone=False, backbone_config=None, - num_channels=3, + backbone=None, + use_pretrained_backbone=False, + backbone_kwargs=None, + # RFDetrModel num_queries=300, - max_position_embeddings=1024, + # RFDetrEncoder encoder_layers=6, encoder_ffn_dim=1024, encoder_attention_heads=8, + encoder_layerdrop=0.0, + # RFDetrDecoder decoder_layers=6, decoder_ffn_dim=1024, decoder_attention_heads=8, - encoder_layerdrop=0.0, - is_encoder_decoder=True, activation_function="relu", d_model=256, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, - return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", - backbone="resnet50", - use_pretrained_backbone=True, - backbone_kwargs=None, dilation=False, num_feature_levels=4, encoder_n_points=4, @@ -208,28 +192,28 @@ def __init__( out_feature_indexes: List[int] = [2, 5, 8, 11], scale_factors: List[float] = [1.0], layer_norm: bool = False, - rms_norm: bool = False, - projector_out_channels: int = 256, + projector_in_channels: Optional[List[int]] = None, projector_num_blocks: int = 3, # TODO rename + projector_survival_prob: float = 1.0, + projector_force_drop_last_n_features: int = 0, + projector_activation_function: str = "silu", + hidden_expansion: float = 0.5, + batch_norm_eps: float = 1e-5, + is_encoder_decoder=True, **kwargs, ): - # We default to values which were previously hard-coded in the model. This enables configurability of the config - # while keeping the default behavior the same. - if use_timm_backbone and backbone_kwargs is None: - backbone_kwargs = {} - if dilation: - backbone_kwargs["output_stride"] = 16 - backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4] - backbone_kwargs["in_chans"] = num_channels - # Backwards compatibility - elif not use_timm_backbone and backbone in (None, "resnet50"): - if backbone_config is None: - logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") - backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) - elif isinstance(backbone_config, dict): - backbone_model_type = backbone_config.get("model_type") - config_class = CONFIG_MAPPING[backbone_model_type] - backbone_config = config_class.from_dict(backbone_config) + if backbone_config is None and backbone is None: + logger.info( + "`backbone_config` and `backbone` are `None`. Initializing the config with the default `RTDetr-ResNet` backbone." 
+ ) + backbone_config = RFDetrDinov2WithRegistersConfig( + out_features=[f"stage{i}" for i in out_feature_indexes], + return_dict=False, + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) verify_backbone_config_arguments( use_timm_backbone=use_timm_backbone, @@ -241,9 +225,7 @@ def __init__( self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config - self.num_channels = num_channels self.num_queries = num_queries - self.max_position_embeddings = max_position_embeddings self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim self.encoder_layers = encoder_layers @@ -288,15 +270,27 @@ def __init__( self.scale_factors = [1.0] if scale_factors is None else scale_factors assert len(self.scale_factors) > 0, "scale_factors must be a list of at least one element" - assert sorted(self.scale_factors) == self.scale_factors, "scale_factors must be sorted" + assert sorted(self.scale_factors, reverse=True) == self.scale_factors, "scale_factors must be reverse sorted" assert all(scale in [2.0, 1.0, 0.5, 0.25] for scale in self.scale_factors), ( "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]" ) self.layer_norm = layer_norm - self.rms_norm = rms_norm - self.projector_out_channels = projector_out_channels + self.projector_in_channels = ( + projector_in_channels + if projector_in_channels is not None + else [backbone_config.hidden_size] * len(out_feature_indexes) + ) + assert len(self.projector_in_channels) == len(out_feature_indexes), ( + "projector_in_channels must have the same length as out_feature_indexes" + ) self.projector_num_blocks = projector_num_blocks + self.projector_survival_prob = projector_survival_prob + self.projector_force_drop_last_n_features = projector_force_drop_last_n_features + self.projector_activation_function = projector_activation_function + self.hidden_expansion = hidden_expansion + self.batch_norm_eps = batch_norm_eps + self.encoder_hidden_dim = backbone_config.hidden_size super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py index 6af39627f87b..b43b8ec4dc04 100644 --- a/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py +++ b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py @@ -1,10 +1,15 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 from ...configuration_utils import PretrainedConfig from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices class RFDetrDinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig): r""" - TODO This is the configuration class to store the configuration of a [`RFDetrDinov2WithRegistersModel`]. 
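The `scale_factors` validation in `RFDetrConfig.__init__` above only accepts a non-empty, reverse-sorted subset of `[2.0, 1.0, 0.5, 0.25]`. A standalone re-statement of those checks (plain Python, illustrative only, not the config class itself):

```python
ALLOWED_SCALES = [2.0, 1.0, 0.5, 0.25]


def validate_scale_factors(scale_factors):
    # Mirrors the asserts in RFDetrConfig.__init__ (standalone sketch).
    assert len(scale_factors) > 0, "scale_factors must be a list of at least one element"
    assert sorted(scale_factors, reverse=True) == scale_factors, "scale_factors must be reverse sorted"
    assert all(scale in ALLOWED_SCALES for scale in scale_factors), (
        "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]"
    )


validate_scale_factors([2.0, 1.0, 0.5])  # ok: reverse sorted, all allowed
# validate_scale_factors([0.5, 1.0])     # would fail: not reverse sorted
```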
It is used to instantiate an RFDetrDinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the DINOv2 with Registers @@ -137,9 +142,10 @@ def __init__( self.reshape_hidden_states = reshape_hidden_states self.num_windows = num_windows - self.window_block_indexes = ( - list(range(self.num_hidden_layers)) if window_block_indexes is None else window_block_indexes - ) + window_block_indexes = set(range(self._out_indices[-1] + 1)) + window_block_indexes.difference_update(self._out_indices) + window_block_indexes = list(window_block_indexes) + self.window_block_indexes = window_block_indexes __all__ = ["RFDetrDinov2WithRegistersConfig"] diff --git a/src/transformers/models/rf_detr/modeling_rf_detr.py b/src/transformers/models/rf_detr/modeling_rf_detr.py index 88ebbc663a8d..98108a743d07 100644 --- a/src/transformers/models/rf_detr/modeling_rf_detr.py +++ b/src/transformers/models/rf_detr/modeling_rf_detr.py @@ -1,18 +1,16 @@ import copy import math -import os import warnings from dataclasses import dataclass -from pathlib import Path from typing import Dict, List, Optional, Tuple, Union +import numpy as np import torch import torch.nn.functional as F from torch import Tensor, nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from ...activations import ACT2FN +from ...activations import ACT2CLS, ACT2FN +from ...integrations.hub_kernels import use_kernel_forward_from_hub from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel @@ -21,13 +19,9 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - is_ninja_available, is_timm_available, - is_torch_cuda_available, - is_torchdynamo_compiling, logging, replace_return_docstrings, - requires_backends, ) from ...utils.backbone_utils import load_backbone from .configuration_rf_detr import RFDetrConfig @@ -36,91 +30,67 @@ logger = logging.get_logger(__name__) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.load_cuda_kernels -def load_cuda_kernels(): - from torch.utils.cpp_extension import load - - global MultiScaleDeformableAttention - - root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deformable_detr" - src_files = [ - root / filename - for filename in [ - "vision.cpp", - os.path.join("cpu", "ms_deform_attn_cpu.cpp"), - os.path.join("cuda", "ms_deform_attn_cuda.cu"), - ] - ] - - MultiScaleDeformableAttention = load( - "MultiScaleDeformableAttention", - src_files, - with_cuda=True, - extra_include_paths=[str(root)], - extra_cflags=["-DWITH_CUDA=1"], - extra_cuda_cflags=[ - "-DCUDA_HAS_FP16=1", - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - ], - ) - - if is_timm_available(): - from timm import create_model + pass _CONFIG_FOR_DOC = "RFDetrConfig" _CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" # TODO -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod +@use_kernel_forward_from_hub("MultiScaleDeformableAttention") +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttention +class MultiScaleDeformableAttention(nn.Module): def forward( - context, - value, - 
value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - im2col_step, + self, + value: Tensor, + value_spatial_shapes: Tensor, + value_spatial_shapes_list: List[Tuple], + level_start_index: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + im2col_step: int, ): - context.im2col_step = im2col_step - output = MultiScaleDeformableAttention.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes_list): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id] + .flatten(2) + .transpose(1, 2) + .reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + return output.transpose(1, 2).contiguous() @dataclass @@ -331,14 +301,27 @@ def __init__(self, n): self.register_buffer("running_var", torch.ones(n)) def _load_from_state_dict( - self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, ): num_batches_tracked_key = prefix + "num_batches_tracked" if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] super()._load_from_state_dict( - state_dict, prefix, 
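The pure-PyTorch `MultiScaleDeformableAttention` forward above replaces the custom CUDA kernel with `nn.functional.grid_sample` over each value level. A minimal sketch of that sampling step, using illustrative shapes rather than values taken from the model:

```python
import torch
import torch.nn.functional as F

# One feature level of shape (batch*heads, head_dim, H, W) and sampling locations
# normalized to [-1, 1], shaped (batch*heads, num_queries, num_points, 2).
batch_heads, head_dim, height, width = 2, 32, 8, 8
num_queries, num_points = 5, 4

value_level = torch.randn(batch_heads, head_dim, height, width)
sampling_grid = torch.rand(batch_heads, num_queries, num_points, 2) * 2 - 1

# Bilinear sampling at the fractional locations, as in the module's forward.
sampled = F.grid_sample(
    value_level, sampling_grid, mode="bilinear", padding_mode="zeros", align_corners=False
)
print(sampled.shape)  # torch.Size([2, 32, 5, 4]) -> (batch*heads, head_dim, num_queries, num_points)
```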
local_metadata, strict, missing_keys, unexpected_keys, error_msgs + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, ) def forward(self, x): @@ -379,7 +362,184 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->RFDetr +class RFDetrConvNormLayer(nn.Module): + def __init__( + self, + config: RFDetrConfig, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int = None, + activation: str = None, + ): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2 if padding is None else padding, + bias=False, + ) + self.norm = ( + nn.LayerNorm(out_channels) if config.layernorm else nn.BatchNorm2d(out_channels, config.batch_norm_eps) + ) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_state): + hidden_state = self.conv(hidden_state) + hidden_state = self.norm(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +# Copied from transformers.models.rt_detr.modeling_rt_detr.RTDetrRepVggBlock with RTDetr->RFDetr, activation_function->projector_activation_function +class RFDetrRepVggBlock(nn.Module): + """ + RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again". + """ + + def __init__(self, config: RFDetrConfig): + super().__init__() + + activation = config.projector_activation_function + hidden_channels = int(config.encoder_hidden_dim * config.hidden_expansion) + self.conv1 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1, padding=1) + self.conv2 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 1, 1, padding=0) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, x): + y = self.conv1(x) + self.conv2(x) + return self.activation(y) + + +class RFDetrCSPRepLayer(nn.Module): + """ + Cross Stage Partial (CSP) network layer with RepVGG blocks. 
+ """ + + def __init__(self, config: RFDetrConfig, in_channels): + super().__init__() + + out_channels = config.d_model + num_blocks = 3 + activation = config.projector_activation_function + + hidden_channels = int(out_channels * config.hidden_expansion) + self.conv1 = RFDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.conv2 = RFDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.bottlenecks = nn.Sequential(*[RFDetrRepVggBlock(config) for _ in range(num_blocks)]) + if hidden_channels != out_channels: + self.conv3 = RFDetrConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation) + else: + self.conv3 = nn.Identity() + + def forward(self, hidden_state): + hidden_state_1 = self.conv1(hidden_state) + hidden_state_1 = self.bottlenecks(hidden_state_1) + hidden_state_2 = self.conv2(hidden_state) + return self.conv3(hidden_state_1 + hidden_state_2) + + +class RFDetrScaleProjectorLayer(nn.Module): + def __init__(self, config: RFDetrConfig, scale: float, in_channels: int): + super().__init__() + self.use_extra_pool = False + layers = [] + if scale == 2.0: + layers.append(nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)) + elif scale == 1.0: + pass + elif scale == 0.5: + layers.extend(RFDetrConvNormLayer(in_channels, in_channels, 3, 2)) + else: + raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) + self.layers = nn.Sequential(*layers) + + def forward(self, hidden_state): + return self.layers(hidden_state) + + +class RFDetrScaleProjector(nn.Module): + def __init__(self, config: RFDetrConfig, scale: float, in_channels: List[int]): + super().__init__() + + self.sampling_layers = nn.ModuleList( + [RFDetrScaleProjectorLayer(config, scale, in_channel) for in_channel in in_channels] + ) + + in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) + self.stage_layers = nn.ModuleList( + [ + RFDetrCSPRepLayer(config, in_dim), + nn.LayerNorm(config.d_model), + ] + ) + + def forward(self, hidden_states): + features = [layer(hidden_state) for layer, hidden_state in zip(self.layers, hidden_states)] + features = torch.cat(features, dim=1) + output = self.stage_layers(features) + return output + + +class RFDetrMultiScaleProjector(nn.Module): + """ + This module implements MultiScaleProjector in :paper:`lwdetr`. + It creates pyramid features built on top of the input feature map. + """ + + def __init__( + self, + config: RFDetrConfig, + ): + """ + Args: + net (Backbone): module representing the subnetwork backbone. + Must be a subclass of :class:`Backbone`. + out_channels (int): number of channels in the output feature maps. + scale_factors (list[float]): list of scaling factors to upsample or downsample + the input features for creating pyramid features. + """ + super().__init__() + in_channels = config.projector_in_channels + self.survival_prob = config.projector_survival_prob + self.force_drop_last_n_features = config.projector_force_drop_last_n_features + scale_factors = config.scale_factors + + self.scale_layers = nn.ModuleList( + [RFDetrScaleProjector(config, scale, in_channels) for scale in scale_factors] + ) + + def forward(self, features): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: + mapping from feature map name to pyramid feature map tensor + in high to low resolution order. 
Returned feature names follow the FPN + convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. + """ + num_features = len(features) + if self.survival_prob < 1.0 and self.training: + final_drop_prob = 1 - self.survival_prob + drop_p = np.random.uniform() + for i in range(1, num_features): + critical_drop_prob = i * (final_drop_prob / (num_features - 1)) + if drop_p < critical_drop_prob: + features[i][:] = 0 + elif self.force_drop_last_n_features > 0: + for i in range(self.force_drop_last_n_features): + # don't do it inplace to ensure the compiler can optimize out the backbone layers + features[-(i + 1)] = torch.zeros_like(features[-(i + 1)]) + + outputs = [layer(x) for layer, x in zip(self.scale_layers, features)] + return outputs + + class RFDetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. @@ -393,27 +553,7 @@ def __init__(self, config): self.config = config - # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API - if config.use_timm_backbone: - # We default to values which were previously hard-coded. This enables configurability from the config - # using backbone arguments, while keeping the default behavior the same. - requires_backends(self, ["timm"]) - kwargs = getattr(config, "backbone_kwargs", {}) - kwargs = {} if kwargs is None else kwargs.copy() - out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) - num_channels = kwargs.pop("in_chans", config.num_channels) - if config.dilation: - kwargs["output_stride"] = kwargs.get("output_stride", 16) - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=out_indices, - in_chans=num_channels, - **kwargs, - ) - else: - backbone = load_backbone(config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): @@ -440,11 +580,14 @@ def __init__(self, config): if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: parameter.requires_grad_(False) - # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->RFDetr + self.projector = RFDetrMultiScaleProjector(config) + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.projector(features) + out = [] for feature_map in features: # downsample pixel_mask to match shape of corresponding feature_map @@ -531,7 +674,13 @@ def forward(self, pixel_values, pixel_mask=None): height_values = torch.arange(height, device=pixel_values.device) x_emb = self.column_embeddings(width_values) y_emb = self.row_embeddings(height_values) - pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = torch.cat( + [ + x_emb.unsqueeze(0).repeat(height, 1, 1), + y_emb.unsqueeze(1).repeat(1, width, 1), + ], + dim=-1, + ) pos = pos.permute(2, 0, 1) pos = pos.unsqueeze(0) pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) @@ -552,49 +701,6 @@ def build_position_encoding(config): return position_embedding -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention -def multi_scale_deformable_attention( - value: Tensor, - value_spatial_shapes: Union[Tensor, 
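During training, `RFDetrMultiScaleProjector.forward` above zeroes deeper backbone features with a probability that grows linearly with the feature index whenever `projector_survival_prob < 1.0`. A small standalone sketch of that schedule (illustrative numbers):

```python
import numpy as np

# Illustrative values; in the model they come from RFDetrConfig.
survival_prob = 0.75
num_features = 4  # e.g. len(out_feature_indexes)

final_drop_prob = 1 - survival_prob
drop_p = np.random.uniform()

# Mirrors the training-time branch: feature i (i >= 1) is zeroed when
# drop_p < i * final_drop_prob / (num_features - 1), so deeper features
# are dropped with linearly increasing probability, up to final_drop_prob.
dropped = [
    i
    for i in range(1, num_features)
    if drop_p < i * (final_drop_prob / (num_features - 1))
]
print(dropped)
```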
List[Tuple]], - sampling_locations: Tensor, - attention_weights: Tensor, -) -> Tensor: - batch_size, _, num_heads, hidden_dim = value.shape - _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for level_id, (height, width) in enumerate(value_spatial_shapes): - # batch_size, height*width, num_heads, hidden_dim - # -> batch_size, height*width, num_heads*hidden_dim - # -> batch_size, num_heads*hidden_dim, height*width - # -> batch_size*num_heads, hidden_dim, height, width - value_l_ = ( - value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) - ) - # batch_size, num_queries, num_heads, num_points, 2 - # -> batch_size, num_heads, num_queries, num_points, 2 - # -> batch_size*num_heads, num_queries, num_points, 2 - sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) - # batch_size*num_heads, hidden_dim, num_queries, num_points - sampling_value_l_ = nn.functional.grid_sample( - value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False - ) - sampling_value_list.append(sampling_value_l_) - # (batch_size, num_queries, num_heads, num_levels, num_points) - # -> (batch_size, num_heads, num_queries, num_levels, num_points) - # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) - attention_weights = attention_weights.transpose(1, 2).reshape( - batch_size * num_heads, 1, num_queries, num_levels * num_points - ) - output = ( - (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) - .sum(-1) - .view(batch_size, num_heads * hidden_dim, num_queries) - ) - return output.transpose(1, 2).contiguous() - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->RFDetr class RFDetrMultiscaleDeformableAttention(nn.Module): """ @@ -604,12 +710,7 @@ class RFDetrMultiscaleDeformableAttention(nn.Module): def __init__(self, config: RFDetrConfig, num_heads: int, n_points: int): super().__init__() - kernel_loaded = MultiScaleDeformableAttention is not None - if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded: - try: - load_cuda_kernels() - except Exception as e: - logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") + self.attn = MultiScaleDeformableAttention() if config.d_model % num_heads != 0: raise ValueError( @@ -696,27 +797,16 @@ def forward( else: raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - if self.disable_custom_kernels or MultiScaleDeformableAttention is None or is_torchdynamo_compiling(): - # PyTorch implementation - output = multi_scale_deformable_attention( - value, spatial_shapes_list, sampling_locations, attention_weights - ) - else: - try: - # custom kernel - output = MultiScaleDeformableAttentionFunction.apply( - value, - spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - self.im2col_step, - ) - except Exception: - # PyTorch implementation - output = multi_scale_deformable_attention( - value, spatial_shapes_list, sampling_locations, attention_weights - ) + output = self.attn( + value, + spatial_shapes, + spatial_shapes_list, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + output = 
self.output_proj(output) return output, attention_weights @@ -825,7 +915,11 @@ def forward( attn_output = torch.bmm(attn_probs, value_states) - if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + if attn_output.size() != ( + batch_size * self.num_heads, + target_len, + self.head_dim, + ): raise ValueError( f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" f" {attn_output.size()}" @@ -846,7 +940,9 @@ def __init__(self, config: RFDetrConfig): super().__init__() self.embed_dim = config.d_model self.self_attn = RFDetrMultiscaleDeformableAttention( - config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + config, + num_heads=config.encoder_attention_heads, + n_points=config.encoder_n_points, ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout @@ -1048,7 +1144,11 @@ class RFDetrPreTrainedModel(PreTrainedModel): base_model_prefix = "model" main_input_name = "pixel_values" supports_gradient_checkpointing = True - _no_split_modules = [r"RFDetrConvEncoder", r"RFDetrEncoderLayer", r"RFDetrDecoderLayer"] + _no_split_modules = [ + r"RFDetrConvEncoder", + r"RFDetrEncoderLayer", + r"RFDetrDecoderLayer", + ] def _init_weights(self, module): std = self.config.init_std @@ -1294,7 +1394,9 @@ def forward( if not return_dict: return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, ) @@ -1497,7 +1599,7 @@ def forward( """, RFDETR_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMLPPredictionHead with DeformableDetr->RFDetr, DEFORMABLE_DETR->RFDETR +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->RFDetr, DEFORMABLE_DETR->RFDETR class RFDetrModel(RFDetrPreTrainedModel): def __init__(self, config: RFDetrConfig): super().__init__(config) @@ -1522,7 +1624,13 @@ def __init__(self, config: RFDetrConfig): for _ in range(config.num_feature_levels - num_backbone_outs): input_proj_list.append( nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.Conv2d( + in_channels, + config.d_model, + kernel_size=3, + stride=2, + padding=1, + ), nn.GroupNorm(32, config.d_model), ) ) @@ -1532,7 +1640,11 @@ def __init__(self, config: RFDetrConfig): self.input_proj = nn.ModuleList( [ nn.Sequential( - nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.Conv2d( + backbone.intermediate_channel_sizes[-1], + config.d_model, + kernel_size=1, + ), nn.GroupNorm(32, config.d_model), ) ] @@ -1622,8 +1734,20 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) grid_y, grid_x = meshgrid( - torch.linspace(0, height - 1, height, dtype=enc_output.dtype, device=enc_output.device), - torch.linspace(0, width - 1, width, dtype=enc_output.dtype, device=enc_output.device), + torch.linspace( + 0, + height - 1, + height, + dtype=enc_output.dtype, + device=enc_output.device, + ), + torch.linspace( + 0, + width - 1, + width, + dtype=enc_output.dtype, + device=enc_output.device, + ), indexing="ij", ) grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) @@ -1799,7 +1923,9 @@ 
def forward( topk = self.config.two_stage_num_proposals topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] topk_coords_logits = torch.gather( - enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + enc_outputs_coord_logits, + 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4), ) topk_coords_logits = topk_coords_logits.detach() @@ -1851,218 +1977,227 @@ def forward( ) -# # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMLPPredictionHead with DeformableDetr->RFDetr -# class RFDetrMLPPredictionHead(nn.Module): -# """ -# Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, -# height and width of a bounding box w.r.t. an image. - -# Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py - -# """ - -# def __init__(self, input_dim, hidden_dim, output_dim, num_layers): -# super().__init__() -# self.num_layers = num_layers -# h = [hidden_dim] * (num_layers - 1) -# self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - -# def forward(self, x): -# for i, layer in enumerate(self.layers): -# x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) -# return x - - -# @add_start_docstrings( -# """ -# RF DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on -# top, for tasks such as COCO detection. -# """, -# RFDETR_START_DOCSTRING, -# ) -# # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with DeformableDetr->RFDetr -# class RFDetrForObjectDetection(RFDetrPreTrainedModel): -# # When using clones, all layers > 0 will be clones, but layer 0 *is* required -# _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] -# # We can't initialize the model on meta device as some weights are modified during the initialization -# _no_split_modules = None - -# def __init__(self, config: RFDetrConfig): -# super().__init__(config) - -# # Deformable DETR encoder-decoder model -# self.model = RFDetrModel(config) -# # Detection heads on top -# self.class_embed = nn.Linear(config.d_model, config.num_labels) -# self.bbox_embed = RFDetrMLPPredictionHead( -# input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 -# ) - -# prior_prob = 0.01 -# bias_value = -math.log((1 - prior_prob) / prior_prob) -# self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value -# nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) -# nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) - -# # if two-stage, the last class_embed and bbox_embed is for region proposal generation -# num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers -# if config.with_box_refine: -# self.class_embed = _get_clones(self.class_embed, num_pred) -# self.bbox_embed = _get_clones(self.bbox_embed, num_pred) -# nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) -# # hack implementation for iterative bounding box refinement -# self.model.decoder.bbox_embed = self.bbox_embed -# else: -# nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) -# self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) -# self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) -# self.model.decoder.bbox_embed = None -# if config.two_stage: -# # hack implementation for two-stage -# 
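The two-stage branch in `RFDetrModel.forward` above keeps only the `two_stage_num_proposals` highest-scoring encoder proposals and gathers their box logits. A toy example of the same `topk`/`gather` pattern with made-up shapes:

```python
import torch

# Illustrative shapes; in the model these come from the encoder outputs.
batch_size, num_proposals, num_labels, topk = 2, 10, 3, 4
enc_outputs_class = torch.randn(batch_size, num_proposals, num_labels)
enc_outputs_coord_logits = torch.randn(batch_size, num_proposals, 4)

# Pick the top-k proposals by the first class logit and gather their 4-d box logits.
topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
topk_coords_logits = torch.gather(
    enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
)
print(topk_coords_logits.shape)  # torch.Size([2, 4, 4])
```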
self.model.decoder.class_embed = self.class_embed -# for box_embed in self.bbox_embed: -# nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) - -# # Initialize weights and apply final processing -# self.post_init() - -# @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) -# @replace_return_docstrings(output_type=RFDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) -# def forward( -# self, -# pixel_values: torch.FloatTensor, -# pixel_mask: Optional[torch.LongTensor] = None, -# decoder_attention_mask: Optional[torch.FloatTensor] = None, -# encoder_outputs: Optional[torch.FloatTensor] = None, -# inputs_embeds: Optional[torch.FloatTensor] = None, -# decoder_inputs_embeds: Optional[torch.FloatTensor] = None, -# labels: Optional[List[dict]] = None, -# output_attentions: Optional[bool] = None, -# output_hidden_states: Optional[bool] = None, -# return_dict: Optional[bool] = None, -# ) -> Union[Tuple[torch.FloatTensor], RFDetrObjectDetectionOutput]: -# r""" -# labels (`List[Dict]` of len `(batch_size,)`, *optional*): -# Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the -# following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch -# respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes -# in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. - -# Returns: - -# Examples: - -# ```python -# >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection -# >>> from PIL import Image -# >>> import requests - -# >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" -# >>> image = Image.open(requests.get(url, stream=True).raw) - -# >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") -# >>> model = RFDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") - -# >>> inputs = image_processor(images=image, return_tensors="pt") -# >>> outputs = model(**inputs) - -# >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) -# >>> target_sizes = torch.tensor([image.size[::-1]]) -# >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ -# ... 0 -# ... ] -# >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): -# ... box = [round(i, 2) for i in box.tolist()] -# ... print( -# ... f"Detected {model.config.id2label[label.item()]} with confidence " -# ... f"{round(score.item(), 3)} at location {box}" -# ... 
) -# Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] -# Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] -# Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] -# ```""" -# return_dict = return_dict if return_dict is not None else self.config.use_return_dict - -# # First, sent images through DETR base model to obtain encoder + decoder outputs -# outputs = self.model( -# pixel_values, -# pixel_mask=pixel_mask, -# decoder_attention_mask=decoder_attention_mask, -# encoder_outputs=encoder_outputs, -# inputs_embeds=inputs_embeds, -# decoder_inputs_embeds=decoder_inputs_embeds, -# output_attentions=output_attentions, -# output_hidden_states=output_hidden_states, -# return_dict=return_dict, -# ) - -# hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] -# init_reference = outputs.init_reference_points if return_dict else outputs[0] -# inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] - -# # class logits + predicted bounding boxes -# outputs_classes = [] -# outputs_coords = [] - -# for level in range(hidden_states.shape[1]): -# if level == 0: -# reference = init_reference -# else: -# reference = inter_references[:, level - 1] -# reference = inverse_sigmoid(reference) -# outputs_class = self.class_embed[level](hidden_states[:, level]) -# delta_bbox = self.bbox_embed[level](hidden_states[:, level]) -# if reference.shape[-1] == 4: -# outputs_coord_logits = delta_bbox + reference -# elif reference.shape[-1] == 2: -# delta_bbox[..., :2] += reference -# outputs_coord_logits = delta_bbox -# else: -# raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") -# outputs_coord = outputs_coord_logits.sigmoid() -# outputs_classes.append(outputs_class) -# outputs_coords.append(outputs_coord) -# outputs_class = torch.stack(outputs_classes) -# outputs_coord = torch.stack(outputs_coords) - -# logits = outputs_class[-1] -# pred_boxes = outputs_coord[-1] - -# loss, loss_dict, auxiliary_outputs = None, None, None -# if labels is not None: -# loss, loss_dict, auxiliary_outputs = self.loss_function( -# logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord -# ) -# if not return_dict: -# if auxiliary_outputs is not None: -# output = (logits, pred_boxes) + auxiliary_outputs + outputs -# else: -# output = (logits, pred_boxes) + outputs -# tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output - -# return tuple_outputs - -# dict_outputs = RFDetrObjectDetectionOutput( -# loss=loss, -# loss_dict=loss_dict, -# logits=logits, -# pred_boxes=pred_boxes, -# auxiliary_outputs=auxiliary_outputs, -# last_hidden_state=outputs.last_hidden_state, -# decoder_hidden_states=outputs.decoder_hidden_states, -# decoder_attentions=outputs.decoder_attentions, -# cross_attentions=outputs.cross_attentions, -# encoder_last_hidden_state=outputs.encoder_last_hidden_state, -# encoder_hidden_states=outputs.encoder_hidden_states, -# encoder_attentions=outputs.encoder_attentions, -# intermediate_hidden_states=outputs.intermediate_hidden_states, -# intermediate_reference_points=outputs.intermediate_reference_points, -# init_reference_points=outputs.init_reference_points, -# enc_outputs_class=outputs.enc_outputs_class, -# enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, -# ) - -# return dict_outputs +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMLPPredictionHead with 
DeformableDetr->RFDetr +class RFDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@add_start_docstrings( + """ + RF DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + RFDETR_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with Deformable->RF, DEFORMABLE_DETR->RFDETR +class RFDetrForObjectDetection(RFDetrPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: RFDetrConfig): + super().__init__(config) + + # RF DETR encoder-decoder model + self.model = RFDetrModel(config) + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = RFDetrMLPPredictionHead( + input_dim=config.d_model, + hidden_dim=config.d_model, + output_dim=4, + num_layers=3, + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + for box_embed in self.bbox_embed: + nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: 
Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[List[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], RFDetrObjectDetectionOutput]: + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = RFDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... 
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + outputs_class, + outputs_coord, + ) + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = RFDetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs __all__ = [ diff --git a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py index 
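The refinement loop in `RFDetrForObjectDetection.forward` above adds each level's box delta to the logit-space reference point before squashing back through a sigmoid. A tiny numeric sketch, using a local stand-in for the `inverse_sigmoid` helper and made-up values:

```python
import torch


def inverse_sigmoid(x, eps=1e-5):
    # Local stand-in for the helper used by the modeling code: a clamped logit.
    x = x.clamp(min=eps, max=1 - eps)
    return torch.log(x / (1 - x))


# A 2-d reference point in [0, 1] and a predicted 4-d box delta (illustrative values).
reference = torch.tensor([[0.40, 0.60]])
delta_bbox = torch.tensor([[0.10, -0.20, 0.05, 0.30]])

# As in the 2-d reference branch: add the logit-space reference to the first two
# coordinates of the delta, then map everything back to [0, 1] with a sigmoid.
delta_bbox[..., :2] += inverse_sigmoid(reference)
pred_box = delta_bbox.sigmoid()
print(pred_box)  # normalized (cx, cy, w, h)
```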
abe0b378e4b7..ad1ec42236be 100644 --- a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py +++ b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py @@ -1,15 +1,20 @@ -import collections -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +from typing import Callable, Optional, Set, Tuple, Union import torch from torch import nn from ...activations import ACT2FN -from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_outputs import BackboneOutput, BaseModelOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( - add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging, @@ -17,20 +22,15 @@ torch_int, ) from ...utils.backbone_utils import BackboneMixin -from .configuration_rf_detr_dinov2_with_registers import ( - RFDetrDinov2WithRegistersConfig, -) +from .configuration_rf_detr_dinov2_with_registers import RFDetrDinov2WithRegistersConfig logger = logging.get_logger(__name__) -# Base docstring -_CHECKPOINT_FOR_DOC = "facebook/dinov2_with_registers-base" # TODO - +# General docstring _CONFIG_FOR_DOC = "RFDetrDinov2WithRegistersConfig" -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersPatchEmbeddings with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersPatchEmbeddings(nn.Module): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial @@ -189,7 +189,6 @@ def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Te return embeddings -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.eager_attention_forward def eager_attention_forward( module: nn.Module, query: torch.Tensor, @@ -220,7 +219,6 @@ def eager_attention_forward( return attn_output, attn_weights -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersSelfAttention with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersSelfAttention(nn.Module): def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: super().__init__() @@ -283,7 +281,6 @@ def forward( return outputs -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersSelfOutput with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersSelfOutput(nn.Module): """ The residual connection is defined in RFDetrDinov2WithRegistersLayer instead of here (as is the case with other models), due to the @@ -302,7 +299,6 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersAttention with 
Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersAttention(nn.Module): def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: super().__init__() @@ -342,7 +338,6 @@ def forward( return outputs -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersLayerScale with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersLayerScale(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -352,7 +347,6 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state * self.lambda1 -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -373,7 +367,6 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals return output -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersDropPath with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" @@ -388,7 +381,6 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersMLP with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersMLP(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -408,7 +400,6 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersSwiGLUFFN with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersSwiGLUFFN(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -446,18 +437,19 @@ def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: else: self.mlp = RFDetrDinov2WithRegistersMLP(config) self.layer_scale2 = RFDetrDinov2WithRegistersLayerScale(config) + self.num_windows = config.num_windows def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - run_full_attention: bool = False, + remove_windows: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: assert head_mask is None, "head_mask is not supported for windowed attention" assert not output_attentions, "output_attentions is not supported for windowed attention" shortcut = hidden_states - if run_full_attention: + if remove_windows: # reshape x to remove windows B, HW, C = hidden_states.shape num_windows_squared = self.num_windows**2 @@ -470,7 +462,7 @@ def forward( ) attention_output = self_attention_outputs[0] - if run_full_attention: + if remove_windows: # reshape x to add windows back B, HW, C = hidden_states.shape num_windows_squared = self.num_windows**2 @@ -496,7 +488,6 @@ def forward( return outputs -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersEncoder with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersEncoder(nn.Module): def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: super().__init__() @@ -519,6 +510,12 @@ def forward( 
if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + if i > int(self.config.out_features[-1][5:]): + # early stop if we have reached the last output feature + break + + remove_windows = i not in self.config.window_block_indexes + layer_head_mask = head_mask[i] if head_mask is not None else None if self.gradient_checkpointing and self.training: @@ -527,9 +524,10 @@ def forward( hidden_states, layer_head_mask, output_attentions, + remove_windows, ) else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, remove_windows) hidden_states = layer_outputs[0] @@ -548,7 +546,6 @@ def forward( ) -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersPreTrainedModel with Dinov2WithRegisters->RFDetrDinov2WithRegisters class RFDetrDinov2WithRegistersPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -556,7 +553,7 @@ class RFDetrDinov2WithRegistersPreTrainedModel(PreTrainedModel): """ config_class = RFDetrDinov2WithRegistersConfig - base_model_prefix = "dinov2_with_registers" + base_model_prefix = "rf_detr_dinov2_with_registers" main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["RFDetrDinov2WithRegistersSwiGLUFFN"] @@ -590,134 +587,18 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No ).to(module.cls_token.dtype) -_EXPECTED_OUTPUT_SHAPE = [1, 257, 768] - -RFDETR_DINOV2_WITH_REGISTERS_START_DOCSTRING = r""" +RF_DETR_DINOV2_WITH_REGISTERS_START_DOCSTRING = r""" This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: - config ([`Dinov2WithRegistersConfig`]): Model configuration class with all the parameters of the model. + config ([`RFDetrDinov2WithRegistersConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -RFDETR_DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`BitImageProcessor.preprocess`] for details. - - bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): - Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for - pre-training. - - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. 
- return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare RFDetrDinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top.", - RFDETR_DINOV2_WITH_REGISTERS_START_DOCSTRING, -) -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersModel with Dinov2WithRegisters->RFDetrDinov2WithRegisters, DINOV2_WITH_REGISTERS->RFDETR_DINOV2_WITH_REGISTERS -class RFDetrDinov2WithRegistersModel(RFDetrDinov2WithRegistersPreTrainedModel): - def __init__(self, config: RFDetrDinov2WithRegistersConfig): - super().__init__(config) - self.config = config - - self.embeddings = RFDetrDinov2WithRegistersEmbeddings(config) - self.encoder = RFDetrDinov2WithRegistersEncoder(config) - - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> RFDetrDinov2WithRegistersPatchEmbeddings: - return self.embeddings.patch_embeddings - - def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(RFDETR_DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPooling, - config_class=_CONFIG_FOR_DOC, - modality="vision", - expected_output=_EXPECTED_OUTPUT_SHAPE, - ) - def forward( - self, - pixel_values: Optional[torch.Tensor] = None, - bool_masked_pos: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) - - encoder_outputs = self.encoder( - embedding_output, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - sequence_output = self.layernorm(sequence_output) - pooled_output = sequence_output[:, 0, :] - - if not return_dict: - head_outputs = (sequence_output, pooled_output) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - 
hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -RFDETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r""" +RF_DETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See @@ -742,11 +623,10 @@ def forward( @add_start_docstrings( """ - Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer. + RFDetrDinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer. """, - RFDETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING, + RF_DETR_DINOV2_WITH_REGISTERS_START_DOCSTRING, ) -# Copied from transformers.models.dinov2_with_registers.modeling_dinov2_with_registers.Dinov2WithRegistersBackbone with Dinov2WithRegisters->RFDetrDinov2WithRegisters, DINOV2_WITH_REGISTERS->RFDETR_DINOV2_WITH_REGISTERS class RFDetrDinov2WithRegistersBackbone(RFDetrDinov2WithRegistersPreTrainedModel, BackboneMixin): def __init__(self, config): super().__init__(config) @@ -765,7 +645,7 @@ def __init__(self, config): def get_input_embeddings(self) -> RFDetrDinov2WithRegistersPatchEmbeddings: return self.embeddings.patch_embeddings - @add_start_docstrings_to_model_forward(RFDETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(RF_DETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -781,6 +661,13 @@ def forward( Returns: Examples: + Returns: + + Examples: + Returns: + + Examples: + ```python @@ -829,8 +716,32 @@ def forward( # cause normally the order is height, width batch_size, _, height, width = pixel_values.shape patch_size = self.config.patch_size - hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1) + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + batch_size, height_width, channels = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape( + batch_size // num_windows_squared, num_windows_squared * height_width, channels + ) + hidden_state = hidden_state.view( + batch_size // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + channels, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps += (hidden_state,) if not return_dict: diff --git a/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..e4a53b858371 --- /dev/null +++ b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py @@ -0,0 +1,292 @@ +from typing import Optional, Tuple, Union + +import torch + +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ..dinov2_with_registers.configuration_dinov2_with_registers import Dinov2WithRegistersConfig +from ..dinov2_with_registers.modeling_dinov2_with_registers import ( + Dinov2WithRegistersBackbone, + 
Dinov2WithRegistersEmbeddings, + Dinov2WithRegistersEncoder, + Dinov2WithRegistersLayer, +) + + +class RFDetrDinov2WithRegistersConfig(Dinov2WithRegistersConfig): + def __init__(self, num_windows: int = 4, window_block_indexes=None, **super_kwargs): + super(Dinov2WithRegistersConfig).__init__(**super_kwargs) + + self.num_windows = num_windows + window_block_indexes = set(range(self._out_indices[-1] + 1)) + window_block_indexes.difference_update(self._out_indices) + window_block_indexes = list(window_block_indexes) + self.window_block_indexes = window_block_indexes + + +class RFDetrDinov2WithRegistersEmbeddings(Dinov2WithRegistersEmbeddings): + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + if self.config.num_windows > 1: + # reshape for windows + num_h_patches = height // self.config.patch_size + num_w_patches = width // self.config.patch_size + cls_token_with_pos_embed = embeddings[:, :1] + pixel_tokens_with_pos_embed = embeddings[:, 1:] + pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( + batch_size, num_h_patches, num_w_patches, -1 + ) + num_w_patches_per_window = num_w_patches // self.config.num_windows + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_windows = self.config.num_windows + windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( + batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 + ) + windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) + windowed_pixel_tokens = windowed_pixel_tokens.reshape( + batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1 + ) + windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) + embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) + + # add register tokens + embeddings = ( + torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + if self.config.num_register_tokens > 0 + else embeddings + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class RFDetrDinov2WithRegistersLayer(Dinov2WithRegistersLayer): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.num_windows = config.num_windows + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + remove_windows: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + assert head_mask is None, "head_mask is not supported for windowed attention" + assert not output_attentions, "output_attentions is not supported for windowed attention" + shortcut = hidden_states + if remove_windows: + # reshape x to remove windows + B, HW, C = hidden_states.shape + 
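+            # fold the windowed batch back into the per-image batch so this block's self-attention spans all windows of an image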
num_windows_squared = self.num_windows**2 + hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) + + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + if remove_windows: + # reshape x to add windows back + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) + attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + shortcut + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class RFDetrDinov2WithRegistersEncoder(Dinov2WithRegistersEncoder): + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if i > int(self.config.out_features[-1][5:]): + # early stop if we have reached the last output feature + break + + remove_windows = i not in self.config.window_block_indexes + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + remove_windows, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, remove_windows) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class RFDetrDinov2WithRegistersBackbone(Dinov2WithRegistersBackbone): + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + Returns: + + Examples: + + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = 
AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + B, HW, C = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) + hidden_state = hidden_state.view( + B // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + C, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +__all__ = [ + "RFDetrDinov2WithRegistersConfig", + "RFDetrDinov2WithRegistersBackbone", +] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index a7051cffca81..a0683b901966 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8387,6 +8387,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class RFDetrDinov2WithRegistersBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class 
RFDetrModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class RobertaForCausalLM(metaclass=DummyObject): _backends = ["torch"] From 22c4ce8ffe48612a74d561501c6b23d241d35b09 Mon Sep 17 00:00:00 2001 From: steven Date: Mon, 31 Mar 2025 23:59:43 +0200 Subject: [PATCH 4/6] (draft) model conversion --- .../models/rf_detr/configuration_rf_detr.py | 41 +- .../rf_detr/convert_rf_detr_weights_to_hf.py | 313 ++++++++++ .../models/rf_detr/modeling_rf_detr.py | 537 ++++++++---------- .../modeling_rf_detr_dinov2_with_registers.py | 18 +- .../modular_rf_detr_dinov2_with_registers.py | 9 + 5 files changed, 585 insertions(+), 333 deletions(-) create mode 100644 src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py diff --git a/src/transformers/models/rf_detr/configuration_rf_detr.py b/src/transformers/models/rf_detr/configuration_rf_detr.py index 0b333b0d5beb..0bce9b42a0aa 100644 --- a/src/transformers/models/rf_detr/configuration_rf_detr.py +++ b/src/transformers/models/rf_detr/configuration_rf_detr.py @@ -161,24 +161,27 @@ def __init__( encoder_ffn_dim=1024, encoder_attention_heads=8, encoder_layerdrop=0.0, + encoder_n_points=4, # RFDetrDecoder - decoder_layers=6, - decoder_ffn_dim=1024, - decoder_attention_heads=8, - activation_function="relu", + decoder_layers=3, d_model=256, - dropout=0.1, attention_dropout=0.0, + dropout=0.1, + activation_function="relu", activation_dropout=0.0, + decoder_self_attention_heads=8, + decoder_cross_attention_heads=16, + decoder_n_points=4, + decoder_ffn_dim=2048, + # LWDetr + layer_norm: bool = True, + ## auxiliary_loss=False, position_embedding_type="sine", dilation=False, - num_feature_levels=4, - encoder_n_points=4, - decoder_n_points=4, - two_stage=False, + two_stage=True, two_stage_num_proposals=300, - with_box_refine=False, + with_box_refine=True, class_cost=1, bbox_cost=5, giou_cost=2, @@ -191,15 +194,18 @@ def __init__( disable_custom_kernels=False, out_feature_indexes: List[int] = [2, 5, 8, 11], scale_factors: List[float] = [1.0], - layer_norm: bool = False, projector_in_channels: Optional[List[int]] = None, projector_num_blocks: int = 3, # TODO rename projector_survival_prob: float = 1.0, projector_force_drop_last_n_features: int = 0, projector_activation_function: str = "silu", - hidden_expansion: float = 0.5, + csp_hidden_expansion: float = 0.5, + bottleneck_hidden_expansion: float = 0.5, batch_norm_eps: float = 1e-5, + bbox_reparam: bool = True, is_encoder_decoder=True, + num_groups=13, + light_reference_point_refinement: bool = True, **kwargs, ): if backbone_config is None and backbone is None: @@ -232,7 +238,8 @@ def __init__( self.encoder_attention_heads = encoder_attention_heads self.decoder_ffn_dim = decoder_ffn_dim self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads + self.decoder_self_attention_heads = decoder_self_attention_heads + self.decoder_cross_attention_heads = decoder_cross_attention_heads self.dropout = dropout self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout @@ -247,7 +254,6 @@ def __init__( self.backbone_kwargs = backbone_kwargs self.dilation = dilation # deformable attributes - self.num_feature_levels = num_feature_levels self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points 
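+        # two_stage: generate box proposals from the encoder features and refine them in the decoder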
self.two_stage = two_stage @@ -275,6 +281,7 @@ def __init__( "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]" ) + self.num_feature_levels = len(scale_factors) self.layer_norm = layer_norm self.projector_in_channels = ( projector_in_channels @@ -288,9 +295,13 @@ def __init__( self.projector_survival_prob = projector_survival_prob self.projector_force_drop_last_n_features = projector_force_drop_last_n_features self.projector_activation_function = projector_activation_function - self.hidden_expansion = hidden_expansion + self.csp_hidden_expansion = csp_hidden_expansion + self.bottleneck_expansion = bottleneck_hidden_expansion self.batch_norm_eps = batch_norm_eps self.encoder_hidden_dim = backbone_config.hidden_size + self.bbox_reparam = bbox_reparam + self.num_groups = num_groups + self.light_reference_point_refinement = light_reference_point_refinement super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py new file mode 100644 index 000000000000..ff90e4737273 --- /dev/null +++ b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py @@ -0,0 +1,313 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
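For orientation, a minimal sketch of how the reworked configuration arguments fit together; it assumes the draft `RFDetrConfig` is importable from `transformers` as the convert script below expects, and that its backbone defaults resolve, which is not guaranteed at this stage of the patch:

```python
from transformers import RFDetrConfig  # exported by this patch

# num_feature_levels is no longer a free argument: it is derived from scale_factors,
# and the decoder now uses separate self- and cross-attention head counts.
config = RFDetrConfig(
    scale_factors=[2.0, 1.0, 0.5],  # must be a consecutive subset of [2.0, 1.0, 0.5, 0.25]
    decoder_self_attention_heads=8,
    decoder_cross_attention_heads=16,
    bbox_reparam=True,
    num_groups=13,
)
assert config.num_feature_levels == len(config.scale_factors)
```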
+"""Convert RF Detr checkpoints to Hugging Face Transformers format.""" + +import argparse +import json +import re +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import ( + AutoConfig, + RFDetrConfig, + RFDetrDinov2WithRegistersConfig, + RFDetrForObjectDetection, + RTDetrImageProcessor, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_rt_detr_v2_config(model_name: str) -> RFDetrConfig: + if model_name in ["rf-detr-base", "rf-detr-base-2"]: + dinov2_size = "small" + elif model_name == "rf-detr-large": + dinov2_size = "base" + + base_backbone_model_name = f"facebook/dinov2-with-registers-{dinov2_size}" + num_register_tokens = 0 + out_indices = [2, 5, 8, 11] + base_backbone = AutoConfig.from_pretrained( + base_backbone_model_name, + num_register_tokens=num_register_tokens, + out_indices=out_indices, + ) + + num_windows = 4 + backbone_config = RFDetrDinov2WithRegistersConfig( + **base_backbone.to_dict(), + num_windows=num_windows, + ) + + scale_factors = [2.0, 0.5] + d_model = 384 + decoder_self_attention_heads = 12 + decoder_cross_attention_heads = 24 + num_labels = 91 + config = RFDetrConfig( + backbone_config=backbone_config, + scale_factors=scale_factors, + d_model=d_model, + decoder_self_attention_heads=decoder_self_attention_heads, + decoder_cross_attention_heads=decoder_cross_attention_heads, + num_labels=num_labels, + ) + + config.num_labels = 80 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + if model_name in ["rf-detr-base", "rf-detr-base-2"]: + pass + # config.backbone_config.hidden_sizes = [64, 128, 256, 512] + # config.backbone_config.depths = [2, 2, 2, 2] + # config.backbone_config.layer_type = "basic" + # config.encoder_in_channels = [128, 256, 512] + # config.hidden_expansion = 0.5 + # config.decoder_layers = 3 + elif model_name == "rf-detr-large": + pass + # config.backbone_config.hidden_sizes = [64, 128, 256, 512] + # config.backbone_config.depths = [3, 4, 6, 3] + # config.backbone_config.layer_type = "basic" + # config.encoder_in_channels = [128, 256, 512] + # config.hidden_expansion = 0.5 + # config.decoder_layers = 4 + + return config + + +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"backbone.0.encoder.encoder": r"model.backbone.conv_encoder.model", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).(weight|bias)": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.\4", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.conv", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.norm", + r"backbone.0.projector.stages.(\d+).0.cv1.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv1.conv", + r"backbone.0.projector.stages.(\d+).0.cv1.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv1.norm", + r"backbone.0.projector.stages.(\d+).0.cv2.conv": 
r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv2.conv", + r"backbone.0.projector.stages.(\d+).0.cv2.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv2.norm", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv1.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv1.conv", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv1.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv1.norm", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv2.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv2.conv", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv2.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv2.norm", + r"backbone.0.projector.stages.(\d+).1": r"model.backbone.conv_encoder.projector.scale_layers.\1.layer_norm", + r"transformer.decoder.layers.(\d+).self_attn.out_proj": r"model.decoder.layers.\1.self_attn.out_proj", + r"transformer.decoder.layers.(\d+).norm1": r"model.decoder.layers.\1.self_attn_layer_norm", + r"transformer.decoder.layers.(\d+).cross_attn.sampling_offsets": r"model.decoder.layers.\1.encoder_attn.sampling_offsets", + r"transformer.decoder.layers.(\d+).cross_attn.attention_weights": r"model.decoder.layers.\1.encoder_attn.attention_weights", + r"transformer.decoder.layers.(\d+).cross_attn.value_proj": r"model.decoder.layers.\1.encoder_attn.value_proj", + r"transformer.decoder.layers.(\d+).cross_attn.output_proj": r"model.decoder.layers.\1.encoder_attn.output_proj", + r"transformer.decoder.layers.(\d+).norm2": r"model.decoder.layers.\1.encoder_attn_layer_norm", + r"transformer.decoder.layers.(\d+).linear1": r"model.decoder.layers.\1.fc1", + r"transformer.decoder.layers.(\d+).linear2": r"model.decoder.layers.\1.fc2", + r"transformer.decoder.layers.(\d+).norm3": r"model.decoder.layers.\1.final_layer_norm", + r"transformer.decoder.norm": r"model.decoder.norm", + r"transformer.decoder.ref_point_head": r"model.decoder.reference_points_head", + r"refpoint_embed": r"model.reference_point_embeddings", + r"class_embed": r"model.decoder.class_embed", + r"bbox_embed": r"model.decoder.bbox_embed", + r"transformer.enc_output": r"model.enc_output", + r"transformer.enc_output_norm": r"model.enc_output_norm", + r"transformer.enc_out_bbox_embed": r"model.enc_out_bbox_embed", +} + + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + # Use the mapping to rename keys + for original_key, converted_key in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + for key in list(state_dict_keys.keys()): + new_key = re.sub(original_key, converted_key, key) + if new_key != key: + state_dict_keys[new_key] = state_dict_keys.pop(key) + + return state_dict_keys + + +def read_in_q_k_v(state_dict, config: RFDetrConfig): + prefix = "transformer.decoder.layers" + decoder_hidden_dim = config.d_model + + for i in range(config.decoder_layers): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:decoder_hidden_dim, :] + state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:decoder_hidden_dim] + 
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ + decoder_hidden_dim : 2 * decoder_hidden_dim, : + ] + state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ + decoder_hidden_dim : 2 * decoder_hidden_dim + ] + state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-decoder_hidden_dim:, :] + state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-decoder_hidden_dim:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id): + """ + Copy/paste/tweak model's weights to our RTDETR structure. + """ + + # load default config + config = get_rt_detr_v2_config(model_name) + + # load original model from torch hub + model_name_to_checkpoint_url = { + "rf-detr-base": "https://storage.googleapis.com/rfdetr/rf-detr-base-coco.pth", + # below is a less converged model that may be better for finetuning but worse for inference + "rf-detr-base-2": "https://storage.googleapis.com/rfdetr/rf-detr-base-2.pth", + "rf-detr-large": "https://storage.googleapis.com/rfdetr/rf-detr-large.pth", + } + logger.info(f"Converting model {model_name}...") + state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[ + "model" + ] + + # rename keys + state_dict = convert_old_keys_to_new_keys(state_dict) + for key in state_dict.copy().keys(): + if key.startswith("query_feat"): + del state_dict[key] + + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, config) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + for key in state_dict.copy().keys(): + if key.endswith("num_batches_tracked"): + del state_dict[key] + + # finally, create HuggingFace model and load state dict + model = RFDetrForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + + # load image processor + image_processor = RTDetrImageProcessor() + + # prepare image + img = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR), + transforms.ToTensor(), + ] + ) + original_pixel_values = transformations(img).unsqueeze(0) # insert batch dimension + + encoding = image_processor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + assert torch.allclose(original_pixel_values, pixel_values) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + pixel_values = pixel_values.to(device) + + # Pass image by the model + with torch.no_grad(): + outputs = model(pixel_values) + + if model_name == "rf-detr-base": + expected_slice_logits = torch.tensor( + [[-3.7045, -5.1913, -6.1787], [-4.0106, -9.3450, -5.2043], [-4.1287, -4.7463, -5.8634]] + ) + expected_slice_boxes = torch.tensor( + [[0.2582, 0.5497, 0.4764], [0.1684, 0.1985, 0.2120], [0.7665, 0.4146, 0.4669]] + ) + elif model_name == "rf-detr-base-2": + expected_slice_logits = torch.tensor( + [[-4.6108, -5.9453, -3.8505], [-3.8702, -6.1136, -5.5677], [-3.7790, -6.4538, -5.9449]] + ) + expected_slice_boxes = torch.tensor( + [[0.1691, 0.1984, 0.2118], [0.2594, 0.5506, 0.4736], [0.7669, 0.4136, 0.4654]] + ) + elif 
model_name == "rf-detr-large": + expected_slice_logits = torch.tensor( + [[-4.7881, -4.6754, -6.1624], [-5.4441, -6.6486, -4.3840], [-3.5455, -4.9318, -6.3544]] + ) + expected_slice_boxes = torch.tensor( + [[0.2588, 0.5487, 0.4747], [0.5497, 0.2760, 0.0573], [0.7688, 0.4133, 0.4634]] + ) + else: + raise ValueError(f"Unknown rf_detr_name: {model_name}") + assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-3) + + if output_dir is not None: + Path(output_dir).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {output_dir}") + model.save_pretrained(output_dir) + print(f"Saving image processor to {output_dir}") + image_processor.save_pretrained(output_dir) + + if push_to_hub: + # Upload model, image processor and config to the hub + logger.info("Uploading PyTorch model and image processor to the hub...") + config.push_to_hub( + repo_id=repo_id, + commit_message="Add config from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + model.push_to_hub( + repo_id=repo_id, + commit_message="Add model from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + image_processor.push_to_hub( + repo_id=repo_id, + commit_message="Add image processor from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + default="rf-detr-large", + type=str, + help="model_name of the checkpoint you'd like to convert.", + ) + parser.add_argument("--output_dir", default=None, type=str, help="Location to write HF model and image processor") + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") + parser.add_argument( + "--repo_id", + type=str, + help="repo_id where the model will be pushed to.", + ) + args = parser.parse_args() + write_model_and_image_processor(args.model_name, args.output_dir, args.push_to_hub, args.repo_id) diff --git a/src/transformers/models/rf_detr/modeling_rf_detr.py b/src/transformers/models/rf_detr/modeling_rf_detr.py index 98108a743d07..9b74ab2d88d3 100644 --- a/src/transformers/models/rf_detr/modeling_rf_detr.py +++ b/src/transformers/models/rf_detr/modeling_rf_detr.py @@ -12,7 +12,6 @@ from ...activations import ACT2CLS, ACT2FN from ...integrations.hub_kernels import use_kernel_forward_from_hub from ...modeling_attn_mask_utils import _prepare_4d_attention_mask -from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import meshgrid from ...utils import ( @@ -383,7 +382,9 @@ def __init__( bias=False, ) self.norm = ( - nn.LayerNorm(out_channels) if config.layernorm else nn.BatchNorm2d(out_channels, config.batch_norm_eps) + RFDetrLayerNorm(out_channels, data_format="channels_first") + if config.layer_norm + else nn.BatchNorm2d(out_channels, config.batch_norm_eps) ) self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() @@ -395,18 +396,17 @@ def forward(self, hidden_state): # Copied from transformers.models.rt_detr.modeling_rt_detr.RTDetrRepVggBlock with RTDetr->RFDetr, activation_function->projector_activation_function -class RFDetrRepVggBlock(nn.Module): +class RFDetrCSPRepBottleneck(nn.Module): """ RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again". 
""" - def __init__(self, config: RFDetrConfig): + def __init__(self, config: RFDetrConfig, hidden_channels: int): super().__init__() activation = config.projector_activation_function - hidden_channels = int(config.encoder_hidden_dim * config.hidden_expansion) - self.conv1 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1, padding=1) - self.conv2 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 1, 1, padding=0) + self.conv1 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1) + self.conv2 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 1, 1) self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() def forward(self, x): @@ -419,27 +419,35 @@ class RFDetrCSPRepLayer(nn.Module): Cross Stage Partial (CSP) network layer with RepVGG blocks. """ - def __init__(self, config: RFDetrConfig, in_channels): + def __init__(self, config: RFDetrConfig, in_channels: int): super().__init__() out_channels = config.d_model - num_blocks = 3 + num_blocks = config.projector_num_blocks activation = config.projector_activation_function - hidden_channels = int(out_channels * config.hidden_expansion) - self.conv1 = RFDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) - self.conv2 = RFDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) - self.bottlenecks = nn.Sequential(*[RFDetrRepVggBlock(config) for _ in range(num_blocks)]) - if hidden_channels != out_channels: - self.conv3 = RFDetrConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation) - else: - self.conv3 = nn.Identity() + self.hidden_channels = int(out_channels * config.csp_hidden_expansion) + self.conv1 = RFDetrConvNormLayer(config, in_channels, 2 * self.hidden_channels, 1, 1, activation=activation) + self.conv2 = RFDetrConvNormLayer( + config, + (2 + num_blocks) * self.hidden_channels, + out_channels, + 1, + 1, + activation=activation, + ) + self.bottlenecks = nn.Sequential( + *[RFDetrCSPRepBottleneck(config, self.hidden_channels) for _ in range(num_blocks)] + ) - def forward(self, hidden_state): - hidden_state_1 = self.conv1(hidden_state) - hidden_state_1 = self.bottlenecks(hidden_state_1) - hidden_state_2 = self.conv2(hidden_state) - return self.conv3(hidden_state_1 + hidden_state_2) + def forward(self, hidden_states): + hidden_states = self.conv1(hidden_states) + all_hidden_states = list(hidden_states.split(self.hidden_channels, 1)) + hidden_states = all_hidden_states[-1] + all_hidden_states.extend(bottleneck(hidden_states) for bottleneck in self.bottlenecks) + hidden_states = torch.cat(all_hidden_states, 1) + hidden_states = self.conv2(hidden_states) + return hidden_states class RFDetrScaleProjectorLayer(nn.Module): @@ -452,7 +460,7 @@ def __init__(self, config: RFDetrConfig, scale: float, in_channels: int): elif scale == 1.0: pass elif scale == 0.5: - layers.extend(RFDetrConvNormLayer(in_channels, in_channels, 3, 2)) + layers.append(RFDetrConvNormLayer(config, in_channels, in_channels, 3, 2)) else: raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) self.layers = nn.Sequential(*layers) @@ -461,6 +469,37 @@ def forward(self, hidden_state): return self.layers(hidden_state) +# Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->RFDetr +class RFDetrLayerNorm(nn.Module): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. 
channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {self.data_format}") + self.normalized_shape = (normalized_shape,) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.data_format == "channels_last": + x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + input_dtype = x.dtype + x = x.float() + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = x.to(dtype=input_dtype) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + class RFDetrScaleProjector(nn.Module): def __init__(self, config: RFDetrConfig, scale: float, in_channels: List[int]): super().__init__() @@ -470,18 +509,15 @@ def __init__(self, config: RFDetrConfig, scale: float, in_channels: List[int]): ) in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) - self.stage_layers = nn.ModuleList( - [ - RFDetrCSPRepLayer(config, in_dim), - nn.LayerNorm(config.d_model), - ] - ) + self.stage_layer = RFDetrCSPRepLayer(config, in_dim) + self.layer_norm = RFDetrLayerNorm(config.d_model, data_format="channels_first") def forward(self, hidden_states): - features = [layer(hidden_state) for layer, hidden_state in zip(self.layers, hidden_states)] + features = [layer(hidden_state) for layer, hidden_state in zip(self.sampling_layers, hidden_states)] features = torch.cat(features, dim=1) - output = self.stage_layers(features) - return output + hidden_state = self.stage_layer(features) + hidden_state = self.layer_norm(hidden_state) + return hidden_state class RFDetrMultiScaleProjector(nn.Module): @@ -536,7 +572,7 @@ def forward(self, features): # don't do it inplace to ensure the compiler can optimize out the backbone layers features[-(i + 1)] = torch.zeros_like(features[-(i + 1)]) - outputs = [layer(x) for layer, x in zip(self.scale_layers, features)] + outputs = [layer(features) for layer in self.scale_layers] return outputs @@ -585,7 +621,6 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps - features = self.projector(features) out = [] @@ -1033,7 +1068,7 @@ def __init__(self, config: RFDetrConfig): # self-attention self.self_attn = RFDetrMultiheadAttention( embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, + num_heads=config.decoder_self_attention_heads, dropout=config.attention_dropout, ) self.dropout = config.dropout @@ -1044,7 +1079,7 @@ def __init__(self, config: RFDetrConfig): # cross-attention self.encoder_attn = RFDetrMultiscaleDeformableAttention( config, - num_heads=config.decoder_attention_heads, + num_heads=config.decoder_cross_attention_heads, n_points=config.decoder_n_points, ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -1250,156 +1285,6 @@ 
def _init_weights(self, module): """ -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->RFDetr -class RFDetrEncoder(RFDetrPreTrainedModel): - """ - Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a - [`RFDetrEncoderLayer`]. - - The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. - - Args: - config: RFDetrConfig - """ - - def __init__(self, config: RFDetrConfig): - super().__init__(config) - self.gradient_checkpointing = False - - self.dropout = config.dropout - self.layers = nn.ModuleList([RFDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - - # Initialize weights and apply final processing - self.post_init() - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, device): - """ - Get reference points for each feature map. Used in decoder. - - Args: - spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): - Spatial shapes of each feature map. - valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): - Valid ratios of each feature map. - device (`torch.device`): - Device on which to create the tensors. - Returns: - `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` - """ - reference_points_list = [] - for level, (height, width) in enumerate(spatial_shapes): - ref_y, ref_x = meshgrid( - torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), - torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device), - indexing="ij", - ) - # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36 - ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) - ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) - ref = torch.stack((ref_x, ref_y), -1) - reference_points_list.append(ref) - reference_points = torch.cat(reference_points_list, 1) - reference_points = reference_points[:, :, None] * valid_ratios[:, None] - return reference_points - - def forward( - self, - inputs_embeds=None, - attention_mask=None, - position_embeddings=None, - spatial_shapes=None, - spatial_shapes_list=None, - level_start_index=None, - valid_ratios=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - - 1 for pixel features that are real (i.e. **not masked**), - - 0 for pixel features that are padding (i.e. **masked**). - [What are attention masks?](../glossary#attention-mask) - position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Position embeddings that are added to the queries and keys in each self-attention layer. - spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): - Spatial shapes of each feature map. - level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): - Starting index of each feature map. 
- valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): - Ratio of valid area in each feature level. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - hidden_states = inputs_embeds - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - spatial_shapes_tuple = tuple(spatial_shapes_list) - reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - for i, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - position_embeddings, - reference_points, - spatial_shapes, - spatial_shapes_list, - level_start_index, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions, - ) - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->RFDetr class RFDetrDecoder(RFDetrPreTrainedModel): """ @@ -1427,15 +1312,48 @@ def __init__(self, config: RFDetrConfig): self.bbox_embed = None self.class_embed = None + self.config = config + self.reference_points_head = RFDetrMLPPredictionHead(2 * config.d_model, config.d_model, config.d_model, 2) + + self.norm = RFDetrLayerNorm(config.d_model, data_format="channels_first") + # Initialize weights and apply final processing self.post_init() + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 
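+        # divide each sigmoid-scaled coordinate by the geometric frequency terms; sin/cos of the result are flattened below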
+ pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def get_reference_points(self, reference_points_embeds, valid_ratios): + obj_center = reference_points_embeds[..., :4] + + refpoints_input = ( + obj_center[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) # bs, nq, nlevel, 4 + query_sine_embed = self.get_proposal_pos_embed(refpoints_input[:, :, 0, :], self.d_model / 2) # bs, nq, 256*2 + position_query_embeddings = self.reference_points_head(query_sine_embed) + return refpoints_input, position_query_embeddings + def forward( self, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, position_embeddings=None, + reference_points_embeddings=None, reference_points=None, spatial_shapes=None, spatial_shapes_list=None, @@ -1504,6 +1422,12 @@ def forward( else: raise ValueError("Reference points' last dimension must be of size 2") + if self.config.bbox_reparam: + query_sine_embed = self.get_proposal_pos_embed( + reference_points_input[:, :, 0, :], self.d_model / 2 + ) # bs, nq, 256*2 + position_embeddings = self.reference_points_head(query_sine_embed) + if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1539,17 +1463,24 @@ def forward( if self.bbox_embed is not None: tmp = self.bbox_embed[idx](hidden_states) num_coordinates = reference_points.shape[-1] - if num_coordinates == 4: - new_reference_points = tmp + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - elif num_coordinates == 2: - new_reference_points = tmp - new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - else: - raise ValueError( - f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + if self.config.bbox_reparam: + new_reference_points_cxcy = ( + tmp[..., :2] * reference_points_embeddings[..., 2:] + reference_points_embeddings[..., :2] ) + new_reference_points_wh = tmp[..., 2:].exp() * reference_points_embeddings[..., 2:] + new_reference_points = torch.concat([new_reference_points_cxcy, new_reference_points_wh], dim=-1) + else: + if num_coordinates == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) reference_points = new_reference_points.detach() intermediate += (hidden_states,) @@ -1604,63 +1535,32 @@ class RFDetrModel(RFDetrPreTrainedModel): def __init__(self, config: RFDetrConfig): super().__init__(config) + self.bbox_reparam = config.bbox_reparam + self.two_stage = config.two_stage + self.num_groups = config.num_groups + self.num_queries = config.num_queries + self.d_model = config.d_model + # Create backbone + positional encoding backbone = RFDetrConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = RFDetrConvModel(backbone, position_embeddings) - # Create input projection layers - if config.num_feature_levels > 1: - num_backbone_outs = len(backbone.intermediate_channel_sizes) - input_proj_list = [] - for _ in 
range(num_backbone_outs): - in_channels = backbone.intermediate_channel_sizes[_] - input_proj_list.append( - nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=1), - nn.GroupNorm(32, config.d_model), - ) - ) - for _ in range(config.num_feature_levels - num_backbone_outs): - input_proj_list.append( - nn.Sequential( - nn.Conv2d( - in_channels, - config.d_model, - kernel_size=3, - stride=2, - padding=1, - ), - nn.GroupNorm(32, config.d_model), - ) - ) - in_channels = config.d_model - self.input_proj = nn.ModuleList(input_proj_list) - else: - self.input_proj = nn.ModuleList( - [ - nn.Sequential( - nn.Conv2d( - backbone.intermediate_channel_sizes[-1], - config.d_model, - kernel_size=1, - ), - nn.GroupNorm(32, config.d_model), - ) - ] - ) - if not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + self.query_position_embeddings = nn.Embedding(config.num_queries * config.num_groups, config.d_model) + + self.reference_point_embeddings = nn.Embedding(config.num_queries * config.num_groups, 4) + nn.init.constant_(self.reference_point_embeddings.weight.data, 0) - self.encoder = RFDetrEncoder(config) self.decoder = RFDetrDecoder(config) self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) if config.two_stage: - self.enc_output = nn.Linear(config.d_model, config.d_model) - self.enc_output_norm = nn.LayerNorm(config.d_model) + self.enc_output = nn.ModuleList( + [nn.Linear(config.d_model, config.d_model) for _ in range(config.num_groups)] + ) + self.enc_output_norm = nn.ModuleList([nn.LayerNorm(config.d_model) for _ in range(config.num_groups)]) self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) else: @@ -1768,7 +1668,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) object_query = enc_output object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) object_query = object_query.masked_fill(~output_proposals_valid, float(0)) - object_query = self.enc_output_norm(self.enc_output(object_query)) + # object_query = self.enc_output_norm(self.enc_output(object_query)) return object_query, output_proposals @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) @@ -1824,100 +1724,109 @@ def forward( # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) # First, sent pixel_values + pixel_mask through Backbone to obtain the features # which is a list of tuples - features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + features, _ = self.backbone(pixel_values, pixel_mask) # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) sources = [] masks = [] - for level, (source, mask) in enumerate(features): - sources.append(self.input_proj[level](source)) + for source, mask in enumerate(features): + sources.append(source) masks.append(mask) if mask is None: raise ValueError("No attention mask was provided") - # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage - if self.config.num_feature_levels > len(sources): - _len_sources = len(sources) - for level in range(_len_sources, self.config.num_feature_levels): - if level == _len_sources: - source = self.input_proj[level](features[-1][0]) - else: - source = self.input_proj[level](sources[-1]) - mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to( - 
torch.bool - )[0] - pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) - sources.append(source) - masks.append(mask) - position_embeddings_list.append(pos_l) - # Create queries query_embeds = None if not self.config.two_stage: - query_embeds = self.query_position_embeddings.weight + if self.training: + query_embeds = self.query_position_embeddings.weight + else: + query_embeds = self.query_position_embeddings.weight[: self.num_queries] + + if self.training: + reference_point_embeds = self.reference_point_embeddings.weight + else: + reference_point_embeds = self.reference_point_embeddings.weight[: self.num_queries] # Prepare encoder inputs (by flattening) source_flatten = [] mask_flatten = [] - lvl_pos_embed_flatten = [] spatial_shapes_list = [] - for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + for source, mask in enumerate(zip(sources, masks)): batch_size, num_channels, height, width = source.shape spatial_shape = (height, width) spatial_shapes_list.append(spatial_shape) source = source.flatten(2).transpose(1, 2) mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) source_flatten.append(source) mask_flatten.append(mask) source_flatten = torch.cat(source_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1) - # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder - # Also provide spatial_shapes, level_start_index and valid_ratios - if encoder_outputs is None: - encoder_outputs = self.encoder( - inputs_embeds=source_flatten, - attention_mask=mask_flatten, - position_embeddings=lvl_pos_embed_flatten, - spatial_shapes=spatial_shapes, - spatial_shapes_list=spatial_shapes_list, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - # Fifth, prepare decoder inputs - batch_size, _, num_channels = encoder_outputs[0].shape + encoder_hidden_states = source_flatten + batch_size, _, num_channels = encoder_hidden_states.shape enc_outputs_class = None enc_outputs_coord_logits = None - if self.config.two_stage: + + if self.two_stage: object_query_embedding, output_proposals = self.gen_encoder_output_proposals( - encoder_outputs[0], ~mask_flatten, spatial_shapes_list + source_flatten, ~mask_flatten, spatial_shapes_list ) + reference_points = [] + encoder_hidden_states = [] + boxes_ts = [] + num_groups = self.num_groups if self.training else 1 + for group_id in range(num_groups): + object_query_embedding = self.enc_output[group_id](object_query_embedding) + 
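+                # layer-norm the group-specific projection before this group's class and box heads score the encoder proposals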
object_query_embedding = self.enc_output_norm[group_id](object_query_embedding) + + enc_outputs_class = self.enc_out_class_embed[group_id](object_query_embedding) + + if self.bbox_reparam: + enc_outputs_coord_delta = self.enc_out_bbox_embed[group_id](object_query_embedding) + enc_outputs_coord_cxcy = ( + enc_outputs_coord_delta[..., :2] * output_proposals[..., 2:] + output_proposals[..., :2] + ) + enc_outputs_coord_wh = enc_outputs_coord_delta[..., 2:].exp() * output_proposals[..., 2:] + enc_outputs_coord_logits = torch.concat([enc_outputs_coord_cxcy, enc_outputs_coord_wh], dim=-1) + else: + delta_bbox = self.enc_out_bbox_embed[group_id](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + topk = self.config.two_stage_num_proposals + enc_outputs_class = torch.max(enc_outputs_class, dim=-1)[0] + topk_proposals = torch.topk(enc_outputs_class, topk, dim=1)[1] # bs, nq + + reference_point_embedding = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) # unsigmoid + # for decoder layer, detached as initial ones, (bs, nq, 4) + reference_point_embedding_detached = reference_point_embedding.detach() + + # get memory tgt + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ) + + reference_points.append(reference_point_embedding_detached) + encoder_hidden_states.append(target) + boxes_ts.append(reference_point_embedding) + reference_points = torch.cat(reference_points, dim=1) + encoder_hidden_states = torch.cat(encoder_hidden_states, dim=1) + boxes_ts = torch.cat(boxes_ts, dim=1) + # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) - enc_outputs_coord_logits = delta_bbox + output_proposals + # delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + # enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals topk = self.config.two_stage_num_proposals @@ -1943,7 +1852,7 @@ def forward( decoder_outputs = self.decoder( inputs_embeds=target, position_embeddings=query_embed, - encoder_hidden_states=encoder_outputs[0], + encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=mask_flatten, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1969,9 +1878,10 @@ def forward( decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, + # TODO + # encoder_last_hidden_state=features.last_hidden_state, + # encoder_hidden_states=features.hidden_states, + # encoder_attentions=features.attentions, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, ) @@ -2036,21 +1946,28 @@ def __init__(self, config: RFDetrConfig): # if two-stage, the last class_embed and bbox_embed is for region proposal generation num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers if config.with_box_refine: - self.class_embed = _get_clones(self.class_embed, num_pred) - self.bbox_embed = _get_clones(self.bbox_embed, num_pred) - nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # self.class_embed = _get_clones(self.class_embed, num_pred) + # self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + # nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) # hack implementation for iterative bounding box refinement self.model.decoder.bbox_embed = self.bbox_embed else: - nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) - self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) - self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + # nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + # self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + # self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) self.model.decoder.bbox_embed = None if config.two_stage: # hack implementation for two-stage self.model.decoder.class_embed = self.class_embed - for box_embed in self.bbox_embed: - nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + # for box_embed in self.bbox_embed: + # nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + self.model.enc_out_bbox_embed = nn.ModuleList( + [copy.deepcopy(self.bbox_embed) for _ in range(config.num_groups)] + ) + self.model.enc_out_class_embed = nn.ModuleList( + [copy.deepcopy(self.class_embed) for _ in range(config.num_groups)] + ) # Initialize weights and apply final processing self.post_init() @@ -2112,7 +2029,7 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # 
First, sent images through DETR base model to obtain encoder + decoder outputs + # First, sent images through RfDETR base model to obtain backbone + decoder outputs outputs = self.model( pixel_values, pixel_mask=pixel_mask, diff --git a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py index ad1ec42236be..6d56fbb39514 100644 --- a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py +++ b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py @@ -74,7 +74,11 @@ def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) - self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + self.register_tokens = ( + nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + if config.num_register_tokens > 0 + else None + ) self.patch_embeddings = RFDetrDinov2WithRegistersPatchEmbeddings(config) num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) @@ -723,19 +727,17 @@ def forward( if self.config.num_windows > 1: # undo windowing num_windows_squared = self.config.num_windows**2 - batch_size, height_width, channels = hidden_state.shape + B, HW, C = hidden_state.shape num_h_patches_per_window = num_h_patches // self.config.num_windows num_w_patches_per_window = num_w_patches // self.config.num_windows - hidden_state = hidden_state.reshape( - batch_size // num_windows_squared, num_windows_squared * height_width, channels - ) + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) hidden_state = hidden_state.view( - batch_size // num_windows_squared, + B // num_windows_squared, self.config.num_windows, self.config.num_windows, num_h_patches_per_window, num_w_patches_per_window, - channels, + C, ) hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) @@ -758,4 +760,4 @@ def forward( ) -__all__ = ["RFDetrDinov2WithRegistersBackbone", "RFDetrDinov2WithRegistersPreTrainedModel"] +__all__ = ["RFDetrDinov2WithRegistersBackbone"] diff --git a/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py index e4a53b858371..7203dcfc4e1b 100644 --- a/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py +++ b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py @@ -1,6 +1,7 @@ from typing import Optional, Tuple, Union import torch +from torch import nn from ...modeling_outputs import BackboneOutput, BaseModelOutput from ..dinov2_with_registers.configuration_dinov2_with_registers import Dinov2WithRegistersConfig @@ -24,6 +25,14 @@ def __init__(self, num_windows: int = 4, window_block_indexes=None, **super_kwar class RFDetrDinov2WithRegistersEmbeddings(Dinov2WithRegistersEmbeddings): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super(Dinov2WithRegistersEmbeddings).__init__(config) + self.register_tokens = ( + nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + if config.num_register_tokens > 0 + else None + ) + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape target_dtype 
= self.patch_embeddings.projection.weight.dtype From c1a0fe88e087778d893c6bb20d4e0728c92f77ba Mon Sep 17 00:00:00 2001 From: steven Date: Tue, 1 Apr 2025 23:15:44 +0200 Subject: [PATCH 5/6] fix: set scaling to None in RFDetrDinov2WithRegistersSelfAttention --- .../rf_detr/modeling_rf_detr_dinov2_with_registers.py | 2 +- .../rf_detr/modular_rf_detr_dinov2_with_registers.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py index 6d56fbb39514..54530aacffa7 100644 --- a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py +++ b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py @@ -237,7 +237,7 @@ def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.dropout_prob = config.attention_probs_dropout_prob - self.scaling = self.attention_head_size**-0.5 + self.scaling = None self.is_causal = False self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) diff --git a/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py index 7203dcfc4e1b..7e7a785111d6 100644 --- a/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py +++ b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py @@ -10,6 +10,7 @@ Dinov2WithRegistersEmbeddings, Dinov2WithRegistersEncoder, Dinov2WithRegistersLayer, + Dinov2WithRegistersSelfAttention, ) @@ -86,6 +87,12 @@ def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Te return embeddings +class RFDetrDinov2WithRegistersSelfAttention(Dinov2WithRegistersSelfAttention): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super(Dinov2WithRegistersSelfAttention).__init__(config) + self.scaling = None + + class RFDetrDinov2WithRegistersLayer(Dinov2WithRegistersLayer): def __init__(self, config: RFDetrDinov2WithRegistersConfig): super().__init__(config) From e4832456748a5737b1de0b618a2781f8375b25f2 Mon Sep 17 00:00:00 2001 From: steven Date: Tue, 8 Apr 2025 23:13:22 +0200 Subject: [PATCH 6/6] (draft) --- .../rf_detr/convert_rf_detr_weights_to_hf.py | 38 +++++++++++++--- .../models/rf_detr/modeling_rf_detr.py | 44 ++++++++----------- 2 files changed, 50 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py index ff90e4737273..8f920f19b6fc 100644 --- a/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py +++ b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py @@ -31,10 +31,22 @@ RFDetrDinov2WithRegistersConfig, RFDetrForObjectDetection, RTDetrImageProcessor, + RTDetrImageProcessorFast, ) from transformers.utils import logging +torch.set_printoptions(precision=6, sci_mode=False) + + +def custom_repr(self): + # return f"{tuple(self.shape)} {self.flatten()[-10:].tolist()} {original_repr(self)}" + return f"{tuple(self.shape)} {self.flatten()[-3:].tolist()}" + + +original_repr = torch.Tensor.__repr__ +torch.Tensor.__repr__ = custom_repr + logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -129,11 +141,11 @@ def get_rt_detr_v2_config(model_name: str) -> 
RFDetrConfig: r"transformer.decoder.norm": r"model.decoder.norm", r"transformer.decoder.ref_point_head": r"model.decoder.reference_points_head", r"refpoint_embed": r"model.reference_point_embeddings", - r"class_embed": r"model.decoder.class_embed", - r"bbox_embed": r"model.decoder.bbox_embed", r"transformer.enc_output": r"model.enc_output", r"transformer.enc_output_norm": r"model.enc_output_norm", r"transformer.enc_out_bbox_embed": r"model.enc_out_bbox_embed", + r"transformer.enc_out_class_embed": r"model.enc_out_class_embed", + r"query_feat": r"model.query_position_embeddings", } @@ -169,6 +181,16 @@ def read_in_q_k_v(state_dict, config: RFDetrConfig): state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-decoder_hidden_dim:] +def copy_weights(state_dict, config): + for key, value in dict(state_dict.items()).items(): + if key.startswith("bbox_embed"): + new_key = f"model.decoder.{key}" + state_dict[new_key] = value + if key.startswith("class_embed"): + new_key = f"model.decoder.{key}" + state_dict[new_key] = value + + # We will verify our results on an image of cute cats def prepare_img(): url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -197,7 +219,7 @@ def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[ "model" ] - + original_state_dict = state_dict.copy() # rename keys state_dict = convert_old_keys_to_new_keys(state_dict) for key in state_dict.copy().keys(): @@ -206,6 +228,8 @@ def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id # query, key and value matrices need special treatment read_in_q_k_v(state_dict, config) + # certain weights are copied from the RFDetrForObjectDetection to the RFDetrDecoder + copy_weights(state_dict, config) # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them for key in state_dict.copy().keys(): if key.endswith("num_batches_tracked"): @@ -213,11 +237,13 @@ def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id # finally, create HuggingFace model and load state dict model = RFDetrForObjectDetection(config) + target_state_dict = model.state_dict() model.load_state_dict(state_dict) + loaded_state_dict = model.state_dict() model.eval() # load image processor - image_processor = RTDetrImageProcessor() + image_processor = RTDetrImageProcessorFast(size={"height": 560, "width": 560}, do_normalize=True) # prepare image img = prepare_img() @@ -225,7 +251,7 @@ def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id # preprocess image transformations = transforms.Compose( [ - transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR), + transforms.Resize([560, 560], interpolation=transforms.InterpolationMode.BILINEAR), transforms.ToTensor(), ] ) @@ -234,8 +260,6 @@ def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id encoding = image_processor(images=img, return_tensors="pt") pixel_values = encoding["pixel_values"] - assert torch.allclose(original_pixel_values, pixel_values) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) pixel_values = pixel_values.to(device) diff --git a/src/transformers/models/rf_detr/modeling_rf_detr.py b/src/transformers/models/rf_detr/modeling_rf_detr.py index 9b74ab2d88d3..c719f942b46c 100644 --- 
a/src/transformers/models/rf_detr/modeling_rf_detr.py +++ b/src/transformers/models/rf_detr/modeling_rf_detr.py @@ -373,6 +373,7 @@ def __init__( activation: str = None, ): super().__init__() + activation = config.projector_activation_function if activation is None else activation self.conv = nn.Conv2d( in_channels, out_channels, @@ -406,12 +407,13 @@ def __init__(self, config: RFDetrConfig, hidden_channels: int): activation = config.projector_activation_function self.conv1 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1) - self.conv2 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 1, 1) + self.conv2 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1) self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() - def forward(self, x): - y = self.conv1(x) + self.conv2(x) - return self.activation(y) + def forward(self, hidden_states): + output_states = self.conv1(hidden_states) + output_states = self.conv2(output_states) + return hidden_states + output_states class RFDetrCSPRepLayer(nn.Module): @@ -424,27 +426,22 @@ def __init__(self, config: RFDetrConfig, in_channels: int): out_channels = config.d_model num_blocks = config.projector_num_blocks - activation = config.projector_activation_function self.hidden_channels = int(out_channels * config.csp_hidden_expansion) - self.conv1 = RFDetrConvNormLayer(config, in_channels, 2 * self.hidden_channels, 1, 1, activation=activation) - self.conv2 = RFDetrConvNormLayer( - config, - (2 + num_blocks) * self.hidden_channels, - out_channels, - 1, - 1, - activation=activation, - ) - self.bottlenecks = nn.Sequential( - *[RFDetrCSPRepBottleneck(config, self.hidden_channels) for _ in range(num_blocks)] + self.conv1 = RFDetrConvNormLayer(config, in_channels, 2 * self.hidden_channels, 1, 1) + self.conv2 = RFDetrConvNormLayer(config, (2 + num_blocks) * self.hidden_channels, out_channels, 1, 1) + self.bottlenecks = nn.ModuleList( + [RFDetrCSPRepBottleneck(config, self.hidden_channels) for _ in range(num_blocks)] ) def forward(self, hidden_states): hidden_states = self.conv1(hidden_states) all_hidden_states = list(hidden_states.split(self.hidden_channels, 1)) hidden_states = all_hidden_states[-1] - all_hidden_states.extend(bottleneck(hidden_states) for bottleneck in self.bottlenecks) + for bottleneck in self.bottlenecks: + new_hidden_states = bottleneck(hidden_states) + all_hidden_states.append(new_hidden_states) + # all_hidden_states.extend(bottleneck(hidden_states) for bottleneck in self.bottlenecks) hidden_states = torch.cat(all_hidden_states, 1) hidden_states = self.conv2(hidden_states) return hidden_states @@ -460,7 +457,7 @@ def __init__(self, config: RFDetrConfig, scale: float, in_channels: int): elif scale == 1.0: pass elif scale == 0.5: - layers.append(RFDetrConvNormLayer(config, in_channels, in_channels, 3, 2)) + layers.append(RFDetrConvNormLayer(config, in_channels, in_channels, 3, 2, activation="relu")) else: raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) self.layers = nn.Sequential(*layers) @@ -1546,23 +1543,20 @@ def __init__(self, config: RFDetrConfig): position_embeddings = build_position_encoding(config) self.backbone = RFDetrConvModel(backbone, position_embeddings) - if not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries * config.num_groups, config.d_model) + self.query_position_embeddings = nn.Embedding(config.num_queries * config.num_groups, config.d_model) self.reference_point_embeddings = 
nn.Embedding(config.num_queries * config.num_groups, 4) nn.init.constant_(self.reference_point_embeddings.weight.data, 0) self.decoder = RFDetrDecoder(config) - self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) - if config.two_stage: self.enc_output = nn.ModuleList( [nn.Linear(config.d_model, config.d_model) for _ in range(config.num_groups)] ) self.enc_output_norm = nn.ModuleList([nn.LayerNorm(config.d_model) for _ in range(config.num_groups)]) - self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) - self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + # self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + # self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) else: self.reference_points = nn.Linear(config.d_model, 2) @@ -1729,7 +1723,7 @@ def forward( # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) sources = [] masks = [] - for source, mask in enumerate(features): + for source, mask in features: sources.append(source) masks.append(mask) if mask is None: