From b7e33112809065dafd19bfadfdf6ee9ae7b8110e Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 02:51:06 +0200
Subject: [PATCH 01/35] add florence2 config
---
.../florence/configuration_florence2.py | 339 ++++++++++++++++++
1 file changed, 339 insertions(+)
create mode 100644 src/transformers/models/florence/configuration_florence2.py
diff --git a/src/transformers/models/florence/configuration_florence2.py b/src/transformers/models/florence/configuration_florence2.py
new file mode 100644
index 000000000000..6dd576ee1be5
--- /dev/null
+++ b/src/transformers/models/florence/configuration_florence2.py
@@ -0,0 +1,339 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Florence-2 configuration"""
+
+import warnings
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+logger = logging.get_logger(__name__)
+
+class Florence2VisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            The stochastic depth (drop path) rate.
+        patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
+            The patch size used by each patch-embedding stage.
+        patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
+            The patch stride used by each patch-embedding stage.
+        patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
+            The patch padding used by each patch-embedding stage.
+        patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
+            Whether to apply layer normalization before each patch-embedding stage.
+        enable_checkpoint (`bool`, *optional*, defaults to `False`):
+            Whether to enable gradient checkpointing.
+        dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
+            The embedding dimension of each stage.
+        num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of attention heads in each stage.
+        num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of groups for the grouped attention in each stage.
+        depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
+            The number of blocks in each stage.
+ window_size (`int`, *optional*, defaults to 12):
+ The window size of the model.
+ projection_dim (`int`, *optional*, defaults to 1024):
+ The dimension of the projection layer.
+ visual_temporal_embedding (`dict`, *optional*):
+ The configuration of the visual temporal embedding.
+ image_pos_embed (`dict`, *optional*):
+ The configuration of the image position embedding.
+ image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
+ The source of the image feature.
+ Example:
+
+ ```python
+ >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+
+ >>> # Initializing a Florence2 Vision style configuration
+ >>> configuration = Florence2VisionConfig()
+
+ >>> # Initializing a model (with random weights)
+ >>> model = Florence2VisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "florence2_vision"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ drop_path_rate=0.1,
+ patch_size=[7, 3, 3, 3],
+ patch_stride=[4, 2, 2, 2],
+ patch_padding=[3, 1, 1, 1],
+ patch_prenorm=[False, True, True, True],
+ enable_checkpoint=False,
+ dim_embed=[256, 512, 1024, 2048],
+ num_heads=[8, 16, 32, 64],
+ num_groups=[8, 16, 32, 64],
+ depths=[1, 1, 9, 1],
+ window_size=12,
+ projection_dim=1024,
+ visual_temporal_embedding=None,
+ image_pos_embed=None,
+ image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
+ **kwargs,
+ ):
+ self.drop_path_rate = drop_path_rate
+ self.patch_size = patch_size
+ self.patch_stride = patch_stride
+ self.patch_padding = patch_padding
+ self.patch_prenorm = patch_prenorm
+ self.enable_checkpoint = enable_checkpoint
+ self.dim_embed = dim_embed
+ self.num_heads = num_heads
+ self.num_groups = num_groups
+ self.depths = depths
+ self.window_size = window_size
+ self.projection_dim = projection_dim
+ self.visual_temporal_embedding = visual_temporal_embedding
+ self.image_pos_embed = image_pos_embed
+ self.image_feature_source = image_feature_source
+
+ super().__init__(**kwargs)
+
+
+class Florence2LanguageConfig(PretrainedConfig):
+ r"""
+    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate the
+    Florence-2 language model (a BART-style encoder-decoder) according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the BART
+    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 51289):
+ Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Florence2LanguageModel`].
+ d_model (`int`, *optional*, defaults to 1024):
+ Dimensionality of the layers and the pooler layer.
+ encoder_layers (`int`, *optional*, defaults to 12):
+ Number of encoder layers.
+ decoder_layers (`int`, *optional*, defaults to 12):
+ Number of decoder layers.
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ activation_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for activations inside the fully connected layer.
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for classifier.
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ init_std (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by dividing by sqrt(d_model).
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ num_labels (`int`, *optional*, defaults to 3):
+ The number of labels to use in [`Florence2LanguageForSequenceClassification`].
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+ `eos_token_id`.
+
+ Example:
+
+ ```python
+ >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
+
+ >>> # Initializing a Florence2 Language style configuration
+ >>> configuration = Florence2LanguageConfig()
+
+ >>> # Initializing a model (with random weights)
+    >>> model = Florence2LanguageModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "florence2_language"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+ def __init__(
+ self,
+ vocab_size=51289,
+ max_position_embeddings=1024,
+ encoder_layers=12,
+ encoder_ffn_dim=4096,
+ encoder_attention_heads=16,
+ decoder_layers=12,
+ decoder_ffn_dim=4096,
+ decoder_attention_heads=16,
+ encoder_layerdrop=0.0,
+ decoder_layerdrop=0.0,
+ activation_function="gelu",
+ d_model=1024,
+ dropout=0.1,
+ attention_dropout=0.0,
+ activation_dropout=0.0,
+ init_std=0.02,
+ classifier_dropout=0.0,
+ scale_embedding=False,
+ use_cache=True,
+ num_labels=3,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ is_encoder_decoder=True,
+ decoder_start_token_id=2,
+ forced_eos_token_id=2,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.d_model = d_model
+ self.encoder_ffn_dim = encoder_ffn_dim
+ self.encoder_layers = encoder_layers
+ self.encoder_attention_heads = encoder_attention_heads
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.decoder_layers = decoder_layers
+ self.decoder_attention_heads = decoder_attention_heads
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.activation_function = activation_function
+ self.init_std = init_std
+ self.encoder_layerdrop = encoder_layerdrop
+ self.decoder_layerdrop = decoder_layerdrop
+ self.classifier_dropout = classifier_dropout
+ self.use_cache = use_cache
+ self.num_hidden_layers = encoder_layers
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
+
+ super().__init__(
+ num_labels=num_labels,
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ is_encoder_decoder=is_encoder_decoder,
+ decoder_start_token_id=decoder_start_token_id,
+ forced_eos_token_id=forced_eos_token_id,
+ **kwargs,
+ )
+
+ # ensure backward compatibility for BART CNN models
+ if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
+ self.forced_bos_token_id = self.bos_token_id
+ warnings.warn(
+ f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+ "The config can simply be saved and uploaded again to be fixed."
+ )
+
+
+class Florence2Config(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
+ Florence-2 model according to the specified arguments, defining the model architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vision_config (`Florence2VisionConfig`, *optional*):
+ Custom vision config or dict
+ text_config (`Union[AutoConfig, dict]`, *optional*):
+ The config object of the text backbone.
+ ignore_index (`int`, *optional*, defaults to -100):
+ The ignore index for the loss function.
+ vocab_size (`int`, *optional*, defaults to 51289):
+            Vocabulary size of the Florence-2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`~Florence2ForConditionalGeneration`].
+ projection_dim (`int`, *optional*, defaults to 1024):
+ Dimension of the multimodal projection space.
+
+ Example:
+
+ ```python
+ >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
+
+ >>> # Initializing a clip-like vision config
+ >>> vision_config = CLIPVisionConfig()
+
+ >>> # Initializing a Bart config
+ >>> text_config = BartConfig()
+
+ >>> # Initializing a Florence-2 configuration
+ >>> configuration = Florence2Config(vision_config, text_config)
+
+ >>> # Initializing a model from the florence-2 configuration
+ >>> model = Florence2ForConditionalGeneration(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "florence2"
+ is_composition = False
+
+ def __init__(
+ self,
+ vision_config=None,
+ text_config=None,
+ ignore_index=-100,
+ vocab_size=51289,
+ projection_dim=1024,
+ **kwargs,
+ ):
+ self.ignore_index = ignore_index
+ self.vocab_size = vocab_size
+ self.projection_dim = projection_dim
+        if vision_config is not None:
+            vision_config = Florence2VisionConfig(**vision_config)
+        self.vision_config = vision_config
+
+ self.text_config = text_config
+ if text_config is not None:
+ self.text_config = Florence2LanguageConfig(**text_config)
+
+ super().__init__(**kwargs)
+
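
For orientation, a minimal sketch of how the composed config is meant to be used (the nested dict values are illustrative, not required fields; the import path follows the file added above):

```python
from transformers.models.florence.configuration_florence2 import Florence2Config

# Sub-configs may be passed as plain dicts; Florence2Config.__init__ rebuilds
# the typed Florence2VisionConfig / Florence2LanguageConfig objects from them.
config = Florence2Config(
    vision_config={"drop_path_rate": 0.1, "window_size": 12},
    text_config={"d_model": 1024, "encoder_layers": 12, "decoder_layers": 12},
    projection_dim=1024,
)
assert config.text_config.d_model == 1024
assert config.vision_config.window_size == 12
```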
From 3adc633745ece2f10ba8a51d7ae9b8023c0c6aca Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 02:54:26 +0200
Subject: [PATCH 02/35] add florence2 processing
---
.../models/florence/processing_florence2.py | 1088 +++++++++++++++++
1 file changed, 1088 insertions(+)
create mode 100644 src/transformers/models/florence/processing_florence2.py
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
new file mode 100644
index 000000000000..90797a3a242d
--- /dev/null
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -0,0 +1,1088 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Florence-2.
+"""
+
+import re
+from typing import List, Optional, Union
+
+import numpy as np
+
+import torch
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import (
+ PaddingStrategy,
+ PreTokenizedInput,
+ TextInput,
+ TruncationStrategy,
+)
+from ...utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_url
+def is_url(val) -> bool:
+ return isinstance(val, str) and val.startswith("http")
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
+def is_image_or_image_url(elem):
+ return is_url(elem) or is_valid_image(elem)
+
+
+def _is_str_or_image(elem):
+    return isinstance(elem, str) or is_image_or_image_url(elem)
+
+
+class Florence2Processor(ProcessorMixin):
+ r"""
+ Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
+
+ [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
+ [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
+
+ Args:
+ image_processor ([`CLIPImageProcessor`], *optional*):
+ The image processor is a required input.
+ tokenizer ([`BartTokenizerFast`], *optional*):
+ The tokenizer is a required input.
+ """
+
+ attributes = ["image_processor", "tokenizer"]
+ image_processor_class = "CLIPImageProcessor"
+ tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+
+ def __init__(
+ self,
+ image_processor=None,
+ tokenizer=None,
+ ):
+ if image_processor is None:
+ raise ValueError("You need to specify an `image_processor`.")
+ if tokenizer is None:
+ raise ValueError("You need to specify a `tokenizer`.")
+ if not hasattr(image_processor, "image_seq_length"):
+ raise ValueError("Image processor is missing an `image_seq_length` attribute.")
+
+ self.image_seq_length = image_processor.image_seq_length
+
+        tokens_to_add = {
+            'additional_special_tokens': \
+                tokenizer.additional_special_tokens + \
+                ['<od>', '</od>', '<ocr>', '</ocr>'] + \
+                [f'<loc_{x}>' for x in range(1000)] + \
+                ['<cap>', '</cap>', '<ncap>', '</ncap>', '<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
+        }
+ tokenizer.add_special_tokens(tokens_to_add)
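+        # The <loc_{x}> tokens encode quantized box/polygon coordinates,
+        # one token per coordinate bin (1000 bins per axis; see the quantizers below).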
+
+        self.tasks_answer_post_processing_type = {
+            '<OCR>': 'pure_text',
+            '<OCR_WITH_REGION>': 'ocr',
+            '<CAPTION>': 'pure_text',
+            '<DETAILED_CAPTION>': 'pure_text',
+            '<MORE_DETAILED_CAPTION>': 'pure_text',
+            '<OD>': 'description_with_bboxes',
+            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
+            '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
+            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
+            '<REGION_TO_SEGMENTATION>': 'polygons',
+            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
+            '<REGION_TO_CATEGORY>': 'pure_text',
+            '<REGION_TO_DESCRIPTION>': 'pure_text',
+            '<REGION_TO_OCR>': 'pure_text',
+            '<REGION_PROPOSAL>': 'bboxes'
+        }
+
+        self.task_prompts_without_inputs = {
+            '<OCR>': 'What is the text in the image?',
+            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
+            '<CAPTION>': 'What does the image describe?',
+            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
+            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
+            '<OD>': 'Locate the objects with category name in the image.',
+            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
+            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
+        }
+
+        self.task_prompts_with_input = {
+            '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
+            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
+            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
+            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
+            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
+            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
+            '<REGION_TO_OCR>': 'What text is in the region {input}?',
+        }
+
+ self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
+
+
+ super().__init__(image_processor, tokenizer)
+
+ def _construct_prompts(self, text):
+ # replace the task tokens with the task prompts if task token is in the text
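+        # e.g. "<CAPTION>" -> "What does the image describe?", and
+        # "<CAPTION_TO_PHRASE_GROUNDING>A green car" -> "Locate the phrases in the caption: A green car"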
+ prompts = []
+ for _text in text:
+ # 1. fixed task prompts without additional inputs
+ for task_token, task_prompt in self.task_prompts_without_inputs.items():
+ if task_token in _text:
+ assert _text == task_token, f"Task token {task_token} should be the only token in the text."
+ _text = task_prompt
+ break
+ # 2. task prompts with additional inputs
+ for task_token, task_prompt in self.task_prompts_with_input.items():
+ if task_token in _text:
+ _text = task_prompt.format(input=_text.replace(task_token, ''))
+ break
+ prompts.append(_text)
+ return prompts
+
+ def __call__(
+ self,
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+ images: ImageInput = None,
+ tokenize_newline_separately: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = None,
+ max_length=None,
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+ do_resize: bool = None,
+ do_normalize: bool = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821
+ input_data_format: Optional[
+ Union[str, "ChannelDimension"] # noqa: F821
+ ] = None,
+ resample: "PILImageResampling" = None, # noqa: F821
+ do_convert_rgb: bool = None,
+ do_thumbnail: bool = None,
+ do_align_long_axis: bool = None,
+ do_rescale: bool = None,
+ ) -> BatchFeature:
+ """
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+ and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+ of the above two methods for more information.
+
+ Args:
+ text (`str`, `List[str]`, `List[List[str]]`):
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+ tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+ number of channels, H and W are image height and width.
+ tokenize_newline_separately (`bool`, defaults to `True`):
+ Adds a separately tokenized '\n' at the end of the prompt.
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
+ index) among:
+                  - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                    sequence is provided).
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+ acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+ lengths).
+ max_length (`int`, *optional*):
+ Maximum length of the returned list and optionally padding length (see above).
+ truncation (`bool`, *optional*):
+ Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
+ If set, will return tensors of a particular framework. Acceptable values are:
+
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+ Returns:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+ `None`).
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **labels** -- Labels compatible with training. Returned when `return_token_type_ids` is `True`.
+ """
+
+ return_token_type_ids = False
+
+ if images is None:
+ raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
+ if text is None:
+ logger.warning_once(
+ "You are using Florence-2 without a text prompt."
+ )
+ text = ""
+
+ if isinstance(text, List) and isinstance(images, List):
+ if len(images) < len(text):
+ raise ValueError(
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
+ )
+ if _is_str_or_image(text):
+ text = [text]
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
+ pass
+
+ pixel_values = self.image_processor(
+ images,
+ do_resize=do_resize,
+ do_normalize=do_normalize,
+ return_tensors=return_tensors,
+ image_mean=image_mean,
+ image_std=image_std,
+ input_data_format=input_data_format,
+ data_format=data_format,
+ resample=resample,
+ do_convert_rgb=do_convert_rgb,
+ )["pixel_values"]
+
+ if max_length is not None:
+ max_length -= self.image_seq_length # max_length has to account for the image tokens
+
+ text = self._construct_prompts(text)
+
+ inputs = self.tokenizer(
+ text,
+ return_tensors=return_tensors,
+ padding=padding,
+ max_length=max_length,
+ truncation=truncation,
+ return_token_type_ids=return_token_type_ids,
+ )
+
+ return_data = {**inputs, "pixel_values": pixel_values}
+
+ if return_token_type_ids:
+ labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
+ return_data.update({"labels": labels})
+ return BatchFeature(data=return_data)
+
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
+ def batch_decode(self, *args, **kwargs):
+ """
+ This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, **kwargs)
+
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
+ def decode(self, *args, **kwargs):
+ """
+ This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, **kwargs)
+
+ @property
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
+ def model_input_names(self):
+ tokenizer_input_names = self.tokenizer.model_input_names
+ image_processor_input_names = self.image_processor.model_input_names
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+ def post_process_generation(self, text, task, image_size):
+ """
+ Post-process the output of the model to each of the task outputs.
+
+ Args:
+ text (`str`): The text to post-process.
+ task (`str`): The task to post-process the text for.
+            image_size (`Tuple[int, int]`): The size of the image. width x height.
+ """
+
+ task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
+ task_answer = self.post_processor(
+ text=text,
+ image_size=image_size,
+ parse_tasks=task_answer_post_processing_type,
+ )[task_answer_post_processing_type]
+
+ if task_answer_post_processing_type == 'pure_text':
+ final_answer = task_answer
+ # remove the special tokens
+            final_answer = final_answer.replace('<s>', '').replace('</s>', '')
+ elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
+ od_instances = task_answer
+ bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
+ labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
+ final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
+ elif task_answer_post_processing_type in ['ocr']:
+ bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
+ labels = [str(_od_instance['text']) for _od_instance in task_answer]
+ final_answer = {'quad_boxes': bboxes, 'labels': labels}
+ elif task_answer_post_processing_type in ['phrase_grounding']:
+ bboxes = []
+ labels = []
+ for _grounded_phrase in task_answer:
+ for _bbox in _grounded_phrase['bbox']:
+ bboxes.append(_bbox)
+ labels.append(_grounded_phrase['cat_name'])
+ final_answer = {'bboxes': bboxes, 'labels': labels}
+ elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
+ labels = []
+ polygons = []
+ for result in task_answer:
+ label = result['cat_name']
+ _polygons = result['polygons']
+ labels.append(label)
+ polygons.append(_polygons)
+ final_answer = {'polygons': polygons, 'labels': labels}
+ elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
+ bboxes = []
+ bboxes_labels = []
+ polygons = []
+ polygons_labels = []
+ for result in task_answer:
+ label = result['cat_name']
+ if 'polygons' in result:
+ _polygons = result['polygons']
+ polygons.append(_polygons)
+ polygons_labels.append(label)
+ else:
+ _bbox = result['bbox']
+ bboxes.append(_bbox)
+ bboxes_labels.append(label)
+ final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
+ else:
+ raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))
+
+ final_answer = {
+ task: final_answer}
+ return final_answer
+
+class BoxQuantizer(object):
+ def __init__(self, mode, bins):
+ self.mode = mode
+ self.bins = bins
+
+ def quantize(self, boxes: torch.Tensor, size):
+ bins_w, bins_h = self.bins # Quantization bins.
+ size_w, size_h = size # Original image size.
+ size_per_bin_w = size_w / bins_w
+ size_per_bin_h = size_h / bins_h
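+        # e.g. with bins=(1000, 1000) and a 1000x1000 image, each bin spans one
+        # pixel and quantize() maps pixel coordinates to bin indices in [0, 999]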
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
+
+ if self.mode == 'floor':
+ quantized_xmin = (
+ xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
+ quantized_ymin = (
+ ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
+ quantized_xmax = (
+ xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
+ quantized_ymax = (
+ ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
+
+ elif self.mode == 'round':
+ raise NotImplementedError()
+
+ else:
+ raise ValueError('Incorrect quantization type.')
+
+ quantized_boxes = torch.cat(
+ (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
+ ).int()
+
+ return quantized_boxes
+
+ def dequantize(self, boxes: torch.Tensor, size):
+ bins_w, bins_h = self.bins # Quantization bins.
+ size_w, size_h = size # Original image size.
+ size_per_bin_w = size_w / bins_w
+ size_per_bin_h = size_h / bins_h
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
+
+ if self.mode == 'floor':
+ # Add 0.5 to use the center position of the bin as the coordinate.
+ dequantized_xmin = (xmin + 0.5) * size_per_bin_w
+ dequantized_ymin = (ymin + 0.5) * size_per_bin_h
+ dequantized_xmax = (xmax + 0.5) * size_per_bin_w
+ dequantized_ymax = (ymax + 0.5) * size_per_bin_h
+
+ elif self.mode == 'round':
+ raise NotImplementedError()
+
+ else:
+ raise ValueError('Incorrect quantization type.')
+
+ dequantized_boxes = torch.cat(
+ (dequantized_xmin, dequantized_ymin,
+ dequantized_xmax, dequantized_ymax), dim=-1
+ )
+
+ return dequantized_boxes
+
+
+class CoordinatesQuantizer(object):
+ """
+    Quantize coordinates (Nx2)
+ """
+
+ def __init__(self, mode, bins):
+ self.mode = mode
+ self.bins = bins
+
+ def quantize(self, coordinates: torch.Tensor, size):
+ bins_w, bins_h = self.bins # Quantization bins.
+ size_w, size_h = size # Original image size.
+ size_per_bin_w = size_w / bins_w
+ size_per_bin_h = size_h / bins_h
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
+        x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
+
+ if self.mode == 'floor':
+ quantized_x = (x / size_per_bin_w).floor().clamp(0, bins_w - 1)
+ quantized_y = (y / size_per_bin_h).floor().clamp(0, bins_h - 1)
+
+ elif self.mode == 'round':
+ raise NotImplementedError()
+
+ else:
+ raise ValueError('Incorrect quantization type.')
+
+ quantized_coordinates = torch.cat(
+ (quantized_x, quantized_y), dim=-1
+ ).int()
+
+ return quantized_coordinates
+
+ def dequantize(self, coordinates: torch.Tensor, size):
+ bins_w, bins_h = self.bins # Quantization bins.
+ size_w, size_h = size # Original image size.
+ size_per_bin_w = size_w / bins_w
+ size_per_bin_h = size_h / bins_h
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
+        x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
+
+ if self.mode == 'floor':
+ # Add 0.5 to use the center position of the bin as the coordinate.
+ dequantized_x = (x + 0.5) * size_per_bin_w
+ dequantized_y = (y + 0.5) * size_per_bin_h
+
+ elif self.mode == 'round':
+ raise NotImplementedError()
+
+ else:
+ raise ValueError('Incorrect quantization type.')
+
+ dequantized_coordinates = torch.cat(
+ (dequantized_x, dequantized_y), dim=-1
+ )
+
+ return dequantized_coordinates
+
+
+class Florence2PostProcesser(object):
+ """
+    Florence-2 post-processor for converting text predictions into task-specific results.
+
+ Args:
+ config: A dict of configs.
+ tokenizer: A tokenizer for decoding text to spans.
+ sample config:
+ UNIFIED_POST_PROCESS:
+            # common configs
+            NUM_BBOX_HEIGHT_BINS: 1000
+            NUM_BBOX_WIDTH_BINS: 1000
+            COORDINATES_HEIGHT_BINS: 1000
+            COORDINATES_WIDTH_BINS: 1000
+            # task specific configs, override the common configs
+            PARSE_TASKS:
+                - TASK_NAME: 'video_dense_caption'
+                  PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
+                  SCORE_MODE: 'avg_cat_name_scores'
+                  NUM_BINS: 100
+                - TASK_NAME: 'od'
+                  PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
+                  SCORE_MODE: 'avg_cat_name_scores'
+
+ Returns:
+ parsed_dict (dict): A dict of parsed results.
+ """
+ def __init__(
+ self,
+ tokenizer=None
+ ):
+ parse_tasks = []
+ parse_task_configs = {}
+ config = self._create_default_config()
+ for task in config['PARSE_TASKS']:
+ parse_tasks.append(task['TASK_NAME'])
+ parse_task_configs[task['TASK_NAME']] = task
+
+ self.config = config
+ self.parse_tasks = parse_tasks
+ self.parse_tasks_configs = parse_task_configs
+
+ self.tokenizer = tokenizer
+ if self.tokenizer is not None:
+ self.all_special_tokens = set(self.tokenizer.all_special_tokens)
+
+ self.init_quantizers()
+ self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
+
+ def _create_black_list_of_phrase_grounding(self):
+ black_list = {}
+
+ if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
+ black_list = set(
+ ['it', 'I', 'me', 'mine',
+ 'you', 'your', 'yours',
+ 'he', 'him', 'his',
+ 'she', 'her', 'hers',
+ 'they', 'them', 'their', 'theirs',
+ 'one', 'oneself',
+ 'we', 'us', 'our', 'ours',
+ 'you', 'your', 'yours',
+ 'they', 'them', 'their', 'theirs',
+ 'mine', 'yours', 'his', 'hers', 'its',
+ 'ours', 'yours', 'theirs',
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
+ 'ourselves', 'yourselves', 'themselves',
+ 'this', 'that',
+ 'these', 'those',
+ 'who', 'whom', 'whose', 'which', 'what',
+ 'who', 'whom', 'whose', 'which', 'that',
+ 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
+ 'each', 'everybody', 'everyone', 'everything',
+ 'few', 'many', 'nobody', 'none', 'one', 'several',
+ 'some', 'somebody', 'someone', 'something',
+ 'each other', 'one another',
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
+ 'ourselves', 'yourselves', 'themselves',
+ 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
+ 'other objects', 'lots', 'a set',
+ ]
+ )
+
+ return black_list
+
+ def _create_default_config(self):
+ config = {
+ 'NUM_BBOX_HEIGHT_BINS': 1000,
+ 'NUM_BBOX_WIDTH_BINS': 1000,
+ 'BOX_QUANTIZATION_MODE': 'floor',
+ 'COORDINATES_HEIGHT_BINS': 1000,
+ 'COORDINATES_WIDTH_BINS': 1000,
+ 'COORDINATES_QUANTIZATION_MODE': 'floor',
+ 'PARSE_TASKS': [
+ {
+ 'TASK_NAME': 'od',
+                'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
+ },
+ {
+ 'TASK_NAME': 'ocr',
+                'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
+ 'AREA_THRESHOLD': 0.01
+ },
+ {
+ 'TASK_NAME': 'phrase_grounding',
+ 'FILTER_BY_BLACK_LIST': True
+ },
+ {
+ 'TASK_NAME': 'pure_text',
+ },
+ {
+ 'TASK_NAME': 'description_with_bboxes',
+ },
+ {
+ 'TASK_NAME': 'description_with_polygons',
+ },
+ {
+ 'TASK_NAME': 'polygons',
+ },
+ {
+ 'TASK_NAME': 'bboxes',
+ },
+ {
+ 'TASK_NAME': 'description_with_bboxes_or_polygons',
+ }
+ ]
+ }
+
+ return config
+
+ def init_quantizers(self):
+ # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
+ num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
+ num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
+ box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
+ self.box_quantizer = BoxQuantizer(
+ box_quantization_mode,
+ (num_bbox_width_bins, num_bbox_height_bins),
+ )
+
+        num_bbox_height_bins = self.config.get('COORDINATES_HEIGHT_BINS', num_bbox_height_bins)
+        num_bbox_width_bins = self.config.get('COORDINATES_WIDTH_BINS', num_bbox_width_bins)
+        box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE', box_quantization_mode)
+ self.coordinates_quantizer = CoordinatesQuantizer(
+ box_quantization_mode,
+ (num_bbox_width_bins, num_bbox_height_bins),
+ )
+
+ def decode_with_spans(self, tokenizer, token_ids):
+ filtered_tokens = tokenizer.convert_ids_to_tokens(
+ token_ids, skip_special_tokens=False)
+ assert len(filtered_tokens) == len(token_ids)
+
+        # To avoid mixing byte-level and unicode for byte-level BPE
+ # we need to build string separately for added tokens and byte-level tokens
+ # cf. https://github.com/huggingface/transformers/issues/1133
+ sub_texts = []
+ for token in filtered_tokens:
+ if token in self.all_special_tokens:
+ sub_texts.append(token)
+ else:
+ if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
+ sub_text = tokenizer.convert_tokens_to_string([token])
+ elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
+ # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
+ # Note: Do not strip sub_text as it may have functional whitespace
+ sub_text = token.replace('▁', ' ')
+ else:
+ raise ValueError(f'type {type(tokenizer)} not supported')
+ sub_texts.append(sub_text)
+
+ text = ''
+ spans = []
+ for sub_text in sub_texts:
+ span = (len(text), len(text) + len(sub_text)) # [start index, end index).
+ text += sub_text
+ spans.append(span)
+
+ # Text format:
+ # 1. T5Tokenizer/T5TokenizerFast:
+        #      "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_5><loc_6><loc_7><loc_8> cat</s>"
+ # Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
+ # 2. BartTokenizer (need to double check):
+        #      "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_5><loc_6><loc_7><loc_8>cat</s>"
+ # Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
+ return text, spans
+
+ def parse_od_from_text_and_spans(
+ self,
+ text,
+ pattern,
+ image_size,
+ phrase_centric=False
+ ):
+ parsed = list(re.finditer(pattern, text))
+
+ instances = []
+ for i in range(len(parsed)):
+ # Prepare instance.
+ instance = {}
+
+ if phrase_centric:
+ bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
+ else:
+ bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
+ instance['bbox'] = self.box_quantizer.dequantize(
+ boxes=torch.tensor(bbox_bins),
+ size=image_size
+ ).tolist()
+
+ if phrase_centric:
+ instance['cat_name'] = parsed[i].group(1).lower().strip()
+ else:
+ instance['cat_name'] = parsed[i].group(5).lower().strip()
+ instances.append(instance)
+
+ return instances
+
+ def parse_ocr_from_text_and_spans(self,
+ text,
+ pattern,
+ image_size,
+ area_threshold=-1.0,
+ ):
+ bboxes = []
+ labels = []
+        text = text.replace('<s>', '')
+ # ocr with regions
+ parsed = re.findall(pattern, text)
+ instances = []
+ image_width, image_height = image_size
+
+ for ocr_line in parsed:
+ ocr_content = ocr_line[0]
+ quad_box = ocr_line[1:]
+ quad_box = [int(i) for i in quad_box]
+ quad_box = self.coordinates_quantizer.dequantize(
+ torch.tensor(np.array(quad_box).reshape(-1, 2)),
+ size=image_size
+ ).reshape(-1).tolist()
+
+ if area_threshold > 0:
+                x_coords = quad_box[0::2]
+                y_coords = quad_box[1::2]
+
+                # apply the Shoelace formula, including the closing edge back to the first vertex
+                area = 0.5 * abs(sum(x_coords[i] * y_coords[(i + 1) % 4] - x_coords[(i + 1) % 4] * y_coords[i] for i in range(4)))
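+                # e.g. the unit square (0,0), (1,0), (1,1), (0,1) gives 0.5 * |0 + 1 + 1 + 0| = 1.0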
+
+ if area < (image_width * image_height) * area_threshold:
+ continue
+
+ bboxes.append(quad_box)
+ labels.append(ocr_content)
+ instances.append({
+ 'quad_box': quad_box,
+ 'text': ocr_content,
+ })
+ return instances
+
+    def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
+        # ignore <s> </s> and <pad>
+        cur_span = 0
+        if text.startswith('<s>'):
+            cur_span += 3
+
+        text = text.replace('<s>', '')
+        text = text.replace('</s>', '')
+        text = text.replace('<pad>', '')
+
+        pattern = r"([^<]+(?:<loc_\d+>){4,})"
+        phrases = re.findall(pattern, text)
+
+        # pattern should be text pattern and od pattern
+        pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
+        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
+
+        instances = []
+        for pharse_text in phrases:
+            phrase_text_strip = pharse_text.replace('<ground>', '', 1)
+            phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)
+
+ if phrase_text_strip == '':
+ cur_span += len(pharse_text)
+ continue
+
+ # Prepare instance.
+ instance = {}
+
+ # parse phrase, get string
+ phrase = re.search(pattern, phrase_text_strip)
+ if phrase is None:
+ cur_span += len(pharse_text)
+ continue
+
+ # parse bboxes by box_pattern
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
+ if len(bboxes_parsed) == 0:
+ cur_span += len(pharse_text)
+ continue
+
+ phrase = phrase.group()
+ # remove leading and trailing spaces
+ phrase = phrase.strip()
+
+ if phrase in self.black_list_of_phrase_grounding:
+ cur_span += len(pharse_text)
+ continue
+
+ # a list of list
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
+ instance['bbox'] = self.box_quantizer.dequantize(
+ boxes=torch.tensor(bbox_bins),
+ size=image_size
+ ).tolist()
+
+ # exclude non-ascii characters
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
+ instance['cat_name'] = phrase
+
+ instances.append(instance)
+
+ return instances
+
+ def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
+ # temporary parse solution, split by '.'
+        # ignore <s> </s> and <pad>
+
+        text = text.replace('<s>', '')
+        text = text.replace('</s>', '')
+        text = text.replace('<pad>', '')
+
+        if allow_empty_phrase:
+            pattern = rf"(?:(?:<loc_\d+>){{4,}})"
+        else:
+            pattern = r"([^<]+(?:<loc_\d+>){4,})"
+        phrases = re.findall(pattern, text)
+
+        # pattern should be text pattern and od pattern
+        pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
+        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
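+        # e.g. "a green car<loc_52><loc_333><loc_932><loc_774>" yields one instance with
+        # cat_name "a green car" and a dequantized [x1, y1, x2, y2] box in image coordinates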
+
+ instances = []
+ for pharse_text in phrases:
+            phrase_text_strip = pharse_text.replace('<ground>', '', 1)
+            phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)
+
+ if phrase_text_strip == '' and not allow_empty_phrase:
+ continue
+
+ # parse phrase, get string
+ phrase = re.search(pattern, phrase_text_strip)
+ if phrase is None:
+ continue
+
+ phrase = phrase.group()
+ # remove leading and trailing spaces
+ phrase = phrase.strip()
+
+ # parse bboxes by box_pattern
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
+ if len(bboxes_parsed) == 0:
+ continue
+
+ # a list of list
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
+
+ bboxes = self.box_quantizer.dequantize(
+ boxes=torch.tensor(bbox_bins),
+ size=image_size
+ ).tolist()
+
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
+ for _bboxes in bboxes:
+ # Prepare instance.
+ instance = {}
+ instance['bbox'] = _bboxes
+ # exclude non-ascii characters
+ instance['cat_name'] = phrase
+ instances.append(instance)
+
+ return instances
+
+    def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
+                                  allow_empty_phrase=False,
+                                  polygon_sep_token='<sep>',
+                                  polygon_start_token='<poly>',
+                                  polygon_end_token='</poly>',
+                                  with_box_at_start=False,
+                                  ):
+
+        # ref_seg format: '<expression><x1><y1><x2><y2>...'
+        # ignore <s> </s> and <pad>
+
+        text = text.replace('<s>', '')
+        text = text.replace('</s>', '')
+        text = text.replace('<pad>', '')
+
+        if allow_empty_phrase:
+            pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
+        else:
+            # [^<]+: This part matches one or more characters that are not the < symbol.
+            # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
+            #
+            pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
+        phrases = re.findall(pattern, text)
+
+        phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
+        box_pattern = rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'
+
+ # one polygons instance is separated by polygon_start_token and polygon_end_token
+ polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
+
+ instances = []
+ for phrase_text in phrases:
+
+ # exclude loc_\d+>
+ # need to get span if want to include category score
+ phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)
+
+            # phrase = phrase.replace('<poly>', '')
+ # phrase = phrase.replace('poly>', '')
+
+ if phrase_text_strip == '' and not allow_empty_phrase:
+ continue
+
+
+ # parse phrase, get string
+ phrase = re.search(phrase_string_pattern, phrase_text_strip)
+ if phrase is None:
+ continue
+ phrase = phrase.group()
+ # remove leading and trailing spaces
+ phrase = phrase.strip()
+
+ # parse bboxes by box_pattern
+
+ # split by polygon_start_token and polygon_end_token first using polygons_instance_pattern
+ if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
+ polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
+ else:
+ polygons_instances_parsed = [phrase_text]
+
+ for _polygons_instances_parsed in polygons_instances_parsed:
+ # Prepare instance.
+ instance = {}
+
+ # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
+                    if isinstance(_polygons_instances_parsed, str):
+                        polygons_parsed = list(re.finditer(box_pattern, _polygons_instances_parsed))
+                    else:
+                        polygons_parsed = list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
+ if len(polygons_parsed) == 0:
+ continue
+
+ # a list of list (polygon)
+ bbox = []
+ polygons = []
+ for _polygon_parsed in polygons_parsed:
+                    # group 1: whole <loc_\d+>... sequence
+                    _polygon = _polygon_parsed.group(1)
+                    # parse into list of int
+                    _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
+ if with_box_at_start and len(bbox) == 0:
+ if len(_polygon) > 4:
+ # no valid bbox prediction
+ bbox = _polygon[:4]
+ _polygon = _polygon[4:]
+ else:
+ bbox = [0, 0, 0, 0]
+ # abandon last element if is not paired
+ if len(_polygon) % 2 == 1:
+ _polygon = _polygon[:-1]
+
+ # reshape into (n, 2)
+ _polygon = self.coordinates_quantizer.dequantize(
+ torch.tensor(np.array(_polygon).reshape(-1, 2)),
+ size=image_size
+ ).reshape(-1).tolist()
+ # reshape back
+ polygons.append(_polygon)
+
+ instance['cat_name'] = phrase
+ instance['polygons'] = polygons
+ if len(bbox) != 0:
+ instance['bbox'] = self.box_quantizer.dequantize(
+ boxes=torch.tensor([bbox]),
+ size=image_size
+ ).tolist()[0]
+
+ instances.append(instance)
+
+ return instances
+
+ def __call__(
+ self,
+ text=None,
+ image_size=None,
+ parse_tasks=None,
+ ):
+ """
+ Args:
+ text: model outputs
+ image_size: (width, height)
+ parse_tasks: a list of tasks to parse, if None, parse all tasks.
+
+ """
+ if parse_tasks is not None:
+ if isinstance(parse_tasks, str):
+ parse_tasks = [parse_tasks]
+ for _parse_task in parse_tasks:
+ assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
+
+ # sequence or text should be provided
+ assert text is not None, 'text should be provided'
+
+ parsed_dict = {
+ 'text': text
+ }
+
+ for task in self.parse_tasks:
+ if parse_tasks is not None and task not in parse_tasks:
+ continue
+
+ pattern = self.parse_tasks_configs[task].get('PATTERN', None)
+
+ if task == 'ocr':
+ instances = self.parse_ocr_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.01),
+ )
+ parsed_dict['ocr'] = instances
+ elif task == 'phrase_grounding':
+ instances = self.parse_phrase_grounding_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ )
+ parsed_dict['phrase_grounding'] = instances
+ elif task == 'pure_text':
+ parsed_dict['pure_text'] = text
+ elif task == 'description_with_bboxes':
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ )
+ parsed_dict['description_with_bboxes'] = instances
+ elif task == 'description_with_polygons':
+ instances = self.parse_description_with_polygons_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ )
+ parsed_dict['description_with_polygons'] = instances
+ elif task == 'polygons':
+ instances = self.parse_description_with_polygons_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ allow_empty_phrase=True,
+ )
+ parsed_dict['polygons'] = instances
+ elif task == 'bboxes':
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ allow_empty_phrase=True,
+ )
+ parsed_dict['bboxes'] = instances
+ elif task == 'description_with_bboxes_or_polygons':
+                if '<poly>' in text:
+ # only support either polygons or bboxes, not both at the same time
+ instances = self.parse_description_with_polygons_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ )
+ else:
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
+ text,
+ pattern=pattern,
+ image_size=image_size,
+ )
+ parsed_dict['description_with_bboxes_or_polygons'] = instances
+ else:
+ raise ValueError("task {} is not supported".format(task))
+
+ return parsed_dict
\ No newline at end of file
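
For orientation, a hedged end-to-end sketch of how the processor and post-processor are meant to interact (the `model` and `image` objects are placeholders; only the processor API and the task tokens registered above are defined in this patch):

```python
# "<OD>" is one of the task tokens added in Florence2Processor.__init__;
# _construct_prompts() turns it into "Locate the objects with category name in the image."
inputs = processor(text="<OD>", images=image, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=1024)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# image_size is (width, height), matching the quantizers' (size_w, size_h) convention
parsed = processor.post_process_generation(
    generated_text, task="<OD>", image_size=(image.width, image.height)
)
# -> {"<OD>": {"bboxes": [[x1, y1, x2, y2], ...], "labels": ["car", ...]}}
```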
From b4cb3d6e2425dd570c2458af698d25f976f957e8 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 03:01:01 +0200
Subject: [PATCH 03/35] fix florence2 processing, add
image_utils.ChannelDimension
---
src/transformers/models/florence/processing_florence2.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index 90797a3a242d..7e985e17a8b7 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -24,7 +24,7 @@
import torch
from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ImageInput, is_valid_image
+from ...image_utils import ImageInput, is_valid_image, ChannelDimension
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import (
PaddingStrategy,
@@ -166,9 +166,9 @@ def __call__(
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
- data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821
+ data_format: Optional[ChannelDimension] = "channels_first", # noqa: F821
input_data_format: Optional[
- Union[str, "ChannelDimension"] # noqa: F821
+ Union[str, ChannelDimension] # noqa: F821
] = None,
resample: "PILImageResampling" = None, # noqa: F821
do_convert_rgb: bool = None,
From 73041bacb3a04297347e7c51674508ac551b18c0 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 03:03:38 +0200
Subject: [PATCH 04/35] fix florence2 processing, add missing tokenizers
---
src/transformers/models/florence/processing_florence2.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index 7e985e17a8b7..b97945c47672 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -23,6 +23,11 @@
import torch
+from ...models.bart.tokenization_bart import BartTokenizer
+from ...models.bart.tokenization_bart_fast import BartTokenizerFast
+from ...models.t5.tokenization_t5 import T5Tokenizer
+from ...models.t5.tokenization_t5_fast import T5TokenizerFast
+
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image, ChannelDimension
from ...processing_utils import ProcessorMixin
From 1ce7813f61edda78316239b8e9b42ee91cd06a97 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 03:13:33 +0200
Subject: [PATCH 05/35] add florence2 modeling
---
.../models/florence/modeling_florence2.py | 2928 +++++++++++++++++
1 file changed, 2928 insertions(+)
create mode 100644 src/transformers/models/florence/modeling_florence2.py
diff --git a/src/transformers/models/florence/modeling_florence2.py b/src/transformers/models/florence/modeling_florence2.py
new file mode 100644
index 000000000000..41a71c8b5624
--- /dev/null
+++ b/src/transformers/models/florence/modeling_florence2.py
@@ -0,0 +1,2928 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" PyTorch Florence-2 model."""
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import math
+import warnings
+import torch
+import torch.utils.checkpoint
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from torch.nn import CrossEntropyLoss
+from collections import OrderedDict
+# from einops import rearrange
+
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_florence2 import (
+ Florence2Config,
+ Florence2LanguageConfig,
+ Florence2VisionConfig,
+)
+
+
+from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import (
+ _prepare_4d_attention_mask,
+ _prepare_4d_attention_mask_for_sdpa,
+ _prepare_4d_causal_attention_mask,
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from ...modeling_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPastAndCrossAttentions,
+ Seq2SeqLMOutput,
+ Seq2SeqModelOutput,
+)
+
+
+if is_flash_attn_2_available():
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Florence2Config"
+
+class LearnedAbsolutePositionEmbedding2D(nn.Module):
+ """
+ This module learns positional embeddings up to a fixed maximum size.
+ """
+
+ def __init__(self, embedding_dim=256, num_pos=50):
+ super().__init__()
+ self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
+ self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
+
+ def forward(self, pixel_values):
+ """
+ pixel_values: (batch_size, height, width, num_channels)
+        returns: (batch_size, height, width, embedding_dim)
+ """
+ if len(pixel_values.shape) != 4:
+ raise ValueError('pixel_values must be a 4D tensor')
+ height, width = pixel_values.shape[1:3]
+ width_values = torch.arange(width, device=pixel_values.device)
+ height_values = torch.arange(height, device=pixel_values.device)
+ x_emb = self.column_embeddings(width_values)
+ y_emb = self.row_embeddings(height_values)
+        # (height, width, embedding_dim)
+        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
+        # (embedding_dim, height, width)
+        pos = pos.permute(2, 0, 1)
+        pos = pos.unsqueeze(0)
+        # (batch_size, embedding_dim, height, width)
+        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
+        # (batch_size, height, width, embedding_dim)
+        pos = pos.permute(0, 2, 3, 1)
+ return pos
+
+class PositionalEmbeddingCosine1D(nn.Module):
+ """
+ This class implements a very simple positional encoding. It follows closely
+ the encoder from the link below:
+ https://pytorch.org/tutorials/beginner/translation_transformer.html
+
+ Args:
+        embed_dim: The dimension of the embeddings.
+        max_seq_len: The maximum sequence length for which to precompute the positional encodings.
+ """
+    def __init__(
+        self,
+        embed_dim: int = 512,
+        max_seq_len: int = 1024,
+    ) -> None:
+        super().__init__()
+ self.embed_dim = embed_dim
+ self.max_seq_len = max_seq_len
+ # Generate the sinusoidal arrays.
+ factor = math.log(10000)
+ denominator = torch.exp(
+ -factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim)
+ # Matrix where rows correspond to a positional embedding as a function
+ # of the position index (i.e., the row index).
+        frequencies = torch.arange(0, self.max_seq_len).reshape(self.max_seq_len, 1) * denominator
+ pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
+        # Even-indexed dimensions get sine, odd-indexed dimensions get cosine.
+ pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
+ pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
+ # Save the positional embeddings in a constant buffer.
+ self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
+
+ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ seq_embeds: The sequence embeddings in order. Allowed size:
+ 1. [T, D], where T is the length of the sequence, and D is the
+ frame embedding dimension.
+ 2. [B, T, D], where B is the batch size and T and D are the
+ same as above.
+
+        Returns a tensor with the same number of dimensions as the input: i.e.,
+        [1, T, D] or [T, D].
+ """
+ shape_len = len(seq_embeds.shape)
+ assert 2 <= shape_len <= 3
+ len_seq = seq_embeds.size(-2)
+ assert len_seq <= self.max_seq_len
+ pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
+ # Adapt pre-computed positional embeddings to the input.
+ if shape_len == 3:
+ pos_embeds = pos_embeds.view(
+ (1, pos_embeds.size(0), pos_embeds.size(1)))
+ return pos_embeds
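+
+# A minimal usage sketch, assuming the caller adds the result to its token
+# embeddings: the forward pass only *returns* the precomputed encodings.
+#
+#     pe = PositionalEmbeddingCosine1D(embed_dim=512, max_seq_len=1024)
+#     tokens = torch.zeros(4, 77, 512)  # (batch_size, seq_len, embed_dim)
+#     pe(tokens).shape                  # torch.Size([1, 77, 512]), broadcastable over the batch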
+
+
+class LearnedAbsolutePositionEmbedding1D(nn.Module):
+ """
+ Learnable absolute positional embeddings for 1D sequences.
+
+ Args:
+        embedding_dim: The dimension of the embeddings.
+        num_pos: The maximum number of positions to embed.
+ """
+    def __init__(
+        self,
+        embedding_dim: int = 512,
+        num_pos: int = 1024,
+    ) -> None:
+        super().__init__()
+ self.embeddings = nn.Embedding(num_pos, embedding_dim)
+ self.num_pos = num_pos
+
+ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ seq_embeds: The sequence embeddings in order. Allowed size:
+ 1. [T, D], where T is the length of the sequence, and D is the
+ frame embedding dimension.
+ 2. [B, T, D], where B is the batch size and T and D are the
+ same as above.
+
+        Returns a tensor with the same number of dimensions as the input: i.e.,
+        [1, T, D] or [T, D].
+ """
+ shape_len = len(seq_embeds.shape)
+ assert 2 <= shape_len <= 3
+ len_seq = seq_embeds.size(-2)
+ assert len_seq <= self.num_pos
+ # [T, D]
+ pos_embeds = self.embeddings(torch.arange(len_seq).to(seq_embeds.device))
+ # Adapt pre-computed positional embeddings to the input.
+ if shape_len == 3:
+ pos_embeds = pos_embeds.view(
+ (1, pos_embeds.size(0), pos_embeds.size(1)))
+ return pos_embeds
+
+
+class MySequential(nn.Sequential):
+ def forward(self, *inputs):
+ for module in self._modules.values():
+            if isinstance(inputs, tuple):
+ inputs = module(*inputs)
+ else:
+ inputs = module(inputs)
+ return inputs
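+
+# A minimal sketch of the tuple-threading behaviour: when a submodule returns a
+# `(tensor, size)` tuple, it is unpacked into the next submodule's arguments
+# (using the Mlp block defined further below):
+#
+#     seq = MySequential(Mlp(64), Mlp(64))
+#     x, size = seq(torch.zeros(1, 16, 64), (4, 4))  # both Mlps receive (x, size)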
+
+
+class PreNorm(nn.Module):
+ def __init__(self, norm, fn, drop_path=None):
+ super().__init__()
+ self.norm = norm
+ self.fn = fn
+ self.drop_path = drop_path
+
+ def forward(self, x, *args, **kwargs):
+ shortcut = x
+        if self.norm is not None:
+ x, size = self.fn(self.norm(x), *args, **kwargs)
+ else:
+ x, size = self.fn(x, *args, **kwargs)
+
+ if self.drop_path:
+ x = self.drop_path(x)
+
+ x = shortcut + x
+
+ return x, size
+
+
+class Mlp(nn.Module):
+ def __init__(
+ self,
+ in_features,
+ hidden_features=None,
+ out_features=None,
+ act_layer=nn.GELU,
+ ):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.net = nn.Sequential(OrderedDict([
+ ("fc1", nn.Linear(in_features, hidden_features)),
+ ("act", act_layer()),
+ ("fc2", nn.Linear(hidden_features, out_features))
+ ]))
+
+ def forward(self, x, size):
+ return self.net(x), size
+
+
+class DepthWiseConv2d(nn.Module):
+ def __init__(
+ self,
+ dim_in,
+ kernel_size,
+ padding,
+ stride,
+ bias=True,
+ ):
+ super().__init__()
+ self.dw = nn.Conv2d(
+ dim_in, dim_in,
+ kernel_size=kernel_size,
+ padding=padding,
+ groups=dim_in,
+ stride=stride,
+ bias=bias
+ )
+
+ def forward(self, x, size):
+ B, N, C = x.shape
+ H, W = size
+ assert N == H * W
+
+ x = self.dw(x.transpose(1, 2).view(B, C, H, W))
+ size = (x.size(-2), x.size(-1))
+ x = x.flatten(2).transpose(1, 2)
+ return x, size
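+
+# A minimal shape sketch: tokens are folded back onto their (H, W) grid,
+# convolved depthwise (one filter per channel), and flattened again.
+#
+#     dw = DepthWiseConv2d(dim_in=64, kernel_size=3, padding=1, stride=1)
+#     tokens = torch.zeros(2, 49, 64)  # a flattened 7x7 grid
+#     out, size = dw(tokens, (7, 7))   # out: (2, 49, 64), size: (7, 7)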
+
+
+class ConvEmbed(nn.Module):
+ """ Image to Patch Embedding
+ """
+
+ def __init__(
+ self,
+ patch_size=7,
+ in_chans=3,
+ embed_dim=64,
+ stride=4,
+ padding=2,
+ norm_layer=None,
+ pre_norm=True
+ ):
+ super().__init__()
+ self.patch_size = patch_size
+
+ self.proj = nn.Conv2d(
+ in_chans, embed_dim,
+ kernel_size=patch_size,
+ stride=stride,
+ padding=padding
+ )
+
+ dim_norm = in_chans if pre_norm else embed_dim
+ self.norm = norm_layer(dim_norm) if norm_layer else None
+
+ self.pre_norm = pre_norm
+
+ def forward(self, x, size):
+ H, W = size
+        if len(x.size()) == 3:
+            if self.norm and self.pre_norm:
+                x = self.norm(x)
+            # 'b (h w) c -> b c h w'
+            x = x.view(-1, H, W, x.size(-1)).permute(0, 3, 1, 2)
+
+        x = self.proj(x)
+
+        _, _, H, W = x.shape
+        # 'b c h w -> b (h w) c'
+        x = x.permute(0, 2, 3, 1).contiguous()
+        B, H, W, C = x.size()
+        x = x.view(B, -1, C)
+
+ if self.norm and not self.pre_norm:
+ x = self.norm(x)
+
+ return x, (H, W)
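+
+# A minimal shape sketch for the first-stage defaults (k=7, stride=4, pad=2,
+# no norm_layer): a 224x224 image becomes a 56x56 grid of 64-dim patch tokens.
+#
+#     embed = ConvEmbed(patch_size=7, in_chans=3, embed_dim=64, stride=4, padding=2)
+#     tokens, (H, W) = embed(torch.zeros(1, 3, 224, 224), (224, 224))
+#     tokens.shape, (H, W)  # torch.Size([1, 3136, 64]), (56, 56)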
+
+
+class ChannelAttention(nn.Module):
+
+ def __init__(self, dim, groups=8, qkv_bias=True):
+ super().__init__()
+
+ self.groups = groups
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.proj = nn.Linear(dim, dim)
+
+ def forward(self, x, size):
+ B, N, C = x.shape
+
+ qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ q = q * (float(N) ** -0.5)
+ attention = q.transpose(-1, -2) @ k
+ attention = attention.softmax(dim=-1)
+ x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
+ x = x.transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ return x, size
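+
+# Unlike spatial attention, the attention matrix here is formed over channels:
+# per group it is (C/groups x C/groups), so the cost grows linearly with the
+# number of tokens. A minimal shape sketch:
+#
+#     attn = ChannelAttention(dim=64, groups=8)
+#     x, size = attn(torch.zeros(2, 49, 64), (7, 7))  # x: (2, 49, 64)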
+
+
+#https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+ self.scale_by_keep = scale_by_keep
+
+ def forward(self, x):
+ if self.drop_prob == 0. or not self.training:
+ return x
+ keep_prob = 1 - self.drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0 and self.scale_by_keep:
+ random_tensor.div_(keep_prob)
+ return x * random_tensor
+
+ def extra_repr(self):
+ return f'drop_prob={round(self.drop_prob,3):0.3f}'
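+
+# A minimal sketch of the train/eval behaviour: whole samples are zeroed with
+# probability `drop_prob` during training (survivors rescaled), identity at eval.
+#
+#     dp = DropPath(drop_prob=0.1).train()
+#     y = dp(torch.ones(8, 16, 32))  # each of the 8 samples zeroed w.p. 0.1, rest scaled by 1/0.9
+#     dp.eval(); dp(torch.ones(8, 16, 32))  # returned unchanged at inference time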
+
+
+# TODO: can this be replaced with torch.nn.init.trunc_normal_?
+#https://github.com/huggingface/pytorch-image-models/blob/b28945ff056d454b174f0fb8682e362b87150141/timm/layers/weight_init.py
+def trunc_normal_(tensor: torch.Tensor, mean=0., std=1., a=-2., b=2.) -> torch.Tensor:
+ r"""Fills the input Tensor with values drawn from a truncated
+ normal distribution. The values are effectively drawn from the
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+ with values outside :math:`[a, b]` redrawn until they are within
+ the bounds. The method used for generating the random values works
+ best when :math:`a \leq \text{mean} \leq b`.
+
+ NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are
+ applied while sampling the normal with mean/std applied, therefore a, b args
+ should be adjusted to match the range of mean, std args.
+
+ Args:
+ tensor: an n-dimensional `torch.Tensor`
+ mean: the mean of the normal distribution
+ std: the standard deviation of the normal distribution
+ a: the minimum cutoff value
+ b: the maximum cutoff value
+ """
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+ def norm_cdf(x):
+ # Computes standard normal cumulative distribution function
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+ with torch.no_grad():
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+ "The distribution of values may be incorrect.",
+ stacklevel=2)
+
+ # Values are generated by using a truncated uniform distribution and
+ # then using the inverse CDF for the normal distribution.
+ # Get upper and lower cdf values
+ l = norm_cdf((a - mean) / std)
+ u = norm_cdf((b - mean) / std)
+
+ # Uniformly fill tensor with values from [l, u], then translate to
+ # [2l-1, 2u-1].
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+ # Use inverse cdf transform for normal distribution to get truncated
+ # standard normal
+ tensor.erfinv_()
+
+ # Transform to proper mean, std
+ tensor.mul_(std * math.sqrt(2.))
+ tensor.add_(mean)
+
+ # Clamp to ensure it's in the proper range
+ tensor.clamp_(min=a, max=b)
+ return tensor
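+
+# A minimal usage sketch, matching how `_init_weights` below uses it:
+#
+#     w = torch.empty(64, 64)
+#     trunc_normal_(w, std=0.02)  # in-place; samples N(0, 0.02^2), clamped to [-2, 2]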
+
+
+class ChannelBlock(nn.Module):
+
+ def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True,
+ drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+ conv_at_attn=True, conv_at_ffn=True):
+ super().__init__()
+
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+
+ self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
+ self.channel_attn = PreNorm(
+ norm_layer(dim),
+ ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
+ drop_path
+ )
+ self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
+ self.ffn = PreNorm(
+ norm_layer(dim),
+ Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
+ drop_path
+ )
+
+ def forward(self, x, size):
+ if self.conv1:
+ x, size = self.conv1(x, size)
+ x, size = self.channel_attn(x, size)
+
+ if self.conv2:
+ x, size = self.conv2(x, size)
+ x, size = self.ffn(x, size)
+
+ return x, size
+
+
+def window_partition(x, window_size: int):
+ B, H, W, C = x.shape
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+ return windows
+
+
+def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
+ B = batch_size
+    # Deriving the batch size as `int(windows.shape[0] / (H * W / window_size / window_size))`
+    # would break ONNX export with dynamic axes, because the value is traced as a
+    # constant; the batch size is therefore passed in explicitly.
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+ return x
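+
+# A minimal round-trip sketch: partitioning into non-overlapping windows and
+# reversing recovers the original tensor exactly (when H and W divide evenly).
+#
+#     x = torch.randn(2, 14, 14, 32)
+#     wins = window_partition(x, 7)                        # (8, 7, 7, 32): 2 batches x 4 windows
+#     torch.equal(window_reverse(wins, 2, 7, 14, 14), x)   # True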
+
+
+class WindowAttention(nn.Module):
+    def __init__(self, dim, num_heads, window_size, qkv_bias=True):
+        super().__init__()
+ self.dim = dim
+ self.window_size = window_size
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = float(head_dim) ** -0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.proj = nn.Linear(dim, dim)
+
+ self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, size):
+        H, W = size
+ B, L, C = x.shape
+ assert L == H * W, "input feature has wrong size"
+
+ x = x.view(B, H, W, C)
+
+ pad_l = pad_t = 0
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+ _, Hp, Wp, _ = x.shape
+
+ x = window_partition(x, self.window_size)
+ x = x.view(-1, self.window_size * self.window_size, C)
+
+        # window-based multi-head self-attention (W-MSA)
+
+ B_, N, C = x.shape
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ q = q * self.scale
+ attn = (q @ k.transpose(-2, -1))
+ attn = self.softmax(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+ x = self.proj(x)
+
+ # merge windows
+ x = x.view(
+ -1, self.window_size, self.window_size, C
+ )
+ x = window_reverse(x, B, self.window_size, Hp, Wp)
+
+ if pad_r > 0 or pad_b > 0:
+ x = x[:, :H, :W, :].contiguous()
+
+ x = x.view(B, H * W, C)
+
+ return x, size
+
+
+class SpatialBlock(nn.Module):
+
+ def __init__(self, dim, num_heads, window_size,
+ mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
+ norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
+ super().__init__()
+
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+
+ self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
+ self.window_attn = PreNorm(
+ norm_layer(dim),
+ WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
+ drop_path
+ )
+ self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
+ self.ffn = PreNorm(
+ norm_layer(dim),
+ Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
+ drop_path
+ )
+
+ def forward(self, x, size):
+ if self.conv1:
+ x, size = self.conv1(x, size)
+ x, size = self.window_attn(x, size)
+
+ if self.conv2:
+ x, size = self.conv2(x, size)
+ x, size = self.ffn(x, size)
+ return x, size
+
+
+class DaViT(nn.Module):
+ """ DaViT: Dual-Attention Transformer
+
+ Args:
+ in_chans (int): Number of input image channels. Default: 3.
+        num_classes (int): Number of classes for classification head. Default: 1000.
+        depths (tuple(int)): Number of spatial/channel block pairs in each stage. Default: (1, 1, 3, 1).
+        patch_size (tuple(int)): Patch size of convolution in different stages. Default: (7, 2, 2, 2).
+        patch_stride (tuple(int)): Patch stride of convolution in different stages. Default: (4, 2, 2, 2).
+        patch_padding (tuple(int)): Patch padding of convolution in different stages. Default: (3, 0, 0, 0).
+        patch_prenorm (tuple(bool)): If True, perform norm before the convolution layer. Default: (False, False, False, False).
+        embed_dims (tuple(int)): Patch embedding dimension in different stages. Default: (64, 128, 192, 256).
+        num_heads (tuple(int)): Number of spatial attention heads in different stages. Default: (3, 6, 12, 24).
+        num_groups (tuple(int)): Number of channel groups in different stages. Default: (3, 6, 12, 24).
+ window_size (int): Window size. Default: 7.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True.
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1.
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+ enable_checkpoint (bool): If True, enable checkpointing. Default: False.
+        conv_at_attn (bool): If True, perform a depthwise convolution before the attention layer. Default: True.
+        conv_at_ffn (bool): If True, perform a depthwise convolution before the ffn layer. Default: True.
+ """
+
+ def __init__(
+ self,
+ in_chans=3,
+ num_classes=1000,
+ depths=(1, 1, 3, 1),
+ patch_size=(7, 2, 2, 2),
+ patch_stride=(4, 2, 2, 2),
+ patch_padding=(3, 0, 0, 0),
+ patch_prenorm=(False, False, False, False),
+ embed_dims=(64, 128, 192, 256),
+ num_heads=(3, 6, 12, 24),
+ num_groups=(3, 6, 12, 24),
+ window_size=7,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ drop_path_rate=0.1,
+ norm_layer=nn.LayerNorm,
+ enable_checkpoint=False,
+ conv_at_attn=True,
+ conv_at_ffn=True,
+ ):
+ super().__init__()
+
+ self.num_classes = num_classes
+ self.embed_dims = embed_dims
+ self.num_heads = num_heads
+ self.num_groups = num_groups
+ self.num_stages = len(self.embed_dims)
+ self.enable_checkpoint = enable_checkpoint
+ assert self.num_stages == len(self.num_heads) == len(self.num_groups)
+
+ num_stages = len(embed_dims)
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
+
+ depth_offset = 0
+ convs = []
+ blocks = []
+ for i in range(num_stages):
+ conv_embed = ConvEmbed(
+ patch_size=patch_size[i],
+ stride=patch_stride[i],
+ padding=patch_padding[i],
+ in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
+ embed_dim=self.embed_dims[i],
+ norm_layer=norm_layer,
+ pre_norm=patch_prenorm[i]
+ )
+ convs.append(conv_embed)
+
+ block = MySequential(
+ *[
+ MySequential(OrderedDict([
+ (
+ 'spatial_block', SpatialBlock(
+ embed_dims[i],
+ num_heads[i],
+ window_size,
+ drop_path_rate=dpr[depth_offset+j*2],
+ qkv_bias=qkv_bias,
+ mlp_ratio=mlp_ratio,
+ conv_at_attn=conv_at_attn,
+ conv_at_ffn=conv_at_ffn,
+ )
+ ),
+ (
+ 'channel_block', ChannelBlock(
+ embed_dims[i],
+ num_groups[i],
+ drop_path_rate=dpr[depth_offset+j*2+1],
+ qkv_bias=qkv_bias,
+ mlp_ratio=mlp_ratio,
+ conv_at_attn=conv_at_attn,
+ conv_at_ffn=conv_at_ffn,
+ )
+ )
+ ])) for j in range(depths[i])
+ ]
+ )
+ blocks.append(block)
+ depth_offset += depths[i]*2
+
+ self.convs = nn.ModuleList(convs)
+ self.blocks = nn.ModuleList(blocks)
+
+ self.norms = norm_layer(self.embed_dims[-1])
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
+ self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
+
+ self.apply(self._init_weights)
+
+ @property
+ def dim_out(self):
+ return self.embed_dims[-1]
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=0.02)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Conv2d):
+ nn.init.normal_(m.weight, std=0.02)
+ for name, _ in m.named_parameters():
+ if name in ['bias']:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.weight, 1.0)
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1.0)
+ nn.init.constant_(m.bias, 0)
+
+ def forward_features_unpool(self, x):
+ """
+        Forward pass up to (but not including) the average pooling.
+
+        Args:
+            x (torch.Tensor): input image tensor of shape (batch_size, channels, height, width)
+ """
+ input_size = (x.size(2), x.size(3))
+ for conv, block in zip(self.convs, self.blocks):
+ x, input_size = conv(x, input_size)
+ if self.enable_checkpoint:
+ x, input_size = checkpoint.checkpoint(block, x, input_size)
+ else:
+ x, input_size = block(x, input_size)
+ return x
+
+ def forward_features(self, x):
+ x = self.forward_features_unpool(x)
+
+        # (batch_size, num_tokens, token_dim) -> (batch_size, token_dim, 1)
+        x = self.avgpool(x.transpose(1, 2))
+        # (batch_size, token_dim)
+        x = torch.flatten(x, 1)
+ x = self.norms(x)
+
+ return x
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ x = self.head(x)
+ return x
+
+ @classmethod
+ def from_config(cls, config):
+ return cls(
+ depths=config.depths,
+ embed_dims=config.dim_embed,
+ num_heads=config.num_heads,
+ num_groups=config.num_groups,
+ patch_size=config.patch_size,
+ patch_stride=config.patch_stride,
+ patch_padding=config.patch_padding,
+ patch_prenorm=config.patch_prenorm,
+ drop_path_rate=config.drop_path_rate,
+ window_size=config.window_size,
+ )
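+
+# A minimal end-to-end sketch with this file's defaults (total stride 32, so a
+# 224x224 image yields a 7x7 = 49-token feature map of width 256):
+#
+#     model = DaViT()
+#     model.forward_features_unpool(torch.zeros(1, 3, 224, 224)).shape  # torch.Size([1, 49, 256])
+#     model(torch.zeros(1, 3, 224, 224)).shape                          # torch.Size([1, 1000]) class logits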
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
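+
+# A minimal sketch on a toy padding mask (1 = real token, 0 = padding):
+#
+#     mask = torch.tensor([[1, 1, 0], [1, 1, 1]])
+#     indices, cu_seqlens, max_len = _get_unpad_data(mask)
+#     # indices: tensor([0, 1, 3, 4, 5]); cu_seqlens: tensor([0, 2, 5], dtype=torch.int32); max_len: 3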
+
+
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+ """
+ Shift input ids one token to the right.
+ """
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+ shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+ shifted_input_ids[:, 0] = decoder_start_token_id
+
+ if pad_token_id is None:
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
+ # replace possible -100 values in labels by `pad_token_id`
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+ return shifted_input_ids
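+
+# A minimal sketch: the decoder start token is prepended, the last label is
+# dropped, and any -100 (ignored label position) becomes the pad token.
+#
+#     labels = torch.tensor([[5, 6, -100, -100]])
+#     shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
+#     # tensor([[2, 5, 6, 1]])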
+
+
+class Florence2LearnedPositionalEmbedding(nn.Embedding):
+ """
+ This module learns positional embeddings up to a fixed maximum size.
+ """
+
+ def __init__(self, num_embeddings: int, embedding_dim: int):
+ # Florence2 is set up so that if padding_idx is specified then offset the embedding ids by 2
+ # and adjust num_embeddings appropriately. Other models don't have this hack
+ self.offset = 2
+ super().__init__(num_embeddings + self.offset, embedding_dim)
+
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        """`input_ids` shape is expected to be [bsz x seqlen]."""
+
+ bsz, seq_len = input_ids.shape[:2]
+ positions = torch.arange(
+ past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+ ).expand(bsz, -1)
+
+ return super().forward(positions + self.offset)
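+
+# A minimal sketch of the offset: the embedding table is allocated with two
+# extra rows, and lookups are shifted by 2 accordingly.
+#
+#     pos = Florence2LearnedPositionalEmbedding(num_embeddings=1024, embedding_dim=768)
+#     pos.weight.shape                                 # torch.Size([1026, 768])
+#     pos(torch.zeros(2, 5, dtype=torch.long)).shape   # torch.Size([2, 5, 768]); rows 2..6 are used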
+
+
+class Florence2ScaledWordEmbedding(nn.Embedding):
+ """
+    This module overrides nn.Embedding's forward by multiplying the embeddings by a constant scale.
+ """
+
+ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
+ super().__init__(num_embeddings, embedding_dim, padding_idx)
+ self.embed_scale = embed_scale
+
+ def forward(self, input_ids: torch.Tensor):
+ return super().forward(input_ids) * self.embed_scale
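+
+# A minimal sketch: with `scale_embedding` enabled in the config, the encoder
+# and decoder below pass `embed_scale = sqrt(d_model)`, so every lookup is scaled:
+#
+#     emb = Florence2ScaledWordEmbedding(100, 64, padding_idx=1, embed_scale=8.0)
+#     # emb(ids) equals the plain nn.Embedding lookup scaled by 8.0 (= sqrt(64))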
+
+
+class Florence2Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ is_decoder: bool = False,
+ bias: bool = True,
+ is_causal: bool = False,
+ config: Optional[Florence2LanguageConfig] = None,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ self.config = config
+
+ if (self.head_dim * num_heads) != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+ f" and `num_heads`: {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+ self.is_decoder = is_decoder
+ self.is_causal = is_causal
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states) * self.scaling
+ # get key, value proj
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_value is not None
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+ key_states = key_states.reshape(*proj_shape)
+ value_states = value_states.reshape(*proj_shape)
+
+ src_len = key_states.size(1)
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if layer_head_mask is not None:
+ if layer_head_mask.size() != (self.num_heads,):
+ raise ValueError(
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+ # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped, past_key_value
+
+
+class Florence2FlashAttention2(Florence2Attention):
+ """
+ Florence2 flash attention module. This module inherits from `Florence2Attention` as the weights of the module stays
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+ def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Florence2FlashAttention2 attention does not support output_attentions
+ if output_attentions:
+ raise ValueError("Florence2FlashAttention2 attention does not support output_attentions")
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, q_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self._reshape(self.q_proj(hidden_states), -1, bsz)
+ # get key, value proj
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_value is not None
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0].transpose(1, 2)
+ value_states = past_key_value[1].transpose(1, 2)
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
+ value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
+ else:
+ # self_attention
+ key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
+
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
+ # cast them back in the correct dtype just to be sure everything works as expected.
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+ # in fp32. (LlamaRMSNorm handles it correctly)
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = self._flash_attention_forward(
+ query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout
+ )
+
+ attn_output = attn_output.reshape(bsz, q_len, -1)
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+ def _flash_attention_forward(
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+ ):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+ first unpad the input, then computes the attention scores and pad the final attention scores.
+
+ Args:
+ query_states (`torch.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`torch.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`torch.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`torch.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ dropout (`float`):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+ """
+ if not self._flash_attn_uses_top_left_mask:
+ causal = self.is_causal
+ else:
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+ causal = self.is_causal and query_length != 1
+
+ # Contains at least one padding token in the sequence
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+ query_states, key_states, value_states, attention_mask, query_length
+ )
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ else:
+ attn_output = flash_attn_func(
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+ )
+
+ return attn_output
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+ key_layer = index_first_axis(
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ value_layer = index_first_axis(
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ if query_length == kv_seq_len:
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+ )
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, that is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q,
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+
+class Florence2SdpaAttention(Florence2Attention):
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+ if output_attentions or layer_head_mask is not None:
+ # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "Florence2Model is using Florence2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+ ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states,
+ key_value_states=key_value_states,
+ past_key_value=past_key_value,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states)
+ # get key, value proj
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_value is not None
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ query_states = self._shape(query_states, tgt_len, bsz)
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
+ # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+ # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_mask=attention_mask,
+ dropout_p=self.dropout if self.training else 0.0,
+ is_causal=is_causal,
+ )
+
+ if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2)
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+ # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, None, past_key_value
+
+
+FLORENCE2_ATTENTION_CLASSES = {
+ "eager": Florence2Attention,
+ "sdpa": Florence2SdpaAttention,
+ "flash_attention_2": Florence2FlashAttention2,
+}
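+
+# The encoder/decoder layers below select the attention class from the config's
+# resolved `_attn_implementation`, e.g.:
+#
+#     FLORENCE2_ATTENTION_CLASSES["sdpa"]  # -> Florence2SdpaAttention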
+
+
+class Florence2EncoderLayer(nn.Module):
+ def __init__(self, config: Florence2LanguageConfig):
+ super().__init__()
+ self.embed_dim = config.d_model
+
+ self.self_attn = FLORENCE2_ATTENTION_CLASSES[config._attn_implementation](
+ embed_dim=self.embed_dim,
+ num_heads=config.encoder_attention_heads,
+ dropout=config.attention_dropout,
+ config=config,
+ )
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ attention_mask: torch.FloatTensor,
+ layer_head_mask: torch.FloatTensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+ hidden_states, attn_weights, _ = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ if hidden_states.dtype == torch.float16 and (
+ torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+ ):
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class Florence2DecoderLayer(nn.Module):
+ def __init__(self, config: Florence2LanguageConfig):
+ super().__init__()
+ self.embed_dim = config.d_model
+
+ self.self_attn = FLORENCE2_ATTENTION_CLASSES[config._attn_implementation](
+ embed_dim=self.embed_dim,
+ num_heads=config.decoder_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=True,
+ is_causal=True,
+ config=config,
+ )
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ self.encoder_attn = FLORENCE2_ATTENTION_CLASSES[config._attn_implementation](
+ self.embed_dim,
+ config.decoder_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=True,
+ config=config,
+ )
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = True,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`):
+ cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size `(decoder_attention_heads,)`.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Self Attention
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+ # add present self-attn cache to positions 1,2 of present_key_value tuple
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ past_key_value=self_attn_past_key_value,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ # Cross-Attention Block
+ cross_attn_present_key_value = None
+ cross_attn_weights = None
+ if encoder_hidden_states is not None:
+ residual = hidden_states
+
+ # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+ hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+ hidden_states=hidden_states,
+ key_value_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask,
+ layer_head_mask=cross_attn_layer_head_mask,
+ past_key_value=cross_attn_past_key_value,
+ output_attentions=output_attentions,
+ )
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+ # add cross-attn to positions 3,4 of present_key_value tuple
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights, cross_attn_weights)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+class Florence2LanguagePreTrainedModel(PreTrainedModel):
+ config_class = Florence2LanguageConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
+ _no_split_modules = [r"Florence2EncoderLayer", r"Florence2DecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+
+ def _init_weights(self, module):
+ std = self.config.init_std
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ @property
+ def dummy_inputs(self):
+ pad_token = self.config.pad_token_id
+ input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
+ dummy_inputs = {
+ "attention_mask": input_ids.ne(pad_token),
+ "input_ids": input_ids,
+ }
+ return dummy_inputs
+
+
+class Florence2Encoder(Florence2LanguagePreTrainedModel):
+ """
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+ [`Florence2EncoderLayer`].
+
+ Args:
+ config: Florence2LanguageConfig
+ embed_tokens (nn.Embedding): output embedding
+ """
+
+ def __init__(self, config: Florence2LanguageConfig, embed_tokens: Optional[nn.Embedding] = None):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.layerdrop = config.encoder_layerdrop
+
+ embed_dim = config.d_model
+ self.padding_idx = config.pad_token_id
+ self.max_source_positions = config.max_position_embeddings
+ embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+ self.embed_tokens = Florence2ScaledWordEmbedding(
+ config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
+ )
+
+ if embed_tokens is not None:
+ self.embed_tokens.weight = embed_tokens.weight
+
+ self.embed_positions = Florence2LearnedPositionalEmbedding(
+ config.max_position_embeddings,
+ embed_dim,
+ )
+ self.layers = nn.ModuleList([Florence2EncoderLayer(config) for _ in range(config.encoder_layers)])
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+ self._use_sdpa = config._attn_implementation == "sdpa"
+ self.layernorm_embedding = nn.LayerNorm(embed_dim)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+ provide it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input = input_ids
+ input_ids = input_ids.view(-1, input_ids.shape[-1])
+ elif inputs_embeds is not None:
+ input = inputs_embeds[:, :, -1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ embed_pos = self.embed_positions(input)
+ embed_pos = embed_pos.to(inputs_embeds.device)
+
+ hidden_states = inputs_embeds + embed_pos
+ hidden_states = self.layernorm_embedding(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ # expand attention_mask
+ if attention_mask is not None:
+ if self._use_flash_attention_2:
+ attention_mask = attention_mask if 0 in attention_mask else None
+ elif self._use_sdpa and head_mask is None and not output_attentions:
+ # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+ # the manual implementation that requires a 4D causal mask in all cases.
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+ else:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ # check if head_mask has a correct number of layers specified if desired
+ if head_mask is not None:
+ if head_mask.size()[0] != (len(self.layers)):
+ raise ValueError(
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+ f" {head_mask.size()[0]}."
+ )
+
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ to_drop = False
+ if self.training:
+ dropout_probability = torch.rand([])
+ if dropout_probability < self.layerdrop: # skip the layer
+ to_drop = True
+
+ if to_drop:
+ layer_outputs = (None, None)
+ else:
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ encoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ (head_mask[idx] if head_mask is not None else None),
+ output_attentions,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+class Florence2Decoder(Florence2LanguagePreTrainedModel):
+ """
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Florence2DecoderLayer`]
+
+ Args:
+ config: Florence2LanguageConfig
+ embed_tokens (nn.Embedding): output embedding
+ """
+
+ def __init__(self, config: Florence2LanguageConfig, embed_tokens: Optional[nn.Embedding] = None):
+ super().__init__(config)
+ self.dropout = config.dropout
+ self.layerdrop = config.decoder_layerdrop
+ self.padding_idx = config.pad_token_id
+ self.max_target_positions = config.max_position_embeddings
+ embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+ self.embed_tokens = Florence2ScaledWordEmbedding(
+ config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
+ )
+
+ if embed_tokens is not None:
+ self.embed_tokens.weight = embed_tokens.weight
+
+ self.embed_positions = Florence2LearnedPositionalEmbedding(
+ config.max_position_embeddings,
+ config.d_model,
+ )
+ self.layers = nn.ModuleList([Florence2DecoderLayer(config) for _ in range(config.decoder_layers)])
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+ self._use_sdpa = config._attn_implementation == "sdpa"
+
+ self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+ r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+ provide it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+ selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ input = input_ids
+ input_shape = input.shape
+ input_ids = input_ids.view(-1, input_shape[-1])
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ input = inputs_embeds[:, :, -1]
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ # past_key_values_length
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input)
+
+ if self._use_flash_attention_2:
+ # 2d mask is passed through the layers
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+ # the manual implementation that requires a 4D causal mask in all cases.
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ input_shape,
+ inputs_embeds,
+ past_key_values_length,
+ )
+ else:
+ # 4d mask is passed through the layers
+ attention_mask = _prepare_4d_causal_attention_mask(
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
+ )
+
+ # expand encoder attention mask
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
+ if self._use_flash_attention_2:
+ encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+ elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+ # the manual implementation that requires a 4D causal mask in all cases.
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+ encoder_attention_mask,
+ inputs_embeds.dtype,
+ tgt_len=input_shape[-1],
+ )
+ else:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ encoder_attention_mask = _prepare_4d_attention_mask(
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+ )
+
+ # embed positions
+ positions = self.embed_positions(input, past_key_values_length)
+ positions = positions.to(inputs_embeds.device)
+
+ hidden_states = inputs_embeds + positions
+ hidden_states = self.layernorm_embedding(hidden_states)
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ next_decoder_cache = () if use_cache else None
+
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+ if attn_mask is not None:
+ if attn_mask.size()[0] != (len(self.layers)):
+ raise ValueError(
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                    f" {attn_mask.size()[0]}."
+ )
+
+ for idx, decoder_layer in enumerate(self.layers):
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+ if self.training:
+ dropout_probability = torch.rand([])
+ if dropout_probability < self.layerdrop:
+ continue
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ head_mask[idx] if head_mask is not None else None,
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+ None,
+ output_attentions,
+ use_cache,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+ cross_attn_layer_head_mask=(
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+ ),
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if encoder_hidden_states is not None:
+ all_cross_attentions += (layer_outputs[2],)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(
+ v
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
+ _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
+ def __init__(self, config: Florence2LanguageConfig):
+ super().__init__(config)
+
+ padding_idx, vocab_size = config.pad_token_id, config.vocab_size
+ self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
+
+ self.encoder = Florence2Encoder(config, self.shared)
+ self.decoder = Florence2Decoder(config, self.shared)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
+
+ def get_input_embeddings(self):
+ return self.shared
+
+ def set_input_embeddings(self, value):
+ self.shared = value
+ self.encoder.embed_tokens = self.shared
+ self.decoder.embed_tokens = self.shared
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ decoder_head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Seq2SeqModelOutput]:
+        # Unlike other models, Florence2 automatically creates decoder_input_ids from
+        # input_ids if no decoder_input_ids are provided
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
+ if input_ids is None:
+ raise ValueError(
+ "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
+ "passed, `input_ids` cannot be `None`. Please pass either "
+ "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
+ )
+
+ decoder_input_ids = shift_tokens_right(
+ input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+ decoder_outputs = self.decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=attention_mask,
+ head_mask=decoder_head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ return decoder_outputs + encoder_outputs
+
+ return Seq2SeqModelOutput(
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+
+class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel):
+ base_model_prefix = "model"
+ _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
+ _keys_to_ignore_on_load_missing = ["final_logits_bias"]
+
+ def __init__(self, config: Florence2LanguageConfig):
+ super().__init__(config)
+ self.model = Florence2LanguageModel(config)
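+        # BART-style bias added to the LM logits in `forward`; registered as a buffer (initialized to zeros)
+        # rather than a learned parameter, and resized together with the vocabulary in `_resize_final_logits_bias`.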
+ self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
+ self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_encoder(self):
+ return self.model.get_encoder()
+
+ def get_decoder(self):
+ return self.model.get_decoder()
+
+ def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
+ new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+ self._resize_final_logits_bias(new_embeddings.weight.shape[0])
+ return new_embeddings
+
+ def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
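+        # Keep the logits bias buffer in sync with the resized vocabulary: truncate when shrinking, pad with
+        # zeros when growing.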
+ old_num_tokens = self.final_logits_bias.shape[-1]
+ if new_num_tokens <= old_num_tokens:
+ new_bias = self.final_logits_bias[:, :new_num_tokens]
+ else:
+ extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
+ new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
+ self.register_buffer("final_logits_bias", new_bias)
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ decoder_head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Seq2SeqLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
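+
+        Example (illustrative sketch only; it runs a full encoder-decoder forward pass on a randomly initialized
+        model, and the tiny configuration values and BART-style argument names below are assumptions rather than
+        a released checkpoint):
+
+        ```python
+        >>> import torch
+
+        >>> from transformers.models.florence.configuration_florence2 import Florence2LanguageConfig
+        >>> from transformers.models.florence.modeling_florence2 import Florence2LanguageForConditionalGeneration
+
+        >>> config = Florence2LanguageConfig(
+        ...     vocab_size=128, d_model=64, encoder_layers=2, decoder_layers=2,
+        ...     encoder_attention_heads=4, decoder_attention_heads=4, encoder_ffn_dim=128, decoder_ffn_dim=128,
+        ...     pad_token_id=1, decoder_start_token_id=2,
+        ... )
+        >>> model = Florence2LanguageForConditionalGeneration(config)
+        >>> input_ids = torch.randint(0, config.vocab_size, (1, 8))
+        >>> outputs = model(input_ids=input_ids, labels=input_ids)
+        >>> logits = outputs.logits  # (batch_size, sequence_length, vocab_size)
+        ```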
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if labels is not None:
+ if use_cache:
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+ use_cache = False
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
+ decoder_input_ids = shift_tokens_right(
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+
+ outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ decoder_input_ids=decoder_input_ids,
+ encoder_outputs=encoder_outputs,
+ decoder_attention_mask=decoder_attention_mask,
+ head_mask=head_mask,
+ decoder_head_mask=decoder_head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ decoder_inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ lm_logits = self.lm_head(outputs[0])
+ lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)
+
+ masked_lm_loss = None
+ if labels is not None:
+ labels = labels.to(lm_logits.device)
+ loss_fct = CrossEntropyLoss()
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (lm_logits,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return Seq2SeqLMOutput(
+ loss=masked_lm_loss,
+ logits=lm_logits,
+ past_key_values=outputs.past_key_values,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ cross_attentions=outputs.cross_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ decoder_input_ids,
+ past_key_values=None,
+ attention_mask=None,
+ decoder_attention_mask=None,
+ head_mask=None,
+ decoder_head_mask=None,
+ cross_attn_head_mask=None,
+ use_cache=None,
+ encoder_outputs=None,
+ **kwargs,
+ ):
+ # cut decoder_input_ids if past_key_values is used
+ if past_key_values is not None:
+ past_length = past_key_values[0][0].shape[2]
+
+ # Some generation methods already pass only the last input ID
+ if decoder_input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = decoder_input_ids.shape[1] - 1
+
+ decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
+
+ return {
+ "input_ids": None, # encoder_outputs is defined. input_ids not needed
+ "encoder_outputs": encoder_outputs,
+ "past_key_values": past_key_values,
+ "decoder_input_ids": decoder_input_ids,
+ "attention_mask": attention_mask,
+ "decoder_attention_mask": decoder_attention_mask,
+ "head_mask": head_mask,
+ "decoder_head_mask": decoder_head_mask,
+ "cross_attn_head_mask": cross_attn_head_mask,
+ "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
+ }
+
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+ return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ # cached cross_attention states don't have to be reordered -> they are always the same
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ + layer_past[2:],
+ )
+ return reordered_past
+
+
+@dataclass
+class Florence2Seq2SeqLMOutput(ModelOutput):
+ """
+    Base class for Florence-2 model outputs that also contains pre-computed hidden states that can speed up
+    sequential decoding.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_image_tokens, hidden_size)`, *optional*):
+            Image hidden states produced by the vision encoder and projected into the language model's embedding
+            space.
+ """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+FLORENCE2_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`Florence2Config`] or [`Florence2VisionConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare Florence-2 Model outputting raw hidden-states without any specific head on top.",
+ FLORENCE2_START_DOCSTRING,
+)
+class Florence2PreTrainedModel(PreTrainedModel):
+ config_class = Florence2Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _skip_keys_device_placement = "past_key_values"
+
+ @property
+ def _supports_flash_attn_2(self):
+ """
+ Retrieve language_model's attribute to check whether the model supports
+ Flash Attention 2 or not.
+ """
+ return self.language_model._supports_flash_attn_2
+
+ @property
+ def _supports_sdpa(self):
+ """
+ Retrieve language_model's attribute to check whether the model supports
+ SDPA or not.
+ """
+ return self.language_model._supports_sdpa
+
+
+FLORENCE2_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([`Florence2Processor`] uses
+            [`CLIPImageProcessor`] for processing images).
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ """The FLORENCE2 vision model without any head""",
+ FLORENCE2_START_DOCSTRING,
+)
+class Florence2VisionModel(Florence2PreTrainedModel):
+ def __init__(self, config: Florence2VisionConfig):
+ super().__init__(config)
+ assert config.model_type == 'davit', 'only DaViT is supported for now'
+ self.vision_tower = DaViT.from_config(config=config)
+
+ self.post_init()
+
+ def forward(self, pixel_values):
+ if len(pixel_values.shape) == 4:
+ x = self.vision_tower.forward_features_unpool(pixel_values)
+ else:
+ raise ValueError(f'invalid image shape {pixel_values.shape}')
+ return x
+
+
+@add_start_docstrings(
+ """The FLORENCE2 vision model with projection layer""",
+ FLORENCE2_START_DOCSTRING,
+)
+class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
+ def __init__(self, config: Florence2VisionConfig):
+ super().__init__(config)
+ assert config.model_type == 'davit', 'only DaViT is supported for now'
+ self.vision_tower = DaViT.from_config(config=config)
+
+ self._build_image_projection_layers(config)
+
+ self.post_init()
+
+ def _build_image_projection_layers(self, config):
+ image_dim_out = config.dim_embed[-1]
+ dim_projection = config.projection_dim
+ self.image_projection = nn.Parameter(
+ torch.empty(image_dim_out, dim_projection)
+ )
+ self.image_proj_norm = nn.LayerNorm(dim_projection)
+ image_pos_embed_config = config.image_pos_embed
+ if image_pos_embed_config['type'] == 'learned_abs_2d':
+ self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
+ embedding_dim=image_dim_out,
+ num_pos=image_pos_embed_config['max_pos_embeddings']
+ )
+ else:
+ raise NotImplementedError('Not implemented yet')
+
+ self.image_feature_source = config.image_feature_source
+
+ # temporal embedding
+ visual_temporal_embedding_config = config.visual_temporal_embedding
+ if visual_temporal_embedding_config['type'] == 'COSINE':
+ self.visual_temporal_embed = PositionalEmbeddingCosine1D(
+ embed_dim=image_dim_out,
+ max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
+ )
+ else:
+ raise NotImplementedError('Not implemented yet')
+
+ def forward(self, pixel_values):
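+        # Returns projected visual features of shape (batch_size, num_image_tokens, projection_dim).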
+ if len(pixel_values.shape) == 4:
+ batch_size, C, H, W = pixel_values.shape
+ T = 1
+ x = self.vision_tower.forward_features_unpool(pixel_values)
+ else:
+ raise ValueError(f'invalid image shape {pixel_values.shape}')
+
+ if self.image_pos_embed is not None:
+ x = x.view(batch_size * T, -1, x.shape[-1])
+ num_tokens = x.shape[-2]
+ h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
+ assert h * w == num_tokens, 'only support square feature maps for now'
+ x = x.view(batch_size * T, h, w, x.shape[-1])
+ pos_embed = self.image_pos_embed(x)
+ x = x + pos_embed
+            x = x.view(batch_size, T * h * w, x.shape[-1])
+
+ if self.visual_temporal_embed is not None:
+ visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
+ x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
+
+ x_feat_dict = {}
+
+ spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
+ x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
+
+ temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
+ x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
+
+ x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
+ x_feat_dict['last_frame'] = x
+
+ new_x = []
+ for _image_feature_source in self.image_feature_source:
+ if _image_feature_source not in x_feat_dict:
+ raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
+ new_x.append(x_feat_dict[_image_feature_source])
+
+ x = torch.cat(new_x, dim=1)
+
+ x = x @ self.image_projection
+ x = self.image_proj_norm(x)
+
+        return x
+
+
+@add_start_docstrings(
+ """The FLORENCE2 model which consists of a vision backbone and a language model.""",
+ FLORENCE2_START_DOCSTRING,
+)
+class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
+ def __init__(self, config: Florence2Config):
+ super().__init__(config)
+ assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
+ del config.vision_config.model_type
+ self.vision_tower = DaViT.from_config(config=config.vision_config)
+ # remove unused layers
+ del self.vision_tower.head
+ del self.vision_tower.norms
+
+ self.vocab_size = config.vocab_size
+ self._attn_implementation = config._attn_implementation
+ self._build_image_projection_layers(config)
+
+ language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
+
+ if language_model._tied_weights_keys is not None:
+ self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
+ self.language_model = language_model
+
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+ self.post_init()
+
+ def _build_image_projection_layers(self, config):
+ image_dim_out = config.vision_config.dim_embed[-1]
+ dim_projection = config.vision_config.projection_dim
+ self.image_projection = nn.Parameter(
+ torch.empty(image_dim_out, dim_projection)
+ )
+ self.image_proj_norm = nn.LayerNorm(dim_projection)
+ image_pos_embed_config = config.vision_config.image_pos_embed
+ if image_pos_embed_config['type'] == 'learned_abs_2d':
+ self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
+ embedding_dim=image_dim_out,
+ num_pos=image_pos_embed_config['max_pos_embeddings']
+ )
+ else:
+ raise NotImplementedError('Not implemented yet')
+
+ self.image_feature_source = config.vision_config.image_feature_source
+
+ # temporal embedding
+ visual_temporal_embedding_config = config.vision_config.visual_temporal_embedding
+ if visual_temporal_embedding_config['type'] == 'COSINE':
+ self.visual_temporal_embed = PositionalEmbeddingCosine1D(
+ embed_dim=image_dim_out,
+ max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
+ )
+ else:
+ raise NotImplementedError('Not implemented yet')
+
+ def get_encoder(self):
+ return self.language_model.get_encoder()
+
+ def get_decoder(self):
+ return self.language_model.get_decoder()
+
+ def get_input_embeddings(self):
+ return self.language_model.get_input_embeddings()
+
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+ model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+ # update vocab size
+ self.config.text_config.vocab_size = model_embeds.num_embeddings
+ self.config.vocab_size = model_embeds.num_embeddings
+ self.vocab_size = model_embeds.num_embeddings
+ return model_embeds
+
+ def _encode_image(self, pixel_values):
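+        # Visual feature pipeline: DaViT features -> learned 2D position embedding over the (square) token grid ->
+        # cosine temporal embedding (T=1 for single images) -> pooled feature sources listed in
+        # `image_feature_source` -> linear projection (`image_projection`) and layer norm (`image_proj_norm`).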
+ if len(pixel_values.shape) == 4:
+ batch_size, C, H, W = pixel_values.shape
+ T = 1
+ x = self.vision_tower.forward_features_unpool(pixel_values)
+ else:
+ raise ValueError(f'invalid image shape {pixel_values.shape}')
+
+ if self.image_pos_embed is not None:
+ x = x.view(batch_size * T, -1, x.shape[-1])
+ num_tokens = x.shape[-2]
+ h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
+ assert h * w == num_tokens, 'only support square feature maps for now'
+ x = x.view(batch_size * T, h, w, x.shape[-1])
+ pos_embed = self.image_pos_embed(x)
+ x = x + pos_embed
+            x = x.view(batch_size, T * h * w, x.shape[-1])
+
+ if self.visual_temporal_embed is not None:
+ visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
+ x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
+
+ x_feat_dict = {}
+
+ spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
+ x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
+
+ temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
+ x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
+
+ x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
+ x_feat_dict['last_frame'] = x
+
+ new_x = []
+ for _image_feature_source in self.image_feature_source:
+ if _image_feature_source not in x_feat_dict:
+ raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
+ new_x.append(x_feat_dict[_image_feature_source])
+
+ x = torch.cat(new_x, dim=1)
+
+ x = x @ self.image_projection
+ x = self.image_proj_norm(x)
+
+ return x
+
+ def _merge_input_ids_with_image_features(
+ self, image_features, inputs_embeds
+ ):
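+        # Prepend the projected image tokens to the text (task prefix) embeddings and build the matching
+        # attention mask; the result has shape (batch_size, num_image_tokens + text_length, hidden_size).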
+ batch_size, image_token_length = image_features.size()[:-1]
+ device = image_features.device
+ image_attention_mask = torch.ones(batch_size, image_token_length, device=device)
+
+ # task_prefix_embeds: [batch_size, padded_context_length, hidden_size]
+ # task_prefix_attention_mask: [batch_size, context_length]
+ if inputs_embeds is None:
+ return image_features, image_attention_mask
+
+ task_prefix_embeds = inputs_embeds
+ task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
+
+ if len(task_prefix_attention_mask.shape) == 3:
+ task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
+
+ # concat [image embeds, task prefix embeds]
+ inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
+ attention_mask = torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
+
+ return inputs_embeds, attention_mask
+
+
+ @add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ pixel_values: torch.FloatTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ decoder_head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Florence2Seq2SeqLMOutput]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, Florence2ForConditionalGeneration
+
+ >>> model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-large")
+ >>> processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large")
+
+ >>> prompt = ""
+ >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(**inputs, max_length=100)
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "A green car parked in front of a yellow building."
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ image_features = None
+ if inputs_embeds is None:
+            # 1. Extract the input embeddings
+ if input_ids is not None:
+ inputs_embeds = self.get_input_embeddings()(input_ids)
+ # 2. Merge text and images
+ if pixel_values is not None:
+ # (batch_size, num_image_tokens, hidden_size)
+ image_features = self._encode_image(pixel_values)
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
+
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(inputs_embeds.dtype)
+ outputs = self.language_model(
+ attention_mask=attention_mask,
+ labels=labels,
+ inputs_embeds=inputs_embeds,
+ decoder_input_ids=decoder_input_ids,
+ encoder_outputs=encoder_outputs,
+ decoder_attention_mask=decoder_attention_mask,
+ head_mask=head_mask,
+ decoder_head_mask=decoder_head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ past_key_values=past_key_values,
+ decoder_inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ logits = outputs.logits
+ logits = logits.float()
+ loss = outputs.loss
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return Florence2Seq2SeqLMOutput(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ cross_attentions=outputs.cross_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ image_hidden_states=image_features
+ )
+
+ def generate(
+ self,
+ input_ids,
+ inputs_embeds=None,
+ pixel_values=None,
+ **kwargs
+ ):
+
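+        # Build the multimodal prompt embeddings (image tokens followed by the text prefix), then delegate
+        # generation to the underlying seq2seq language model.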
+ if inputs_embeds is None:
+            # 1. Extract the input embeddings
+ if input_ids is not None:
+ inputs_embeds = self.get_input_embeddings()(input_ids)
+ # 2. Merge text and images
+ if pixel_values is not None:
+ image_features = self._encode_image(pixel_values)
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
+
+ return self.language_model.generate(
+ input_ids=None,
+ inputs_embeds=inputs_embeds,
+ **kwargs
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ decoder_input_ids,
+ past_key_values=None,
+ attention_mask=None,
+ pixel_values=None,
+ decoder_attention_mask=None,
+ head_mask=None,
+ decoder_head_mask=None,
+ cross_attn_head_mask=None,
+ use_cache=None,
+ encoder_outputs=None,
+ **kwargs,
+ ):
+ # cut decoder_input_ids if past_key_values is used
+ if past_key_values is not None:
+ past_length = past_key_values[0][0].shape[2]
+
+ # Some generation methods already pass only the last input ID
+ if decoder_input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = decoder_input_ids.shape[1] - 1
+
+ decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
+
+ return {
+ "input_ids": None, # encoder_outputs is defined. input_ids not needed
+ "encoder_outputs": encoder_outputs,
+ "past_key_values": past_key_values,
+ "decoder_input_ids": decoder_input_ids,
+ "attention_mask": attention_mask,
+ "pixel_values": pixel_values,
+ "decoder_attention_mask": decoder_attention_mask,
+ "head_mask": head_mask,
+ "decoder_head_mask": decoder_head_mask,
+ "cross_attn_head_mask": cross_attn_head_mask,
+ "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
+ }
+
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return self.language_model.prepare_decoder_input_ids_from_labels(labels)
+
+ def _reorder_cache(self, *args, **kwargs):
+ return self.language_model._reorder_cache(*args, **kwargs)
\ No newline at end of file
From c2221a2175df0a4304d2d85852c4be2dbe3637ce Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 03:14:20 +0200
Subject: [PATCH 06/35] add florence2 __init__.py
---
src/transformers/models/florence/__init__.py | 77 ++++++++++++++++++++
1 file changed, 77 insertions(+)
create mode 100644 src/transformers/models/florence/__init__.py
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence/__init__.py
new file mode 100644
index 000000000000..5b532dff1e86
--- /dev/null
+++ b/src/transformers/models/florence/__init__.py
@@ -0,0 +1,77 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ is_torch_available,
+)
+
+
+_import_structure = {
+ "configuration_florence2": [
+ "Florence2VisionConfig",
+ "Florence2LanguageConfig",
+ "Florence2Config",
+ ],
+ "processing_florence2": [
+ "Florence2Processor",
+ "Florence2PostProcesser",
+ ],
+}
+
+
+try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_florence2"] = [
+ "Florence2PreTrainedModel",
+ "Florence2VisionModel",
+ "Florence2VisionModelWithProjection",
+ "Florence2ForConditionalGeneration",
+ ]
+
+
+if TYPE_CHECKING:
+ from .configuration_florence2 import (
+ Florence2VisionConfig,
+ Florence2LanguageConfig,
+ Florence2Config,
+ )
+ from .processing_florence2 import (
+ Florence2Processor,
+ Florence2PostProcesser,
+ )
+
+ try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_florence2 import (
+ Florence2PreTrainedModel,
+ Florence2VisionModel,
+ Florence2VisionModelWithProjection,
+ Florence2ForConditionalGeneration,
+ )
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
From 6d1606ed29209066f6190406d3992824bfc767e7 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 15:47:58 +0200
Subject: [PATCH 07/35] make fixup
---
src/transformers/models/florence/__init__.py | 14 +-
.../florence/configuration_florence2.py | 13 +-
.../models/florence/modeling_florence2.py | 435 +++++-----
.../models/florence/processing_florence2.py | 747 ++++++++++--------
4 files changed, 631 insertions(+), 578 deletions(-)
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence/__init__.py
index 5b532dff1e86..262e7173ca64 100644
--- a/src/transformers/models/florence/__init__.py
+++ b/src/transformers/models/florence/__init__.py
@@ -22,13 +22,13 @@
_import_structure = {
"configuration_florence2": [
- "Florence2VisionConfig",
- "Florence2LanguageConfig",
"Florence2Config",
+ "Florence2LanguageConfig",
+ "Florence2VisionConfig",
],
"processing_florence2": [
- "Florence2Processor",
"Florence2PostProcesser",
+ "Florence2Processor",
],
}
@@ -49,13 +49,13 @@
if TYPE_CHECKING:
from .configuration_florence2 import (
- Florence2VisionConfig,
- Florence2LanguageConfig,
Florence2Config,
+ Florence2LanguageConfig,
+ Florence2VisionConfig,
)
from .processing_florence2 import (
- Florence2Processor,
Florence2PostProcesser,
+ Florence2Processor,
)
try:
@@ -65,10 +65,10 @@
pass
else:
from .modeling_florence2 import (
+ Florence2ForConditionalGeneration,
Florence2PreTrainedModel,
Florence2VisionModel,
Florence2VisionModelWithProjection,
- Florence2ForConditionalGeneration,
)
else:
diff --git a/src/transformers/models/florence/configuration_florence2.py b/src/transformers/models/florence/configuration_florence2.py
index 6dd576ee1be5..643d6378dfa5 100644
--- a/src/transformers/models/florence/configuration_florence2.py
+++ b/src/transformers/models/florence/configuration_florence2.py
@@ -12,19 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
+
+
""" Florence-2 configuration"""
-from typing import Optional
from ...configuration_utils import PretrainedConfig
from ...utils import logging
+
logger = logging.get_logger(__name__)
+
class Florence2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
- according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
@@ -271,7 +274,7 @@ def __init__(
class Florence2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
- Florence-2 model according to the specified arguments, defining the model architecture.
+ Florence-2 model according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
@@ -280,7 +283,7 @@ class Florence2Config(PretrainedConfig):
vision_config (`Florence2VisionConfig`, *optional*):
Custom vision config or dict
text_config (`Union[AutoConfig, dict]`, *optional*):
- The config object of the text backbone.
+ The config object of the text backbone.
ignore_index (`int`, *optional*, defaults to -100):
The ignore index for the loss function.
vocab_size (`int`, *optional*, defaults to 51289):
@@ -334,6 +337,4 @@ def __init__(
if text_config is not None:
self.text_config = Florence2LanguageConfig(**text_config)
-
super().__init__(**kwargs)
-
diff --git a/src/transformers/models/florence/modeling_florence2.py b/src/transformers/models/florence/modeling_florence2.py
index 41a71c8b5624..c6e09c303ed9 100644
--- a/src/transformers/models/florence/modeling_florence2.py
+++ b/src/transformers/models/florence/modeling_florence2.py
@@ -13,31 +13,45 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-""" PyTorch Florence-2 model."""
-from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+"""PyTorch Florence-2 model."""
import math
import warnings
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
import torch
-import torch.utils.checkpoint
-from torch import nn
import torch.nn.functional as F
+import torch.utils.checkpoint
import torch.utils.checkpoint as checkpoint
-from torch.nn import CrossEntropyLoss
-from collections import OrderedDict
-# from einops import rearrange
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import (
+ _prepare_4d_attention_mask,
+ _prepare_4d_attention_mask_for_sdpa,
+ _prepare_4d_causal_attention_mask,
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from ...modeling_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPastAndCrossAttentions,
+ Seq2SeqLMOutput,
+ Seq2SeqModelOutput,
+)
+
+# from einops import rearrange
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
+ is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
- is_flash_attn_2_available,
- is_flash_attn_greater_or_equal_2_10,
)
from .configuration_florence2 import (
Florence2Config,
@@ -46,21 +60,6 @@
)
-from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import (
- _prepare_4d_attention_mask,
- _prepare_4d_attention_mask_for_sdpa,
- _prepare_4d_causal_attention_mask,
- _prepare_4d_causal_attention_mask_for_sdpa,
-)
-from ...modeling_outputs import (
- BaseModelOutput,
- BaseModelOutputWithPastAndCrossAttentions,
- Seq2SeqLMOutput,
- Seq2SeqModelOutput,
-)
-
-
if is_flash_attn_2_available():
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
@@ -68,6 +67,7 @@
_CONFIG_FOR_DOC = "Florence2Config"
+
class LearnedAbsolutePositionEmbedding2D(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
@@ -80,11 +80,11 @@ def __init__(self, embedding_dim=256, num_pos=50):
def forward(self, pixel_values):
"""
- pixel_values: (batch_size, height, width, num_channels)
+ pixel_values: (batch_size, height, width, num_channels)
returns: (batch_size, height, width, embedding_dim * 2)
"""
if len(pixel_values.shape) != 4:
- raise ValueError('pixel_values must be a 4D tensor')
+ raise ValueError("pixel_values must be a 4D tensor")
height, width = pixel_values.shape[1:3]
width_values = torch.arange(width, device=pixel_values.device)
height_values = torch.arange(height, device=pixel_values.device)
@@ -101,6 +101,7 @@ def forward(self, pixel_values):
pos = pos.permute(0, 2, 3, 1)
return pos
+
class PositionalEmbeddingCosine1D(nn.Module):
"""
This class implements a very simple positional encoding. It follows closely
@@ -112,22 +113,17 @@ class PositionalEmbeddingCosine1D(nn.Module):
dropout_prob: The dropout probability.
max_seq_len: The maximum length to precompute the positional encodings.
"""
- def __init__(
- self,
- embed_dim: int = 512,
- max_seq_len: int = 1024) -> None:
+
+ def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
super(PositionalEmbeddingCosine1D, self).__init__()
self.embed_dim = embed_dim
self.max_seq_len = max_seq_len
# Generate the sinusoidal arrays.
factor = math.log(10000)
- denominator = torch.exp(
- -factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim)
+ denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim)
# Matrix where rows correspond to a positional embedding as a function
# of the position index (i.e., the row index).
- frequencies = \
- torch.arange(0, self.max_seq_len) \
- .reshape(self.max_seq_len, 1) * denominator
+ frequencies = torch.arange(0, self.max_seq_len).reshape(self.max_seq_len, 1) * denominator
pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
# Populate uneven entries.
pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
@@ -151,11 +147,10 @@ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
assert 2 <= shape_len <= 3
len_seq = seq_embeds.size(-2)
assert len_seq <= self.max_seq_len
- pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
+ pos_embeds = self.pos_idx_to_embed[0 : seq_embeds.size(-2), :]
# Adapt pre-computed positional embeddings to the input.
if shape_len == 3:
- pos_embeds = pos_embeds.view(
- (1, pos_embeds.size(0), pos_embeds.size(1)))
+ pos_embeds = pos_embeds.view((1, pos_embeds.size(0), pos_embeds.size(1)))
return pos_embeds
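
A quick sanity check on the reformatted module above (a sketch with made-up toy sizes, not part of the patch): the module returns the positional table only, and callers add it to their embeddings themselves.

    import torch

    # For a 3D input, the (1, seq_len, embed_dim) result broadcasts over batch.
    embed = PositionalEmbeddingCosine1D(embed_dim=8, max_seq_len=16)
    seq_embeds = torch.zeros(2, 5, 8)   # (batch, seq_len, embed_dim)
    pos = embed(seq_embeds)             # -> (1, 5, 8)
    assert pos.shape == (1, 5, 8)
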
@@ -167,10 +162,8 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
embed_dim: The dimension of the embeddings.
max_seq_len: The maximum length to precompute the positional encodings.
"""
- def __init__(
- self,
- embedding_dim: int = 512,
- num_pos: int = 1024) -> None:
+
+ def __init__(self, embedding_dim: int = 512, num_pos: int = 1024) -> None:
super(LearnedAbsolutePositionEmbedding1D, self).__init__()
self.embeddings = nn.Embedding(num_pos, embedding_dim)
self.num_pos = num_pos
@@ -195,12 +188,10 @@ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
pos_embeds = self.embeddings(torch.arange(len_seq).to(seq_embeds.device))
# Adapt pre-computed positional embeddings to the input.
if shape_len == 3:
- pos_embeds = pos_embeds.view(
- (1, pos_embeds.size(0), pos_embeds.size(1)))
+ pos_embeds = pos_embeds.view((1, pos_embeds.size(0), pos_embeds.size(1)))
return pos_embeds
-
class MySequential(nn.Sequential):
def forward(self, *inputs):
for module in self._modules.values():
@@ -244,11 +235,15 @@ def __init__(
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
- self.net = nn.Sequential(OrderedDict([
- ("fc1", nn.Linear(in_features, hidden_features)),
- ("act", act_layer()),
- ("fc2", nn.Linear(hidden_features, out_features))
- ]))
+ self.net = nn.Sequential(
+ OrderedDict(
+ [
+ ("fc1", nn.Linear(in_features, hidden_features)),
+ ("act", act_layer()),
+ ("fc2", nn.Linear(hidden_features, out_features)),
+ ]
+ )
+ )
def forward(self, x, size):
return self.net(x), size
@@ -265,12 +260,7 @@ def __init__(
):
super().__init__()
self.dw = nn.Conv2d(
- dim_in, dim_in,
- kernel_size=kernel_size,
- padding=padding,
- groups=dim_in,
- stride=stride,
- bias=bias
+ dim_in, dim_in, kernel_size=kernel_size, padding=padding, groups=dim_in, stride=stride, bias=bias
)
def forward(self, x, size):
@@ -285,28 +275,13 @@ def forward(self, x, size):
class ConvEmbed(nn.Module):
- """ Image to Patch Embedding
- """
+ """Image to Patch Embedding"""
- def __init__(
- self,
- patch_size=7,
- in_chans=3,
- embed_dim=64,
- stride=4,
- padding=2,
- norm_layer=None,
- pre_norm=True
- ):
+ def __init__(self, patch_size=7, in_chans=3, embed_dim=64, stride=4, padding=2, norm_layer=None, pre_norm=True):
super().__init__()
self.patch_size = patch_size
- self.proj = nn.Conv2d(
- in_chans, embed_dim,
- kernel_size=patch_size,
- stride=stride,
- padding=padding
- )
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
dim_norm = in_chans if pre_norm else embed_dim
self.norm = norm_layer(dim_norm) if norm_layer else None
@@ -328,7 +303,7 @@ def forward(self, x, size):
_, _, H, W = x.shape
# TODO: check if this is correct
- #x = rearrange(x, 'b c h w -> b (h w) c')
+ # x = rearrange(x, 'b c h w -> b (h w) c')
x = x.permute(0, 2, 3, 1).contiguous()
B, H, W, C = x.size()
x = x.view(B, -1, C)
@@ -340,7 +315,6 @@ def forward(self, x, size):
class ChannelAttention(nn.Module):
-
def __init__(self, dim, groups=8, qkv_bias=True):
super().__init__()
@@ -363,17 +337,17 @@ def forward(self, x, size):
return x, size
-#https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+# https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
class DropPath(nn.Module):
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
- """
- def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
- if self.drop_prob == 0. or not self.training:
+ if self.drop_prob == 0.0 or not self.training:
return x
keep_prob = 1 - self.drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
@@ -383,12 +357,12 @@ def forward(self, x):
return x * random_tensor
def extra_repr(self):
- return f'drop_prob={round(self.drop_prob,3):0.3f}'
+ return f"drop_prob={round(self.drop_prob,3):0.3f}"
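
To make the stochastic-depth semantics above concrete (illustrative sketch only):

    import torch

    dp = DropPath(drop_prob=0.2)
    x = torch.ones(4, 3)
    dp.train()
    y = dp(x)   # each sample is all-zeros or all-1.25 (survivors rescaled by 1/keep_prob)
    dp.eval()
    assert torch.equal(dp(x), x)  # identity at inference, matching the early return
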
# TODO: can this be replaced with torch.nn.init.trunc_normal_?
-#https://github.com/huggingface/pytorch-image-models/blob/b28945ff056d454b174f0fb8682e362b87150141/timm/layers/weight_init.py
-def trunc_normal_(tensor: torch.Tensor, mean=0., std=1., a=-2., b=2.) -> torch.Tensor:
+# https://github.com/huggingface/pytorch-image-models/blob/b28945ff056d454b174f0fb8682e362b87150141/timm/layers/weight_init.py
+def trunc_normal_(tensor: torch.Tensor, mean=0.0, std=1.0, a=-2.0, b=2.0) -> torch.Tensor:
r"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
@@ -407,17 +381,20 @@ def trunc_normal_(tensor: torch.Tensor, mean=0., std=1., a=-2., b=2.) -> torch.T
a: the minimum cutoff value
b: the maximum cutoff value
"""
+
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
- return (1. + math.erf(x / math.sqrt(2.))) / 2.
-
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
with torch.no_grad():
# Cut & paste from PyTorch official master until it's in a few official releases - RW
if (mean < a - 2 * std) or (mean > b + 2 * std):
- warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
- "The distribution of values may be incorrect.",
- stacklevel=2)
+ warnings.warn(
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+ "The distribution of values may be incorrect.",
+ stacklevel=2,
+ )
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
@@ -434,7 +411,7 @@ def norm_cdf(x):
tensor.erfinv_()
# Transform to proper mean, std
- tensor.mul_(std * math.sqrt(2.))
+ tensor.mul_(std * math.sqrt(2.0))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
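
On the TODO above: torch.nn.init.trunc_normal_ does exist with the same (tensor, mean, std, a, b) signature, so the local helper is likely replaceable. Note that a and b are absolute cutoff values, not multiples of std. A minimal usage sketch:

    import torch

    w = torch.empty(256, 512)
    trunc_normal_(w, mean=0.0, std=0.02, a=-0.04, b=0.04)
    assert -0.04 <= w.min().item() and w.max().item() <= 0.04
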
@@ -443,25 +420,29 @@ def norm_cdf(x):
class ChannelBlock(nn.Module):
-
- def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True,
- drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
- conv_at_attn=True, conv_at_ffn=True):
+ def __init__(
+ self,
+ dim,
+ groups,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ drop_path_rate=0.0,
+ act_layer=nn.GELU,
+ norm_layer=nn.LayerNorm,
+ conv_at_attn=True,
+ conv_at_ffn=True,
+ ):
super().__init__()
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
self.channel_attn = PreNorm(
- norm_layer(dim),
- ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
- drop_path
+ norm_layer(dim), ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias), drop_path
)
self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
self.ffn = PreNorm(
- norm_layer(dim),
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
- drop_path
+ norm_layer(dim), Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer), drop_path
)
def forward(self, x, size):
@@ -484,9 +465,9 @@ def window_partition(x, window_size: int):
def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
- B = batch_size
+ B = batch_size
# this will cause onnx conversion failed for dynamic axis, because treated as constant
- # int(windows.shape[0] / (H * W / window_size / window_size))
+ # int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
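
window_partition and window_reverse are inverses whenever H and W are multiples of window_size; a round-trip sketch (assuming window_partition uses the standard Swin tiling named in the hunk header above):

    import torch

    x = torch.randn(2, 8, 8, 16)                    # (B, H, W, C)
    windows = window_partition(x, 4)                # (2 * (8 // 4) ** 2, 4, 4, 16)
    restored = window_reverse(windows, 2, 4, 8, 8)  # back to (2, 8, 8, 16)
    assert torch.equal(restored, x)
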
@@ -494,7 +475,6 @@ def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
class WindowAttention(nn.Module):
def __init__(self, dim, num_heads, window_size, qkv_bias=True):
-
super().__init__()
self.dim = dim
self.window_size = window_size
@@ -508,7 +488,6 @@ def __init__(self, dim, num_heads, window_size, qkv_bias=True):
self.softmax = nn.Softmax(dim=-1)
def forward(self, x, size):
-
H, W = size
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
@@ -532,16 +511,14 @@ def forward(self, x, size):
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
- attn = (q @ k.transpose(-2, -1))
+ attn = q @ k.transpose(-2, -1)
attn = self.softmax(attn)
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
x = self.proj(x)
# merge windows
- x = x.view(
- -1, self.window_size, self.window_size, C
- )
+ x = x.view(-1, self.window_size, self.window_size, C)
x = window_reverse(x, B, self.window_size, Hp, Wp)
if pad_r > 0 or pad_b > 0:
@@ -553,25 +530,30 @@ def forward(self, x, size):
class SpatialBlock(nn.Module):
-
- def __init__(self, dim, num_heads, window_size,
- mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
- norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
+ def __init__(
+ self,
+ dim,
+ num_heads,
+ window_size,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ drop_path_rate=0.0,
+ act_layer=nn.GELU,
+ norm_layer=nn.LayerNorm,
+ conv_at_attn=True,
+ conv_at_ffn=True,
+ ):
super().__init__()
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
self.window_attn = PreNorm(
- norm_layer(dim),
- WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
- drop_path
+ norm_layer(dim), WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias), drop_path
)
self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
self.ffn = PreNorm(
- norm_layer(dim),
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
- drop_path
+ norm_layer(dim), Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer), drop_path
)
def forward(self, x, size):
@@ -586,7 +568,7 @@ def forward(self, x, size):
class DaViT(nn.Module):
- """ DaViT: Dual-Attention Transformer
+ """DaViT: Dual-Attention Transformer
Args:
in_chans (int): Number of input image channels. Default: 3.
@@ -621,14 +603,14 @@ def __init__(
num_heads=(3, 6, 12, 24),
num_groups=(3, 6, 12, 24),
window_size=7,
- mlp_ratio=4.,
+ mlp_ratio=4.0,
qkv_bias=True,
drop_path_rate=0.1,
norm_layer=nn.LayerNorm,
enable_checkpoint=False,
conv_at_attn=True,
conv_at_ffn=True,
- ):
+ ):
super().__init__()
self.num_classes = num_classes
@@ -640,7 +622,7 @@ def __init__(
assert self.num_stages == len(self.num_heads) == len(self.num_groups)
num_stages = len(embed_dims)
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths) * 2)]
depth_offset = 0
convs = []
@@ -653,41 +635,48 @@ def __init__(
in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
embed_dim=self.embed_dims[i],
norm_layer=norm_layer,
- pre_norm=patch_prenorm[i]
+ pre_norm=patch_prenorm[i],
)
convs.append(conv_embed)
block = MySequential(
*[
- MySequential(OrderedDict([
- (
- 'spatial_block', SpatialBlock(
- embed_dims[i],
- num_heads[i],
- window_size,
- drop_path_rate=dpr[depth_offset+j*2],
- qkv_bias=qkv_bias,
- mlp_ratio=mlp_ratio,
- conv_at_attn=conv_at_attn,
- conv_at_ffn=conv_at_ffn,
- )
- ),
- (
- 'channel_block', ChannelBlock(
- embed_dims[i],
- num_groups[i],
- drop_path_rate=dpr[depth_offset+j*2+1],
- qkv_bias=qkv_bias,
- mlp_ratio=mlp_ratio,
- conv_at_attn=conv_at_attn,
- conv_at_ffn=conv_at_ffn,
- )
+ MySequential(
+ OrderedDict(
+ [
+ (
+ "spatial_block",
+ SpatialBlock(
+ embed_dims[i],
+ num_heads[i],
+ window_size,
+ drop_path_rate=dpr[depth_offset + j * 2],
+ qkv_bias=qkv_bias,
+ mlp_ratio=mlp_ratio,
+ conv_at_attn=conv_at_attn,
+ conv_at_ffn=conv_at_ffn,
+ ),
+ ),
+ (
+ "channel_block",
+ ChannelBlock(
+ embed_dims[i],
+ num_groups[i],
+ drop_path_rate=dpr[depth_offset + j * 2 + 1],
+ qkv_bias=qkv_bias,
+ mlp_ratio=mlp_ratio,
+ conv_at_attn=conv_at_attn,
+ conv_at_ffn=conv_at_ffn,
+ ),
+ ),
+ ]
)
- ])) for j in range(depths[i])
+ )
+ for j in range(depths[i])
]
)
blocks.append(block)
- depth_offset += depths[i]*2
+ depth_offset += depths[i] * 2
self.convs = nn.ModuleList(convs)
self.blocks = nn.ModuleList(blocks)
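
The dpr schedule above assigns linearly increasing drop-path rates, two per depth unit (one for the spatial block, one for the channel block). Written out for the Florence-2 defaults from the config (depths = [1, 1, 9, 1], drop_path_rate = 0.1):

    import torch

    depths, drop_path_rate = (1, 1, 9, 1), 0.1
    dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths) * 2)]
    assert len(dpr) == 24 and dpr[0] == 0.0 and abs(dpr[-1] - 0.1) < 1e-6
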
@@ -710,7 +699,7 @@ def _init_weights(self, m):
elif isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, std=0.02)
for name, _ in m.named_parameters():
- if name in ['bias']:
+ if name in ["bias"]:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.weight, 1.0)
@@ -721,7 +710,7 @@ def _init_weights(self, m):
def forward_features_unpool(self, x):
"""
-        forward until avg pooling
+        Forward pass up to, but not including, the average pooling step.
        Args:
            x (torch.Tensor): input image tensor
"""
@@ -749,7 +738,7 @@ def forward(self, x):
x = self.forward_features(x)
x = self.head(x)
return x
-
+
@classmethod
def from_config(cls, config):
return cls(
@@ -766,12 +755,11 @@ def from_config(cls, config):
)
-
-
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
+
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -1516,7 +1504,6 @@ def forward(
return outputs
-
class Florence2LanguagePreTrainedModel(PreTrainedModel):
config_class = Florence2LanguageConfig
base_model_prefix = "model"
@@ -2320,6 +2307,7 @@ def _reorder_cache(past_key_values, beam_idx):
)
return reordered_past
+
@dataclass
class Florence2Seq2SeqLMOutput(ModelOutput):
"""
@@ -2495,6 +2483,7 @@ def _supports_sdpa(self):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
+
@add_start_docstrings(
"""The FLORENCE2 vision model without any head""",
FLORENCE2_START_DOCSTRING,
@@ -2502,16 +2491,16 @@ def _supports_sdpa(self):
class Florence2VisionModel(Florence2PreTrainedModel):
def __init__(self, config: Florence2VisionConfig):
super().__init__(config)
- assert config.model_type == 'davit', 'only DaViT is supported for now'
+ assert config.model_type == "davit", "only DaViT is supported for now"
self.vision_tower = DaViT.from_config(config=config)
self.post_init()
-
+
def forward(self, pixel_values):
if len(pixel_values.shape) == 4:
x = self.vision_tower.forward_features_unpool(pixel_values)
else:
- raise ValueError(f'invalid image shape {pixel_values.shape}')
+ raise ValueError(f"invalid image shape {pixel_values.shape}")
return x
@@ -2522,40 +2511,36 @@ def forward(self, pixel_values):
class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
def __init__(self, config: Florence2VisionConfig):
super().__init__(config)
- assert config.model_type == 'davit', 'only DaViT is supported for now'
+ assert config.model_type == "davit", "only DaViT is supported for now"
self.vision_tower = DaViT.from_config(config=config)
self._build_image_projection_layers(config)
self.post_init()
-
+
def _build_image_projection_layers(self, config):
image_dim_out = config.dim_embed[-1]
dim_projection = config.projection_dim
- self.image_projection = nn.Parameter(
- torch.empty(image_dim_out, dim_projection)
- )
+ self.image_projection = nn.Parameter(torch.empty(image_dim_out, dim_projection))
self.image_proj_norm = nn.LayerNorm(dim_projection)
image_pos_embed_config = config.image_pos_embed
- if image_pos_embed_config['type'] == 'learned_abs_2d':
+ if image_pos_embed_config["type"] == "learned_abs_2d":
self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
- embedding_dim=image_dim_out,
- num_pos=image_pos_embed_config['max_pos_embeddings']
+ embedding_dim=image_dim_out, num_pos=image_pos_embed_config["max_pos_embeddings"]
)
else:
- raise NotImplementedError('Not implemented yet')
+ raise NotImplementedError("Not implemented yet")
self.image_feature_source = config.image_feature_source
# temporal embedding
visual_temporal_embedding_config = config.visual_temporal_embedding
- if visual_temporal_embedding_config['type'] == 'COSINE':
+ if visual_temporal_embedding_config["type"] == "COSINE":
self.visual_temporal_embed = PositionalEmbeddingCosine1D(
- embed_dim=image_dim_out,
- max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
+ embed_dim=image_dim_out, max_seq_len=visual_temporal_embedding_config["max_temporal_embeddings"]
)
else:
- raise NotImplementedError('Not implemented yet')
+ raise NotImplementedError("Not implemented yet")
def forward(self, pixel_values):
if len(pixel_values.shape) == 4:
@@ -2563,17 +2548,17 @@ def forward(self, pixel_values):
T = 1
x = self.vision_tower.forward_features_unpool(pixel_values)
else:
- raise ValueError(f'invalid image shape {pixel_values.shape}')
-
+ raise ValueError(f"invalid image shape {pixel_values.shape}")
+
if self.image_pos_embed is not None:
x = x.view(batch_size * T, -1, x.shape[-1])
num_tokens = x.shape[-2]
- h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
- assert h * w == num_tokens, 'only support square feature maps for now'
+ h, w = int(num_tokens**0.5), int(num_tokens**0.5)
+ assert h * w == num_tokens, "only support square feature maps for now"
x = x.view(batch_size * T, h, w, x.shape[-1])
pos_embed = self.image_pos_embed(x)
x = x + pos_embed
- x = x.view(batch_size, T * h*w, x.shape[-1])
+ x = x.view(batch_size, T * h * w, x.shape[-1])
if self.visual_temporal_embed is not None:
visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
@@ -2582,18 +2567,18 @@ def forward(self, pixel_values):
x_feat_dict = {}
spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
- x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
+ x_feat_dict["spatial_avg_pool"] = spatial_avg_pool_x
temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
- x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
+ x_feat_dict["temporal_avg_pool"] = temporal_avg_pool_x
x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
- x_feat_dict['last_frame'] = x
+ x_feat_dict["last_frame"] = x
new_x = []
for _image_feature_source in self.image_feature_source:
if _image_feature_source not in x_feat_dict:
- raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
+ raise ValueError("invalid image feature source: {}".format(_image_feature_source))
new_x.append(x_feat_dict[_image_feature_source])
x = torch.cat(new_x, dim=1)
@@ -2601,11 +2586,9 @@ def forward(self, pixel_values):
x = x @ self.image_projection
x = self.image_proj_norm(x)
-
return x
-
@add_start_docstrings(
"""The FLORENCE2 model which consists of a vision backbone and a language model.""",
FLORENCE2_START_DOCSTRING,
@@ -2613,10 +2596,10 @@ def forward(self, pixel_values):
class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
def __init__(self, config: Florence2Config):
super().__init__(config)
- assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
+ assert config.vision_config.model_type == "davit", "only DaViT is supported for now"
del config.vision_config.model_type
self.vision_tower = DaViT.from_config(config=config.vision_config)
- # remove unused layers
+ # remove unused layers
del self.vision_tower.head
del self.vision_tower.norms
@@ -2632,34 +2615,30 @@ def __init__(self, config: Florence2Config):
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
self.post_init()
-
+
def _build_image_projection_layers(self, config):
image_dim_out = config.vision_config.dim_embed[-1]
dim_projection = config.vision_config.projection_dim
- self.image_projection = nn.Parameter(
- torch.empty(image_dim_out, dim_projection)
- )
+ self.image_projection = nn.Parameter(torch.empty(image_dim_out, dim_projection))
self.image_proj_norm = nn.LayerNorm(dim_projection)
image_pos_embed_config = config.vision_config.image_pos_embed
- if image_pos_embed_config['type'] == 'learned_abs_2d':
+ if image_pos_embed_config["type"] == "learned_abs_2d":
self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
- embedding_dim=image_dim_out,
- num_pos=image_pos_embed_config['max_pos_embeddings']
+ embedding_dim=image_dim_out, num_pos=image_pos_embed_config["max_pos_embeddings"]
)
else:
- raise NotImplementedError('Not implemented yet')
+ raise NotImplementedError("Not implemented yet")
self.image_feature_source = config.vision_config.image_feature_source
# temporal embedding
visual_temporal_embedding_config = config.vision_config.visual_temporal_embedding
- if visual_temporal_embedding_config['type'] == 'COSINE':
+ if visual_temporal_embedding_config["type"] == "COSINE":
self.visual_temporal_embed = PositionalEmbeddingCosine1D(
- embed_dim=image_dim_out,
- max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
+ embed_dim=image_dim_out, max_seq_len=visual_temporal_embedding_config["max_temporal_embeddings"]
)
else:
- raise NotImplementedError('Not implemented yet')
+ raise NotImplementedError("Not implemented yet")
def get_encoder(self):
return self.language_model.get_encoder()
@@ -2677,24 +2656,24 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_m
self.config.vocab_size = model_embeds.num_embeddings
self.vocab_size = model_embeds.num_embeddings
return model_embeds
-
+
def _encode_image(self, pixel_values):
if len(pixel_values.shape) == 4:
batch_size, C, H, W = pixel_values.shape
T = 1
x = self.vision_tower.forward_features_unpool(pixel_values)
else:
- raise ValueError(f'invalid image shape {pixel_values.shape}')
-
+ raise ValueError(f"invalid image shape {pixel_values.shape}")
+
if self.image_pos_embed is not None:
x = x.view(batch_size * T, -1, x.shape[-1])
num_tokens = x.shape[-2]
- h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
- assert h * w == num_tokens, 'only support square feature maps for now'
+ h, w = int(num_tokens**0.5), int(num_tokens**0.5)
+ assert h * w == num_tokens, "only support square feature maps for now"
x = x.view(batch_size * T, h, w, x.shape[-1])
pos_embed = self.image_pos_embed(x)
x = x + pos_embed
- x = x.view(batch_size, T * h*w, x.shape[-1])
+ x = x.view(batch_size, T * h * w, x.shape[-1])
if self.visual_temporal_embed is not None:
visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
@@ -2703,18 +2682,18 @@ def _encode_image(self, pixel_values):
x_feat_dict = {}
spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
- x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
+ x_feat_dict["spatial_avg_pool"] = spatial_avg_pool_x
temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
- x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
+ x_feat_dict["temporal_avg_pool"] = temporal_avg_pool_x
x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
- x_feat_dict['last_frame'] = x
+ x_feat_dict["last_frame"] = x
new_x = []
for _image_feature_source in self.image_feature_source:
if _image_feature_source not in x_feat_dict:
- raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
+ raise ValueError("invalid image feature source: {}".format(_image_feature_source))
new_x.append(x_feat_dict[_image_feature_source])
x = torch.cat(new_x, dim=1)
@@ -2722,11 +2701,9 @@ def _encode_image(self, pixel_values):
x = x @ self.image_projection
x = self.image_proj_norm(x)
- return x
+ return x
- def _merge_input_ids_with_image_features(
- self, image_features, inputs_embeds
- ):
+ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds):
batch_size, image_token_length = image_features.size()[:-1]
device = image_features.device
image_attention_mask = torch.ones(batch_size, image_token_length, device=device)
@@ -2748,7 +2725,6 @@ def _merge_input_ids_with_image_features(
return inputs_embeds, attention_mask
-
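
Shape-wise, the merge above prepends the image tokens to the text embeddings and pads the attention mask with ones over the image positions. A stand-alone restatement with made-up sizes (the real method also builds the text mask when none is passed):

    import torch

    B, n_img, n_txt, d = 2, 577, 16, 1024
    image_features = torch.randn(B, n_img, d)
    inputs_embeds = torch.randn(B, n_txt, d)
    merged = torch.cat([image_features, inputs_embeds], dim=1)
    mask = torch.ones(B, n_img + n_txt)
    assert merged.shape == (B, n_img + n_txt, d)
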
@add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
@@ -2816,7 +2792,9 @@ def forward(
if pixel_values is not None:
# (batch_size, num_image_tokens, hidden_size)
image_features = self._encode_image(pixel_values)
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(
+ image_features, inputs_embeds
+ )
attention_mask = attention_mask.to(inputs_embeds.dtype)
outputs = self.language_model(
@@ -2854,17 +2832,10 @@ def forward(
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
- image_hidden_states=image_features
+ image_hidden_states=image_features,
)
- def generate(
- self,
- input_ids,
- inputs_embeds=None,
- pixel_values=None,
- **kwargs
- ):
-
+ def generate(self, input_ids, inputs_embeds=None, pixel_values=None, **kwargs):
if inputs_embeds is None:
            # 1. Extract the input embeddings
if input_ids is not None:
@@ -2872,13 +2843,11 @@ def generate(
# 2. Merge text and images
if pixel_values is not None:
image_features = self._encode_image(pixel_values)
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
-
- return self.language_model.generate(
- input_ids=None,
- inputs_embeds=inputs_embeds,
- **kwargs
- )
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(
+ image_features, inputs_embeds
+ )
+
+ return self.language_model.generate(input_ids=None, inputs_embeds=inputs_embeds, **kwargs)
def prepare_inputs_for_generation(
self,
@@ -2906,7 +2875,7 @@ def prepare_inputs_for_generation(
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
-
+
return {
"input_ids": None, # encoder_outputs is defined. input_ids not needed
"encoder_outputs": encoder_outputs,
@@ -2920,9 +2889,9 @@ def prepare_inputs_for_generation(
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
}
-
+
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self.language_model.shift_tokens_right(labels)
def _reorder_cache(self, *args, **kwargs):
- return self.language_model._reorder_cache(*args, **kwargs)
\ No newline at end of file
+ return self.language_model._reorder_cache(*args, **kwargs)
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index b97945c47672..b6531de62945 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -16,20 +16,19 @@
Processor class for Florence-2.
"""
-import re
import logging
+import re
from typing import List, Optional, Union
-import numpy as np
+import numpy as np
import torch
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ChannelDimension, ImageInput, is_valid_image
from ...models.bart.tokenization_bart import BartTokenizer
from ...models.bart.tokenization_bart_fast import BartTokenizerFast
from ...models.t5.tokenization_t5 import T5Tokenizer
from ...models.t5.tokenization_t5_fast import T5TokenizerFast
-
-from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ImageInput, is_valid_image, ChannelDimension
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import (
PaddingStrategy,
@@ -42,10 +41,12 @@
logger = logging.getLogger(__name__)
+
# Copied from transformers.models.idefics2.processing_idefics2.is_url
def is_url(val) -> bool:
return isinstance(val, str) and val.startswith("http")
+
# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
def is_image_or_image_url(elem):
return is_url(elem) or is_valid_image(elem)
@@ -88,58 +89,77 @@ def __init__(
self.image_seq_length = image_processor.image_seq_length
tokens_to_add = {
-            'additional_special_tokens': \
-                tokenizer.additional_special_tokens + \
-                ['<od>', '</od>', '<ocr>', '</ocr>'] + \
-                [f'<loc_{x}>' for x in range(1000)] + \
-                ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
-        }
+            "additional_special_tokens": tokenizer.additional_special_tokens
+            + ["<od>", "</od>", "<ocr>", "</ocr>"]
+            + [f"<loc_{x}>" for x in range(1000)]
+            + [
+                "<cap>",
+                "</cap>",
+                "<ncap>",
+                "</ncap>",
+                "<dcap>",
+                "</dcap>",
+                "<grounding>",
+                "</grounding>",
+                "<seg>",
+                "</seg>",
+                "<sep>",
+                "<region_cap>",
+                "</region_cap>",
+                "<region_to_desciption>",
+                "</region_to_desciption>",
+                "<proposal>",
+                "</proposal>",
+                "<poly>",
+                "</poly>",
+                "<and>",
+            ]
+        }
tokenizer.add_special_tokens(tokens_to_add)
self.tasks_answer_post_processing_type = {
-            '<OCR>': 'pure_text',
-            '<OCR_WITH_REGION>': 'ocr',
-            '<CAPTION>': 'pure_text',
-            '<DETAILED_CAPTION>': 'pure_text',
-            '<MORE_DETAILED_CAPTION>': 'pure_text',
-            '<OD>': 'description_with_bboxes',
-            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
-            '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
-            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
-            '<REGION_TO_SEGMENTATION>': 'polygons',
-            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
-            '<REGION_TO_CATEGORY>': 'pure_text',
-            '<REGION_TO_DESCRIPTION>': 'pure_text',
-            '<REGION_TO_OCR>': 'pure_text',
-            '<REGION_PROPOSAL>': 'bboxes'
+            "<OCR>": "pure_text",
+            "<OCR_WITH_REGION>": "ocr",
+            "<CAPTION>": "pure_text",
+            "<DETAILED_CAPTION>": "pure_text",
+            "<MORE_DETAILED_CAPTION>": "pure_text",
+            "<OD>": "description_with_bboxes",
+            "<DENSE_REGION_CAPTION>": "description_with_bboxes",
+            "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
+            "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
+            "<REGION_TO_SEGMENTATION>": "polygons",
+            "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
+            "<REGION_TO_CATEGORY>": "pure_text",
+            "<REGION_TO_DESCRIPTION>": "pure_text",
+            "<REGION_TO_OCR>": "pure_text",
+            "<REGION_PROPOSAL>": "bboxes",
}
self.task_prompts_without_inputs = {
-            '<OCR>': 'What is the text in the image?',
-            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
-            '<CAPTION>': 'What does the image describe?',
-            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
-            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
-            '<OD>': 'Locate the objects with category name in the image.',
-            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
-            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
+            "<OCR>": "What is the text in the image?",
+            "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
+            "<CAPTION>": "What does the image describe?",
+            "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
+            "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
+            "<OD>": "Locate the objects with category name in the image.",
+            "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
+            "<REGION_PROPOSAL>": "Locate the region proposals in the image.",
}
self.task_prompts_with_input = {
- '': "Locate the phrases in the caption: {input}",
- '': 'Locate {input} in the image with mask',
- '': 'What is the polygon mask of region {input}',
- '': 'Locate {input} in the image.',
- '': 'What is the region {input}?',
- '': 'What does the region {input} describe?',
- '': 'What text is in the region {input}?',
+ "": "Locate the phrases in the caption: {input}",
+ "": "Locate {input} in the image with mask",
+ "": "What is the polygon mask of region {input}",
+ "": "Locate {input} in the image.",
+ "": "What is the region {input}?",
+ "": "What does the region {input} describe?",
+ "": "What text is in the region {input}?",
}
self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
-
super().__init__(image_processor, tokenizer)
-
+
def _construct_prompts(self, text):
# replace the task tokens with the task prompts if task token is in the text
prompts = []
@@ -150,10 +170,10 @@ def _construct_prompts(self, text):
assert _text == task_token, f"Task token {task_token} should be the only token in the text."
_text = task_prompt
break
- # 2. task prompts with additional inputs
+ # 2. task prompts with additional inputs
for task_token, task_prompt in self.task_prompts_with_input.items():
if task_token in _text:
- _text = task_prompt.format(input=_text.replace(task_token, ''))
+ _text = task_prompt.format(input=_text.replace(task_token, ""))
break
prompts.append(_text)
return prompts
@@ -237,9 +257,7 @@ def __call__(
if images is None:
raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
if text is None:
- logger.warning_once(
- "You are using Florence-2 without a text prompt."
- )
+ logger.warning_once("You are using Florence-2 without a text prompt.")
text = ""
if isinstance(text, List) and isinstance(images, List):
@@ -319,65 +337,70 @@ def post_process_generation(self, text, task, image_size):
            image_size (`Tuple[int, int]`): The size of the image, as (width, height).
"""
- task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
+ task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, "pure_text")
task_answer = self.post_processor(
text=text,
image_size=image_size,
parse_tasks=task_answer_post_processing_type,
)[task_answer_post_processing_type]
- if task_answer_post_processing_type == 'pure_text':
+ if task_answer_post_processing_type == "pure_text":
final_answer = task_answer
# remove the special tokens
-            final_answer = final_answer.replace('<s>', '').replace('</s>', '')
-        elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
+            final_answer = final_answer.replace("<s>", "").replace("</s>", "")
+        elif task_answer_post_processing_type in ["od", "description_with_bboxes", "bboxes"]:
od_instances = task_answer
- bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
- labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
- final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
- elif task_answer_post_processing_type in ['ocr']:
- bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
- labels = [str(_od_instance['text']) for _od_instance in task_answer]
- final_answer = {'quad_boxes': bboxes, 'labels': labels}
- elif task_answer_post_processing_type in ['phrase_grounding']:
+ bboxes_od = [_od_instance["bbox"] for _od_instance in od_instances]
+ labels_od = [str(_od_instance["cat_name"]) for _od_instance in od_instances]
+ final_answer = {"bboxes": bboxes_od, "labels": labels_od}
+ elif task_answer_post_processing_type in ["ocr"]:
+ bboxes = [_od_instance["quad_box"] for _od_instance in task_answer]
+ labels = [str(_od_instance["text"]) for _od_instance in task_answer]
+ final_answer = {"quad_boxes": bboxes, "labels": labels}
+ elif task_answer_post_processing_type in ["phrase_grounding"]:
bboxes = []
labels = []
for _grounded_phrase in task_answer:
- for _bbox in _grounded_phrase['bbox']:
+ for _bbox in _grounded_phrase["bbox"]:
bboxes.append(_bbox)
- labels.append(_grounded_phrase['cat_name'])
- final_answer = {'bboxes': bboxes, 'labels': labels}
- elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
+ labels.append(_grounded_phrase["cat_name"])
+ final_answer = {"bboxes": bboxes, "labels": labels}
+ elif task_answer_post_processing_type in ["description_with_polygons", "polygons"]:
labels = []
polygons = []
for result in task_answer:
- label = result['cat_name']
- _polygons = result['polygons']
+ label = result["cat_name"]
+ _polygons = result["polygons"]
labels.append(label)
polygons.append(_polygons)
- final_answer = {'polygons': polygons, 'labels': labels}
- elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
+ final_answer = {"polygons": polygons, "labels": labels}
+ elif task_answer_post_processing_type in ["description_with_bboxes_or_polygons"]:
bboxes = []
bboxes_labels = []
polygons = []
polygons_labels = []
for result in task_answer:
- label = result['cat_name']
- if 'polygons' in result:
- _polygons = result['polygons']
+ label = result["cat_name"]
+ if "polygons" in result:
+ _polygons = result["polygons"]
polygons.append(_polygons)
polygons_labels.append(label)
else:
- _bbox = result['bbox']
+ _bbox = result["bbox"]
bboxes.append(_bbox)
bboxes_labels.append(label)
- final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
+ final_answer = {
+ "bboxes": bboxes,
+ "bboxes_labels": bboxes_labels,
+ "polygons": polygons,
+ "polygons_labels": polygons_labels,
+ }
else:
- raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))
+ raise ValueError("Unknown task answer post processing type: {}".format(task_answer_post_processing_type))
+
+ final_answer = {task: final_answer}
+ return final_answer
- final_answer = {
- task: final_answer}
- return final_answer
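
Typical end-to-end use of post_process_generation (a sketch; processor, generated_ids, and image are assumed from the surrounding pipeline):

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed = processor.post_process_generation(
        generated_text, task="<OD>", image_size=(image.width, image.height)
    )
    # e.g. {"<OD>": {"bboxes": [[x1, y1, x2, y2], ...], "labels": ["car", ...]}}
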
class BoxQuantizer(object):
def __init__(self, mode, bins):
@@ -386,57 +409,48 @@ def __init__(self, mode, bins):
def quantize(self, boxes: torch.Tensor, size):
bins_w, bins_h = self.bins # Quantization bins.
- size_w, size_h = size # Original image size.
+ size_w, size_h = size # Original image size.
size_per_bin_w = size_w / bins_w
size_per_bin_h = size_h / bins_h
xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
- if self.mode == 'floor':
- quantized_xmin = (
- xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
- quantized_ymin = (
- ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
- quantized_xmax = (
- xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
- quantized_ymax = (
- ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
-
- elif self.mode == 'round':
+ if self.mode == "floor":
+ quantized_xmin = (xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
+ quantized_ymin = (ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
+ quantized_xmax = (xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
+ quantized_ymax = (ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
+
+ elif self.mode == "round":
raise NotImplementedError()
else:
- raise ValueError('Incorrect quantization type.')
+ raise ValueError("Incorrect quantization type.")
- quantized_boxes = torch.cat(
- (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
- ).int()
+ quantized_boxes = torch.cat((quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1).int()
return quantized_boxes
def dequantize(self, boxes: torch.Tensor, size):
bins_w, bins_h = self.bins # Quantization bins.
- size_w, size_h = size # Original image size.
+ size_w, size_h = size # Original image size.
size_per_bin_w = size_w / bins_w
size_per_bin_h = size_h / bins_h
xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
- if self.mode == 'floor':
+ if self.mode == "floor":
# Add 0.5 to use the center position of the bin as the coordinate.
dequantized_xmin = (xmin + 0.5) * size_per_bin_w
dequantized_ymin = (ymin + 0.5) * size_per_bin_h
dequantized_xmax = (xmax + 0.5) * size_per_bin_w
dequantized_ymax = (ymax + 0.5) * size_per_bin_h
- elif self.mode == 'round':
+ elif self.mode == "round":
raise NotImplementedError()
else:
- raise ValueError('Incorrect quantization type.')
+ raise ValueError("Incorrect quantization type.")
- dequantized_boxes = torch.cat(
- (dequantized_xmin, dequantized_ymin,
- dequantized_xmax, dequantized_ymax), dim=-1
- )
+ dequantized_boxes = torch.cat((dequantized_xmin, dequantized_ymin, dequantized_xmax, dequantized_ymax), dim=-1)
return dequantized_boxes
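
A numeric sketch of the floor-mode binning above, on a hypothetical 640x480 image with the default 1000x1000 bins (dequantize returns bin centers, hence the +0.5 offsets):

    import torch

    bq = BoxQuantizer("floor", (1000, 1000))
    boxes = torch.tensor([[32.0, 24.0, 608.0, 456.0]])  # (xmin, ymin, xmax, ymax)
    q = bq.quantize(boxes, size=(640, 480))             # tensor([[ 50,  50, 950, 950]])
    d = bq.dequantize(q, size=(640, 480))               # tensor([[ 32.32,  24.24, 608.32, 456.24]])
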
@@ -452,57 +466,53 @@ def __init__(self, mode, bins):
def quantize(self, coordinates: torch.Tensor, size):
bins_w, bins_h = self.bins # Quantization bins.
- size_w, size_h = size # Original image size.
+ size_w, size_h = size # Original image size.
size_per_bin_w = size_w / bins_w
size_per_bin_h = size_h / bins_h
- assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
+ assert coordinates.shape[-1] == 2, "coordinates should be shape (N, 2)"
x, y = coordinates.split(1, dim=-1) # Shape: 4 * [N, 1].
- if self.mode == 'floor':
+ if self.mode == "floor":
quantized_x = (x / size_per_bin_w).floor().clamp(0, bins_w - 1)
quantized_y = (y / size_per_bin_h).floor().clamp(0, bins_h - 1)
- elif self.mode == 'round':
+ elif self.mode == "round":
raise NotImplementedError()
else:
- raise ValueError('Incorrect quantization type.')
+ raise ValueError("Incorrect quantization type.")
- quantized_coordinates = torch.cat(
- (quantized_x, quantized_y), dim=-1
- ).int()
+ quantized_coordinates = torch.cat((quantized_x, quantized_y), dim=-1).int()
return quantized_coordinates
def dequantize(self, coordinates: torch.Tensor, size):
bins_w, bins_h = self.bins # Quantization bins.
- size_w, size_h = size # Original image size.
+ size_w, size_h = size # Original image size.
size_per_bin_w = size_w / bins_w
size_per_bin_h = size_h / bins_h
- assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
+ assert coordinates.shape[-1] == 2, "coordinates should be shape (N, 2)"
x, y = coordinates.split(1, dim=-1) # Shape: 4 * [N, 1].
- if self.mode == 'floor':
+ if self.mode == "floor":
# Add 0.5 to use the center position of the bin as the coordinate.
dequantized_x = (x + 0.5) * size_per_bin_w
dequantized_y = (y + 0.5) * size_per_bin_h
- elif self.mode == 'round':
+ elif self.mode == "round":
raise NotImplementedError()
else:
- raise ValueError('Incorrect quantization type.')
+ raise ValueError("Incorrect quantization type.")
- dequantized_coordinates = torch.cat(
- (dequantized_x, dequantized_y), dim=-1
- )
+ dequantized_coordinates = torch.cat((dequantized_x, dequantized_y), dim=-1)
return dequantized_coordinates
class Florence2PostProcesser(object):
- """
- Florence-2 post process for converting text prediction to various tasks results.
+ r"""
+    Florence-2 post-processor for converting text predictions into task-specific results.
Args:
config: A dict of configs.
@@ -527,22 +537,20 @@ class Florence2PostProcesser(object):
Returns:
parsed_dict (dict): A dict of parsed results.
"""
- def __init__(
- self,
- tokenizer=None
- ):
+
+ def __init__(self, tokenizer=None):
parse_tasks = []
parse_task_configs = {}
config = self._create_default_config()
- for task in config['PARSE_TASKS']:
- parse_tasks.append(task['TASK_NAME'])
- parse_task_configs[task['TASK_NAME']] = task
+ for task in config["PARSE_TASKS"]:
+ parse_tasks.append(task["TASK_NAME"])
+ parse_task_configs[task["TASK_NAME"]] = task
self.config = config
self.parse_tasks = parse_tasks
self.parse_tasks_configs = parse_task_configs
- self.tokenizer = tokenizer
+ self.tokenizer = tokenizer
if self.tokenizer is not None:
self.all_special_tokens = set(self.tokenizer.all_special_tokens)
@@ -552,105 +560,188 @@ def __init__(
def _create_black_list_of_phrase_grounding(self):
black_list = {}
- if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
- black_list = set(
- ['it', 'I', 'me', 'mine',
- 'you', 'your', 'yours',
- 'he', 'him', 'his',
- 'she', 'her', 'hers',
- 'they', 'them', 'their', 'theirs',
- 'one', 'oneself',
- 'we', 'us', 'our', 'ours',
- 'you', 'your', 'yours',
- 'they', 'them', 'their', 'theirs',
- 'mine', 'yours', 'his', 'hers', 'its',
- 'ours', 'yours', 'theirs',
- 'myself', 'yourself', 'himself', 'herself', 'itself',
- 'ourselves', 'yourselves', 'themselves',
- 'this', 'that',
- 'these', 'those',
- 'who', 'whom', 'whose', 'which', 'what',
- 'who', 'whom', 'whose', 'which', 'that',
- 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
- 'each', 'everybody', 'everyone', 'everything',
- 'few', 'many', 'nobody', 'none', 'one', 'several',
- 'some', 'somebody', 'someone', 'something',
- 'each other', 'one another',
- 'myself', 'yourself', 'himself', 'herself', 'itself',
- 'ourselves', 'yourselves', 'themselves',
- 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
- 'other objects', 'lots', 'a set',
- ]
+ if (
+ "phrase_grounding" in self.parse_tasks
+ and self.parse_tasks_configs["phrase_grounding"]["FILTER_BY_BLACK_LIST"]
+ ):
+ black_list = set(
+ [
+ "it",
+ "I",
+ "me",
+ "mine",
+ "you",
+ "your",
+ "yours",
+ "he",
+ "him",
+ "his",
+ "she",
+ "her",
+ "hers",
+ "they",
+ "them",
+ "their",
+ "theirs",
+ "one",
+ "oneself",
+ "we",
+ "us",
+ "our",
+ "ours",
+ "you",
+ "your",
+ "yours",
+ "they",
+ "them",
+ "their",
+ "theirs",
+ "mine",
+ "yours",
+ "his",
+ "hers",
+ "its",
+ "ours",
+ "yours",
+ "theirs",
+ "myself",
+ "yourself",
+ "himself",
+ "herself",
+ "itself",
+ "ourselves",
+ "yourselves",
+ "themselves",
+ "this",
+ "that",
+ "these",
+ "those",
+ "who",
+ "whom",
+ "whose",
+ "which",
+ "what",
+ "who",
+ "whom",
+ "whose",
+ "which",
+ "that",
+ "all",
+ "another",
+ "any",
+ "anybody",
+ "anyone",
+ "anything",
+ "each",
+ "everybody",
+ "everyone",
+ "everything",
+ "few",
+ "many",
+ "nobody",
+ "none",
+ "one",
+ "several",
+ "some",
+ "somebody",
+ "someone",
+ "something",
+ "each other",
+ "one another",
+ "myself",
+ "yourself",
+ "himself",
+ "herself",
+ "itself",
+ "ourselves",
+ "yourselves",
+ "themselves",
+ "the image",
+ "image",
+ "images",
+ "the",
+ "a",
+ "an",
+ "a group",
+ "other objects",
+ "lots",
+ "a set",
+ ]
)
return black_list
-
+
def _create_default_config(self):
config = {
- 'NUM_BBOX_HEIGHT_BINS': 1000,
- 'NUM_BBOX_WIDTH_BINS': 1000,
- 'BOX_QUANTIZATION_MODE': 'floor',
- 'COORDINATES_HEIGHT_BINS': 1000,
- 'COORDINATES_WIDTH_BINS': 1000,
- 'COORDINATES_QUANTIZATION_MODE': 'floor',
- 'PARSE_TASKS': [
- {
- 'TASK_NAME': 'od',
-                'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
- },
+ "NUM_BBOX_HEIGHT_BINS": 1000,
+ "NUM_BBOX_WIDTH_BINS": 1000,
+ "BOX_QUANTIZATION_MODE": "floor",
+ "COORDINATES_HEIGHT_BINS": 1000,
+ "COORDINATES_WIDTH_BINS": 1000,
+ "COORDINATES_QUANTIZATION_MODE": "floor",
+ "PARSE_TASKS": [
+ {"TASK_NAME": "od", "PATTERN": r"([a-zA-Z0-9 ]+)"},
{
-                'TASK_NAME': 'ocr',
-                'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
-                'AREA_THRESHOLD': 0.01
+                "TASK_NAME": "ocr",
+                "PATTERN": r"(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>",
+                "AREA_THRESHOLD": 0.01,
},
+ {"TASK_NAME": "phrase_grounding", "FILTER_BY_BLACK_LIST": True},
{
- 'TASK_NAME': 'phrase_grounding',
- 'FILTER_BY_BLACK_LIST': True
+ "TASK_NAME": "pure_text",
},
{
- 'TASK_NAME': 'pure_text',
+ "TASK_NAME": "description_with_bboxes",
},
{
- 'TASK_NAME': 'description_with_bboxes',
+ "TASK_NAME": "description_with_polygons",
},
{
- 'TASK_NAME': 'description_with_polygons',
+ "TASK_NAME": "polygons",
},
{
- 'TASK_NAME': 'polygons',
+ "TASK_NAME": "bboxes",
},
{
- 'TASK_NAME': 'bboxes',
+ "TASK_NAME": "description_with_bboxes_or_polygons",
},
- {
- 'TASK_NAME': 'description_with_bboxes_or_polygons',
- }
- ]
+ ],
}
return config
def init_quantizers(self):
# we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
- num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
- num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
- box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
+ num_bbox_height_bins = self.config.get("NUM_BBOX_HEIGHT_BINS", 1000)
+ num_bbox_width_bins = self.config.get("NUM_BBOX_WIDTH_BINS", 1000)
+ box_quantization_mode = self.config.get("BOX_QUANTIZATION_MODE", "floor")
self.box_quantizer = BoxQuantizer(
box_quantization_mode,
(num_bbox_width_bins, num_bbox_height_bins),
)
-
- num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
- num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
- box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
+
+ num_bbox_height_bins = (
+ self.config["COORDINATES_HEIGHT_BINS"]
+ if "COORDINATES_HEIGHT_BINS" in self.config
+ else self.config.get("NUM_BBOX_HEIGHT_BINS", 1000)
+ )
+ num_bbox_width_bins = (
+ self.config["COORDINATES_WIDTH_BINS"]
+ if "COORDINATES_WIDTH_BINS" in self.config
+ else self.config.get("NUM_BBOX_WIDTH_BINS", 1000)
+ )
+ box_quantization_mode = (
+ self.config.get("COORDINATES_QUANTIZATION_MODE")
+ if "COORDINATES_QUANTIZATION_MODE" in self.config
+ else self.config.get("BOX_QUANTIZATION_MODE", "floor")
+ )
self.coordinates_quantizer = CoordinatesQuantizer(
box_quantization_mode,
(num_bbox_width_bins, num_bbox_height_bins),
)
def decode_with_spans(self, tokenizer, token_ids):
- filtered_tokens = tokenizer.convert_ids_to_tokens(
- token_ids, skip_special_tokens=False)
+ filtered_tokens = tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
assert len(filtered_tokens) == len(token_ids)
        # To avoid mixing byte-level and unicode for byte-level BPE
@@ -666,12 +757,12 @@ def decode_with_spans(self, tokenizer, token_ids):
elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
# Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
# Note: Do not strip sub_text as it may have functional whitespace
- sub_text = token.replace('▁', ' ')
+ sub_text = token.replace("▁", " ")
else:
- raise ValueError(f'type {type(tokenizer)} not supported')
+ raise ValueError(f"type {type(tokenizer)} not supported")
sub_texts.append(sub_text)
- text = ''
+ text = ""
spans = []
for sub_text in sub_texts:
span = (len(text), len(text) + len(sub_text)) # [start index, end index).
@@ -679,7 +770,7 @@ def decode_with_spans(self, tokenizer, token_ids):
spans.append(span)
# Text format:
- # 1. T5Tokenizer/T5TokenizerFast:
+ # 1. T5Tokenizer/T5TokenizerFast:
# " transplanting dog cat"
# Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
# 2. BartTokenizer (need to double check):
@@ -687,13 +778,7 @@ def decode_with_spans(self, tokenizer, token_ids):
# Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
return text, spans
- def parse_od_from_text_and_spans(
- self,
- text,
- pattern,
- image_size,
- phrase_centric=False
- ):
+ def parse_od_from_text_and_spans(self, text, pattern, image_size, phrase_centric=False):
parsed = list(re.finditer(pattern, text))
instances = []
@@ -705,28 +790,26 @@ def parse_od_from_text_and_spans(
bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
else:
bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
- instance['bbox'] = self.box_quantizer.dequantize(
- boxes=torch.tensor(bbox_bins),
- size=image_size
- ).tolist()
+ instance["bbox"] = self.box_quantizer.dequantize(boxes=torch.tensor(bbox_bins), size=image_size).tolist()
if phrase_centric:
- instance['cat_name'] = parsed[i].group(1).lower().strip()
+ instance["cat_name"] = parsed[i].group(1).lower().strip()
else:
- instance['cat_name'] = parsed[i].group(5).lower().strip()
+ instance["cat_name"] = parsed[i].group(5).lower().strip()
instances.append(instance)
return instances
- def parse_ocr_from_text_and_spans(self,
- text,
- pattern,
- image_size,
- area_threshold=-1.0,
- ):
+ def parse_ocr_from_text_and_spans(
+ self,
+ text,
+ pattern,
+ image_size,
+ area_threshold=-1.0,
+ ):
bboxes = []
labels = []
-        text = text.replace('<s>', '')
+        text = text.replace("<s>", "")
# ocr with regions
parsed = re.findall(pattern, text)
instances = []
@@ -736,59 +819,64 @@ def parse_ocr_from_text_and_spans(self,
ocr_content = ocr_line[0]
quad_box = ocr_line[1:]
quad_box = [int(i) for i in quad_box]
- quad_box = self.coordinates_quantizer.dequantize(
- torch.tensor(np.array(quad_box).reshape(-1, 2)),
- size=image_size
- ).reshape(-1).tolist()
+ quad_box = (
+ self.coordinates_quantizer.dequantize(torch.tensor(np.array(quad_box).reshape(-1, 2)), size=image_size)
+ .reshape(-1)
+ .tolist()
+ )
if area_threshold > 0:
x_coords = [i for i in quad_box[0::2]]
y_coords = [i for i in quad_box[1::2]]
# apply the Shoelace formula
- area = 0.5 * abs(sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1)))
+ area = 0.5 * abs(
+ sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1))
+ )
if area < (image_width * image_height) * area_threshold:
continue
bboxes.append(quad_box)
labels.append(ocr_content)
- instances.append({
- 'quad_box': quad_box,
- 'text': ocr_content,
- })
+ instances.append(
+ {
+ "quad_box": quad_box,
+ "text": ocr_content,
+ }
+ )
return instances
def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
# ignore and
cur_span = 0
- if text.startswith(''):
+ if text.startswith(""):
cur_span += 3
- text = text.replace('', '')
- text = text.replace('', '')
- text = text.replace('', '')
+ text = text.replace("", "")
+ text = text.replace("", "")
+ text = text.replace("", "")
pattern = r"([^<]+(?:){4,})"
phrases = re.findall(pattern, text)
-
+
# pattern should be text pattern and od pattern
- pattern = r'^\s*(.*?)(?=||||||'
+ pattern = r"^\s*(.*?)(?=||||||"
instances = []
for pharse_text in phrases:
- phrase_text_strip = pharse_text.replace('', '', 1)
- phrase_text_strip = pharse_text.replace('', '', 1)
+ phrase_text_strip = pharse_text.replace("", "", 1)
+ phrase_text_strip = pharse_text.replace("", "", 1)
- if phrase_text_strip == '':
+ if phrase_text_strip == "":
cur_span += len(pharse_text)
continue
# Prepare instance.
instance = {}
- # parse phrase, get string
+ # parse phrase, get string
phrase = re.search(pattern, phrase_text_strip)
if phrase is None:
cur_span += len(pharse_text)
@@ -808,16 +896,13 @@ def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
cur_span += len(pharse_text)
continue
- # a list of list
+ # a list of list
bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
- instance['bbox'] = self.box_quantizer.dequantize(
- boxes=torch.tensor(bbox_bins),
- size=image_size
- ).tolist()
+ instance["bbox"] = self.box_quantizer.dequantize(boxes=torch.tensor(bbox_bins), size=image_size).tolist()
# exclude non-ascii characters
- phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
- instance['cat_name'] = phrase
+ phrase = phrase.encode("ascii", errors="ignore").decode("ascii")
+ instance["cat_name"] = phrase
instances.append(instance)
@@ -827,29 +912,29 @@ def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image
# temporary parse solution, split by '.'
# ignore and
- text = text.replace('', '')
- text = text.replace('', '')
- text = text.replace('', '')
+ text = text.replace("", "")
+ text = text.replace("", "")
+ text = text.replace("", "")
if allow_empty_phrase:
- pattern = rf"(?:(?:){{4,}})"
+ pattern = r"(?:(?:){4,})"
else:
pattern = r"([^<]+(?:){4,})"
phrases = re.findall(pattern, text)
-
+
# pattern should be text pattern and od pattern
- pattern = r'^\s*(.*?)(?=||||||'
+ pattern = r"^\s*(.*?)(?=||||||"
instances = []
for pharse_text in phrases:
- phrase_text_strip = pharse_text.replace('', '', 1)
- phrase_text_strip = pharse_text.replace('', '', 1)
+ phrase_text_strip = pharse_text.replace("", "", 1)
+ phrase_text_strip = pharse_text.replace("", "", 1)
- if phrase_text_strip == '' and not allow_empty_phrase:
+ if phrase_text_strip == "" and not allow_empty_phrase:
continue
- # parse phrase, get string
+ # parse phrase, get string
phrase = re.search(pattern, phrase_text_strip)
if phrase is None:
continue
@@ -863,70 +948,68 @@ def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image
if len(bboxes_parsed) == 0:
continue
- # a list of list
+ # a list of list
bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
- bboxes = self.box_quantizer.dequantize(
- boxes=torch.tensor(bbox_bins),
- size=image_size
- ).tolist()
+ bboxes = self.box_quantizer.dequantize(boxes=torch.tensor(bbox_bins), size=image_size).tolist()
- phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
+ phrase = phrase.encode("ascii", errors="ignore").decode("ascii")
for _bboxes in bboxes:
# Prepare instance.
instance = {}
- instance['bbox'] = _bboxes
+ instance["bbox"] = _bboxes
# exclude non-ascii characters
- instance['cat_name'] = phrase
+ instance["cat_name"] = phrase
instances.append(instance)
return instances
- def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
- allow_empty_phrase=False,
- polygon_sep_token='',
- polygon_start_token='',
- polygon_end_token='',
- with_box_at_start=False,
- ):
-
+ def parse_description_with_polygons_from_text_and_spans(
+ self,
+ text,
+ pattern,
+ image_size,
+ allow_empty_phrase=False,
+ polygon_sep_token="",
+ polygon_start_token="",
+ polygon_end_token="",
+ with_box_at_start=False,
+ ):
# ref_seg format: '<><><><><><>'
# ignore and
- text = text.replace('', '')
- text = text.replace('', '')
- text = text.replace('', '')
+ text = text.replace("", "")
+ text = text.replace("", "")
+ text = text.replace("", "")
if allow_empty_phrase:
pattern = rf"(?:(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
else:
- # [^<]+: This part matches one or more characters that are not the < symbol.
+ # [^<]+: This part matches one or more characters that are not the < symbol.
# The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
#
pattern = rf"([^<]+(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
phrases = re.findall(pattern, text)
- phrase_string_pattern = r'^\s*(.*?)(?=||||||)'
- box_pattern = rf'((?:)+)(?:{re.escape(polygon_sep_token)}|$)'
+ phrase_string_pattern = r"^\s*(.*?)(?=||||||)"
+ box_pattern = rf"((?:)+)(?:{re.escape(polygon_sep_token)}|$)"
# one polygons instance is separated by polygon_start_token and polygon_end_token
- polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
-
+ polygons_instance_pattern = rf"{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}"
+
instances = []
for phrase_text in phrases:
-
# exclude loc_\d+>
# need to get span if want to include category score
- phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)
+ phrase_text_strip = re.sub(r"^loc_\d+>", "", phrase_text, count=1)
# phrase = phrase.replace('', '')
# phrase = phrase.replace('poly>', '')
- if phrase_text_strip == '' and not allow_empty_phrase:
+ if phrase_text_strip == "" and not allow_empty_phrase:
continue
-
- # parse phrase, get string
+ # parse phrase, get string
phrase = re.search(phrase_string_pattern, phrase_text_strip)
if phrase is None:
continue
@@ -947,10 +1030,10 @@ def parse_description_with_polygons_from_text_and_spans(self, text, pattern, ima
instance = {}
# polygons_parsed= list(re.finditer(box_pattern, phrase_text))
- if isinstance(_polygons_instances_parsed, str):
- polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
+ if isinstance(_polygons_instances_parsed, str):
+ polygons_parsed = list(re.finditer(box_pattern, _polygons_instances_parsed))
else:
- polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
+ polygons_parsed = list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
if len(polygons_parsed) == 0:
continue
@@ -961,7 +1044,7 @@ def parse_description_with_polygons_from_text_and_spans(self, text, pattern, ima
# group 1: whole ...
_polygon = _polygon_parsed.group(1)
# parse into list of int
- _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'', _polygon)]
+ _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r"", _polygon)]
if with_box_at_start and len(bbox) == 0:
if len(_polygon) > 4:
# no valid bbox prediction
@@ -969,25 +1052,27 @@ def parse_description_with_polygons_from_text_and_spans(self, text, pattern, ima
_polygon = _polygon[4:]
else:
bbox = [0, 0, 0, 0]
- # abandon last element if is not paired
+ # abandon last element if is not paired
if len(_polygon) % 2 == 1:
_polygon = _polygon[:-1]
-
+
# reshape into (n, 2)
- _polygon = self.coordinates_quantizer.dequantize(
- torch.tensor(np.array(_polygon).reshape(-1, 2)),
- size=image_size
- ).reshape(-1).tolist()
+ _polygon = (
+ self.coordinates_quantizer.dequantize(
+ torch.tensor(np.array(_polygon).reshape(-1, 2)), size=image_size
+ )
+ .reshape(-1)
+ .tolist()
+ )
# reshape back
polygons.append(_polygon)
- instance['cat_name'] = phrase
- instance['polygons'] = polygons
+ instance["cat_name"] = phrase
+ instance["polygons"] = polygons
if len(bbox) != 0:
- instance['bbox'] = self.box_quantizer.dequantize(
- boxes=torch.tensor([bbox]),
- size=image_size
- ).tolist()[0]
+ instance["bbox"] = self.box_quantizer.dequantize(
+ boxes=torch.tensor([bbox]), size=image_size
+ ).tolist()[0]
instances.append(instance)
@@ -1010,84 +1095,82 @@ def __call__(
if isinstance(parse_tasks, str):
parse_tasks = [parse_tasks]
for _parse_task in parse_tasks:
- assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
-
- # sequence or text should be provided
- assert text is not None, 'text should be provided'
+ assert _parse_task in self.parse_tasks, f"parse task {_parse_task} not supported"
- parsed_dict = {
- 'text': text
- }
+ # sequence or text should be provided
+ assert text is not None, "text should be provided"
+
+ parsed_dict = {"text": text}
for task in self.parse_tasks:
if parse_tasks is not None and task not in parse_tasks:
continue
- pattern = self.parse_tasks_configs[task].get('PATTERN', None)
+ pattern = self.parse_tasks_configs[task].get("PATTERN", None)
- if task == 'ocr':
+ if task == "ocr":
instances = self.parse_ocr_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
- area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.01),
+ area_threshold=self.parse_tasks_configs[task].get("AREA_THRESHOLD", 0.01),
)
- parsed_dict['ocr'] = instances
- elif task == 'phrase_grounding':
- instances = self.parse_phrase_grounding_from_text_and_spans(
+ parsed_dict["ocr"] = instances
+ elif task == "phrase_grounding":
+ instances = self.parse_phrase_grounding_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
)
- parsed_dict['phrase_grounding'] = instances
- elif task == 'pure_text':
- parsed_dict['pure_text'] = text
- elif task == 'description_with_bboxes':
- instances = self.parse_description_with_bboxes_from_text_and_spans(
+ parsed_dict["phrase_grounding"] = instances
+ elif task == "pure_text":
+ parsed_dict["pure_text"] = text
+ elif task == "description_with_bboxes":
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
)
- parsed_dict['description_with_bboxes'] = instances
- elif task == 'description_with_polygons':
- instances = self.parse_description_with_polygons_from_text_and_spans(
+ parsed_dict["description_with_bboxes"] = instances
+ elif task == "description_with_polygons":
+ instances = self.parse_description_with_polygons_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
)
- parsed_dict['description_with_polygons'] = instances
- elif task == 'polygons':
- instances = self.parse_description_with_polygons_from_text_and_spans(
+ parsed_dict["description_with_polygons"] = instances
+ elif task == "polygons":
+ instances = self.parse_description_with_polygons_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
allow_empty_phrase=True,
)
- parsed_dict['polygons'] = instances
- elif task == 'bboxes':
- instances = self.parse_description_with_bboxes_from_text_and_spans(
+ parsed_dict["polygons"] = instances
+ elif task == "bboxes":
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
allow_empty_phrase=True,
)
- parsed_dict['bboxes'] = instances
- elif task == 'description_with_bboxes_or_polygons':
- if '' in text:
+ parsed_dict["bboxes"] = instances
+ elif task == "description_with_bboxes_or_polygons":
+ if "" in text:
# only support either polygons or bboxes, not both at the same time
- instances = self.parse_description_with_polygons_from_text_and_spans(
+ instances = self.parse_description_with_polygons_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
)
else:
- instances = self.parse_description_with_bboxes_from_text_and_spans(
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
text,
pattern=pattern,
image_size=image_size,
)
- parsed_dict['description_with_bboxes_or_polygons'] = instances
+ parsed_dict["description_with_bboxes_or_polygons"] = instances
else:
raise ValueError("task {} is not supported".format(task))
- return parsed_dict
\ No newline at end of file
+ return parsed_dict
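For orientation, the bin-to-pixel mapping that box_quantizer.dequantize performs in the hunks above can be written out as a small standalone sketch. The 1000-bin defaults and the "floor" mode come from the config lookups at the top of this hunk; the exact bin-centre rounding is an assumption of the sketch, not something the diff spells out.

    import torch

    def dequantize_boxes(bbox_bins, image_size, bins=(1000, 1000)):
        # Map integer location bins back to pixel coordinates (sketch only).
        width, height = image_size
        bins_w, bins_h = bins
        per_bin_w = width / bins_w
        per_bin_h = height / bins_h
        boxes = torch.tensor(bbox_bins, dtype=torch.float32)
        # Assumed "floor" behaviour: a bin index maps to the centre of its bin.
        xmin = (boxes[..., 0] + 0.5) * per_bin_w
        ymin = (boxes[..., 1] + 0.5) * per_bin_h
        xmax = (boxes[..., 2] + 0.5) * per_bin_w
        ymax = (boxes[..., 3] + 0.5) * per_bin_h
        return torch.stack([xmin, ymin, xmax, ymax], dim=-1).tolist()

    # dequantize_boxes([[10, 20, 500, 600]], image_size=(640, 480))
    # -> roughly [[6.72, 9.84, 320.32, 288.24]]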
From eaa2e7b6b976e8ad93a352643dc467ff620a76c8 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 16:15:44 +0200
Subject: [PATCH 08/35] fix florence2 modeling, rm einops.rearrange dep
---
src/transformers/models/florence/modeling_florence2.py | 10 +---------
1 file changed, 1 insertion(+), 9 deletions(-)
diff --git a/src/transformers/models/florence/modeling_florence2.py b/src/transformers/models/florence/modeling_florence2.py
index c6e09c303ed9..a4a7a5d0aafb 100644
--- a/src/transformers/models/florence/modeling_florence2.py
+++ b/src/transformers/models/florence/modeling_florence2.py
@@ -41,8 +41,6 @@
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
-
-# from einops import rearrange
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
@@ -211,7 +209,7 @@ def __init__(self, norm, fn, drop_path=None):
def forward(self, x, *args, **kwargs):
shortcut = x
- if self.norm != None:
+ if self.norm is not None:
x, size = self.fn(self.norm(x), *args, **kwargs)
else:
x, size = self.fn(x, *args, **kwargs)
@@ -293,17 +291,11 @@ def forward(self, x, size):
if len(x.size()) == 3:
if self.norm and self.pre_norm:
x = self.norm(x)
- # x = rearrange(
- # x, 'b (h w) c -> b c h w',
- # h=H, w=W
- # )
x = x.view(-1, H, W, x.size(-1)).permute(0, 3, 1, 2)
x = self.proj(x)
_, _, H, W = x.shape
- # TODO: check if this is correct
- # x = rearrange(x, 'b c h w -> b (h w) c')
x = x.permute(0, 2, 3, 1).contiguous()
B, H, W, C = x.size()
x = x.view(B, -1, C)
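The removed rearrange calls are replaced by explicit view/permute bookkeeping; a self-contained check of the equivalence (shapes are arbitrary, chosen only for the sketch):

    import torch

    B, H, W, C = 2, 8, 8, 64
    x = torch.randn(B, H * W, C)

    # Previously: rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)
    y = x.view(B, H, W, C).permute(0, 3, 1, 2)

    # Previously: rearrange(y, 'b c h w -> b (h w) c')
    z = y.permute(0, 2, 3, 1).contiguous().view(B, -1, C)

    assert torch.equal(x, z)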
From 492d30f2373469e8d1d4bb4d039f46fc13183926 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 16:25:54 +0200
Subject: [PATCH 09/35] fix florence2 config, make fixup
---
src/transformers/models/florence/configuration_florence2.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/src/transformers/models/florence/configuration_florence2.py b/src/transformers/models/florence/configuration_florence2.py
index 643d6378dfa5..3b8467187571 100644
--- a/src/transformers/models/florence/configuration_florence2.py
+++ b/src/transformers/models/florence/configuration_florence2.py
@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import warnings
-
-""" Florence-2 configuration"""
+"""Florence-2 configuration"""
+import warnings
from ...configuration_utils import PretrainedConfig
from ...utils import logging
From 5b2737b3cafe96ff82561ceb798827ae1adb54aa Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 16:34:55 +0200
Subject: [PATCH 10/35] fix florence2 processing, make fixup
---
.../models/florence/processing_florence2.py | 210 +++++++++---------
1 file changed, 104 insertions(+), 106 deletions(-)
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index b6531de62945..51ce3be8f6dc 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -564,110 +564,108 @@ def _create_black_list_of_phrase_grounding(self):
"phrase_grounding" in self.parse_tasks
and self.parse_tasks_configs["phrase_grounding"]["FILTER_BY_BLACK_LIST"]
):
- black_list = set(
- [
- "it",
- "I",
- "me",
- "mine",
- "you",
- "your",
- "yours",
- "he",
- "him",
- "his",
- "she",
- "her",
- "hers",
- "they",
- "them",
- "their",
- "theirs",
- "one",
- "oneself",
- "we",
- "us",
- "our",
- "ours",
- "you",
- "your",
- "yours",
- "they",
- "them",
- "their",
- "theirs",
- "mine",
- "yours",
- "his",
- "hers",
- "its",
- "ours",
- "yours",
- "theirs",
- "myself",
- "yourself",
- "himself",
- "herself",
- "itself",
- "ourselves",
- "yourselves",
- "themselves",
- "this",
- "that",
- "these",
- "those",
- "who",
- "whom",
- "whose",
- "which",
- "what",
- "who",
- "whom",
- "whose",
- "which",
- "that",
- "all",
- "another",
- "any",
- "anybody",
- "anyone",
- "anything",
- "each",
- "everybody",
- "everyone",
- "everything",
- "few",
- "many",
- "nobody",
- "none",
- "one",
- "several",
- "some",
- "somebody",
- "someone",
- "something",
- "each other",
- "one another",
- "myself",
- "yourself",
- "himself",
- "herself",
- "itself",
- "ourselves",
- "yourselves",
- "themselves",
- "the image",
- "image",
- "images",
- "the",
- "a",
- "an",
- "a group",
- "other objects",
- "lots",
- "a set",
- ]
- )
+ black_list = {
+ "it",
+ "I",
+ "me",
+ "mine",
+ "you",
+ "your",
+ "yours",
+ "he",
+ "him",
+ "his",
+ "she",
+ "her",
+ "hers",
+ "they",
+ "them",
+ "their",
+ "theirs",
+ "one",
+ "oneself",
+ "we",
+ "us",
+ "our",
+ "ours",
+ "you",
+ "your",
+ "yours",
+ "they",
+ "them",
+ "their",
+ "theirs",
+ "mine",
+ "yours",
+ "his",
+ "hers",
+ "its",
+ "ours",
+ "yours",
+ "theirs",
+ "myself",
+ "yourself",
+ "himself",
+ "herself",
+ "itself",
+ "ourselves",
+ "yourselves",
+ "themselves",
+ "this",
+ "that",
+ "these",
+ "those",
+ "who",
+ "whom",
+ "whose",
+ "which",
+ "what",
+ "who",
+ "whom",
+ "whose",
+ "which",
+ "that",
+ "all",
+ "another",
+ "any",
+ "anybody",
+ "anyone",
+ "anything",
+ "each",
+ "everybody",
+ "everyone",
+ "everything",
+ "few",
+ "many",
+ "nobody",
+ "none",
+ "one",
+ "several",
+ "some",
+ "somebody",
+ "someone",
+ "something",
+ "each other",
+ "one another",
+ "myself",
+ "yourself",
+ "himself",
+ "herself",
+ "itself",
+ "ourselves",
+ "yourselves",
+ "themselves",
+ "the image",
+ "image",
+ "images",
+ "the",
+ "a",
+ "an",
+ "a group",
+ "other objects",
+ "lots",
+ "a set",
+ }
return black_list
@@ -826,8 +824,8 @@ def parse_ocr_from_text_and_spans(
)
if area_threshold > 0:
- x_coords = [i for i in quad_box[0::2]]
- y_coords = [i for i in quad_box[1::2]]
+ x_coords = list(quad_box[0::2])
+ y_coords = list(quad_box[1::2])
# apply the Shoelace formula
area = 0.5 * abs(
From 7cef9d9cb15d4363eb705ce5f6f15e6e95a66382 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 17:27:03 +0200
Subject: [PATCH 11/35] fix florence2 processing, make repo-consistency
---
src/transformers/models/florence/processing_florence2.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index 51ce3be8f6dc..60b9c39bd049 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -307,7 +307,7 @@ def __call__(
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
def batch_decode(self, *args, **kwargs):
"""
- This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ This method forwards all its arguments to Florence2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
@@ -315,7 +315,7 @@ def batch_decode(self, *args, **kwargs):
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
def decode(self, *args, **kwargs):
"""
- This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ This method forwards all its arguments to Florence2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
From 8875899b5cb848938715ee00d31807c520238673 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 17:49:28 +0200
Subject: [PATCH 12/35] fix processing, add PILImageResampling
---
src/transformers/models/florence/processing_florence2.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index 60b9c39bd049..f37fb9d3c89e 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -24,7 +24,7 @@
import torch
from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ChannelDimension, ImageInput, is_valid_image
+from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, is_valid_image
from ...models.bart.tokenization_bart import BartTokenizer
from ...models.bart.tokenization_bart_fast import BartTokenizerFast
from ...models.t5.tokenization_t5 import T5Tokenizer
@@ -191,11 +191,11 @@ def __call__(
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
- data_format: Optional[ChannelDimension] = "channels_first", # noqa: F821
+ data_format: Optional[ChannelDimension] = "channels_first",
input_data_format: Optional[
- Union[str, ChannelDimension] # noqa: F821
+ Union[str, ChannelDimension]
] = None,
- resample: "PILImageResampling" = None, # noqa: F821
+ resample: PILImageResampling = None, # noqa: F821 # type: ignore
do_convert_rgb: bool = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
From 86d0862043fa89b838fa435dd84d4c12b75f9e8f Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:01:01 +0200
Subject: [PATCH 13/35] fix import of optional dependencies
---
src/transformers/models/florence/__init__.py | 21 ++++++++++++++++---
.../models/florence/processing_florence2.py | 17 +++++++++------
2 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence/__init__.py
index 262e7173ca64..693c2700e0e5 100644
--- a/src/transformers/models/florence/__init__.py
+++ b/src/transformers/models/florence/__init__.py
@@ -17,6 +17,7 @@
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
+ is_vision_available,
)
@@ -27,11 +28,18 @@
"Florence2VisionConfig",
],
"processing_florence2": [
- "Florence2PostProcesser",
- "Florence2Processor",
+ "Florence2PostProcesser"
],
}
+try:
+ if not is_vision_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["processing_florence2"].append("Florence2Processor")
+
try:
if not is_torch_available():
@@ -55,9 +63,16 @@
)
from .processing_florence2 import (
Florence2PostProcesser,
- Florence2Processor,
)
+ try:
+ if not is_vision_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .processing_florence2 import Florence2Processor
+
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index f37fb9d3c89e..1d04201b324a 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -24,7 +24,7 @@
import torch
from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, is_valid_image
+from ...image_utils import ChannelDimension, ImageInput, is_valid_image
from ...models.bart.tokenization_bart import BartTokenizer
from ...models.bart.tokenization_bart_fast import BartTokenizerFast
from ...models.t5.tokenization_t5 import T5Tokenizer
@@ -36,7 +36,14 @@
TextInput,
TruncationStrategy,
)
-from ...utils import TensorType
+from ...utils import (
+ TensorType,
+ is_vision_available,
+)
+
+
+if is_vision_available():
+ from ...image_utils import PILImageResampling
logger = logging.getLogger(__name__)
@@ -192,10 +199,8 @@ def __call__(
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = "channels_first",
- input_data_format: Optional[
- Union[str, ChannelDimension]
- ] = None,
- resample: PILImageResampling = None, # noqa: F821 # type: ignore
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ resample: PILImageResampling = None, # noqa: F821
do_convert_rgb: bool = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
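The resulting guard follows the repository's standard optional-dependency pattern; condensed into a standalone sketch (the florence module path only exists once this PR is applied):

    from transformers.utils import OptionalDependencyNotAvailable, is_vision_available

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # Pillow is missing: the processor is simply not exported, and the
        # dummy object defined for the "vision" backend takes its place.
        pass
    else:
        from transformers.models.florence.processing_florence2 import Florence2Processor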
From 3765e739cb9e8493e99f456d7501aca0835ff508 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:18:15 +0200
Subject: [PATCH 14/35] add florence2 to models.__init__
---
src/transformers/models/__init__.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 24b602f18c8f..83f6427d8710 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -86,6 +86,7 @@
fastspeech2_conformer,
flaubert,
flava,
+ florence,
fnet,
focalnet,
fsmt,
From 3f915bdafa4eecc8824484bb859a5a2f4298636f Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:18:35 +0200
Subject: [PATCH 15/35] add florence2 to transformers.__init__
---
src/transformers/__init__.py | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 4976a4a1b90e..df678da14048 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -423,6 +423,11 @@
"FlavaMultimodalConfig",
"FlavaTextConfig",
],
+ "models.florence": [
+ "Florence2Config",
+ "Florence2LanguageConfig",
+ "Florence2VisionConfig",
+ ],
"models.fnet": ["FNetConfig"],
"models.focalnet": ["FocalNetConfig"],
"models.fsmt": [
@@ -1130,6 +1135,7 @@
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
+ _import_structure["models.florence"].extend(["Florence2PostProcesser", "Florence2Processor"])
_import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
_import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
_import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
@@ -2119,6 +2125,14 @@
"FlavaTextModel",
]
)
+ _import_structure["models.florence"].extend(
+ [
+ "Florence2PreTrainedModel",
+ "Florence2VisionModel",
+ "Florence2VisionModelWithProjection",
+ "Florence2ForConditionalGeneration",
+ ]
+ )
_import_structure["models.fnet"].extend(
[
"FNetForMaskedLM",
@@ -5013,6 +5027,11 @@
FlavaMultimodalConfig,
FlavaTextConfig,
)
+ from .models.florence import (
+ Florence2Config,
+ Florence2LanguageConfig,
+ Florence2VisionConfig,
+ )
from .models.fnet import FNetConfig
from .models.focalnet import FocalNetConfig
from .models.fsmt import (
@@ -5751,6 +5770,7 @@
FlavaImageProcessor,
FlavaProcessor,
)
+ from .models.florence import Florence2Processor, Florence2PostProcesser
from .models.fuyu import FuyuImageProcessor, FuyuProcessor
from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
from .models.grounding_dino import GroundingDinoImageProcessor
@@ -6593,6 +6613,12 @@
FlavaPreTrainedModel,
FlavaTextModel,
)
+ from .models.florence import (
+ Florence2ForConditionalGeneration,
+ Florence2PreTrainedModel,
+ Florence2VisionModel,
+ Florence2VisionModelWithProjection,
+ )
from .models.fnet import (
FNetForMaskedLM,
FNetForMultipleChoice,
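These _import_structure entries feed the package's lazy-import machinery, so nothing under models.florence is imported until one of the new names is first accessed. Roughly, the mechanism being extended looks like this (a sketch of the existing pattern run as a module file, not code added by the patch):

    import sys

    from transformers.utils import _LazyModule

    _import_structure = {
        "models.florence": ["Florence2Config", "Florence2VisionConfig"],
    }

    # The top-level __init__ replaces itself with a _LazyModule that resolves
    # the listed names on first attribute access.
    sys.modules[__name__] = _LazyModule(
        __name__, globals()["__file__"], _import_structure, module_spec=__spec__
    )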
From bdb3706b8fc656807a80e72ceb9a2259f42dcadf Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:21:11 +0200
Subject: [PATCH 16/35] make fix-copies
---
.../florence/configuration_florence2.py | 63 ++++++++++---------
src/transformers/utils/dummy_pt_objects.py | 28 +++++++++
.../utils/dummy_vision_objects.py | 14 +++++
3 files changed, 76 insertions(+), 29 deletions(-)
diff --git a/src/transformers/models/florence/configuration_florence2.py b/src/transformers/models/florence/configuration_florence2.py
index 3b8467187571..5206ddb460e0 100644
--- a/src/transformers/models/florence/configuration_florence2.py
+++ b/src/transformers/models/florence/configuration_florence2.py
@@ -35,23 +35,23 @@ class Florence2VisionConfig(PretrainedConfig):
Args:
drop_path_rate (`float`, *optional*, defaults to 0.1):
The dropout rate of the drop path layer.
- patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
+ patch_size (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
The patch size of the image.
- patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
+ patch_stride (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
The patch stride of the image.
- patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
+ patch_padding (`List[int]`, *optional*, defaults to `[3, 1, 1, 1]`):
The patch padding of the image.
- patch_prenorm (`List[bool]`, *optional*, defaults to [false, true, true, true]):
+ patch_prenorm (`List[bool]`, *optional*, defaults to `[False, True, True, True]`):
Whether to apply layer normalization before the patch embedding layer.
- enable_checkpoint (`bool`, *optional*, defaults to False):
+ enable_checkpoint (`bool`, *optional*, defaults to `False`):
Whether to enable checkpointing.
- dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
+ dim_embed (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
The dimension of the embedding layer.
- num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+ num_heads (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
The number of attention heads.
- num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+ num_groups (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
The number of groups.
- depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
+ depths (`List[int]`, *optional*, defaults to `[1, 1, 9, 1]`):
The depth of the model.
window_size (`int`, *optional*, defaults to 12):
The window size of the model.
@@ -61,7 +61,7 @@ class Florence2VisionConfig(PretrainedConfig):
The configuration of the visual temporal embedding.
image_pos_embed (`dict`, *optional*):
The configuration of the image position embedding.
- image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
+ image_feature_source (`List[str]`, *optional*, defaults to `['spatial_avg_pool', 'temporal_avg_pool']`):
The source of the image feature.
Example:
@@ -134,48 +134,53 @@ class Florence2LanguageConfig(PretrainedConfig):
vocab_size (`int`, *optional*, defaults to 51289):
Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Florence2LanguageModel`].
- d_model (`int`, *optional*, defaults to 1024):
- Dimensionality of the layers and the pooler layer.
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (`int`, *optional*, defaults to 12):
- Number of decoder layers.
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (`int`, *optional*, defaults to 16):
- Number of attention heads for each attention layer in the Transformer decoder.
+ decoder_layers (`int`, *optional*, defaults to 12):
+ Number of decoder layers.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (`int`, *optional*, defaults to 4096):
- Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+ for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+ for more details.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
+ d_model (`int`, *optional*, defaults to 1024):
+ Dimensionality of the layers and the pooler layer.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for classifier.
- max_position_embeddings (`int`, *optional*, defaults to 1024):
- The maximum sequence length that this model might ever be used with. Typically set this to something large
- just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop (`float`, *optional*, defaults to 0.0):
- The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
- for more details.
- decoder_layerdrop (`float`, *optional*, defaults to 0.0):
- The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
- for more details.
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for classifier.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
num_labels (`int`, *optional*, defaults to 3):
The number of labels to use in [`Florence2LanguageForSequenceClassification`].
+ pad_token_id (``, *optional*, defaults to 1):
+ bos_token_id (``, *optional*, defaults to 0):
+ eos_token_id (``, *optional*, defaults to 2):
+ is_encoder_decoder (``, *optional*, defaults to `True`):
+ decoder_start_token_id (``, *optional*, defaults to 2):
forced_eos_token_id (`int`, *optional*, defaults to 2):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 0cda4ed7b963..5660963802fc 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3955,6 +3955,34 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class Florence2ForConditionalGeneration(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Florence2PreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Florence2VisionModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Florence2VisionModelWithProjection(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class FNetForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index a27dc024447f..591c46cb1b5b 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -240,6 +240,20 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
+class Florence2Processor(metaclass=DummyObject):
+ _backends = ["vision"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["vision"])
+
+
+class Florence2PostProcesser(metaclass=DummyObject):
+ _backends = ["vision"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["vision"])
+
+
class FuyuImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
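The generated dummies rely on the DummyObject metaclass, which turns any use of a class whose backend is missing into an informative ImportError. A small sketch of the behaviour (the class name here is a placeholder, not one of the generated ones):

    from transformers.utils import DummyObject, requires_backends

    class ExampleDummy(metaclass=DummyObject):
        _backends = ["torch"]

        def __init__(self, *args, **kwargs):
            requires_backends(self, ["torch"])

    # With torch absent, both the constructor and class-level access such as
    # ExampleDummy.from_pretrained(...) raise an ImportError naming the
    # backend to install, instead of an opaque failure further down the line.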
From 2d81135826977a0f790eca95c2dff2a0bcead0bd Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:34:58 +0200
Subject: [PATCH 17/35] fix formatting & typo
---
src/transformers/__init__.py | 6 +++---
src/transformers/models/florence/__init__.py | 6 ++----
src/transformers/models/florence/processing_florence2.py | 4 ++--
src/transformers/utils/dummy_vision_objects.py | 4 ++--
4 files changed, 9 insertions(+), 11 deletions(-)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index df678da14048..5733bcda94bc 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1135,7 +1135,7 @@
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
- _import_structure["models.florence"].extend(["Florence2PostProcesser", "Florence2Processor"])
+ _import_structure["models.florence"].extend(["Florence2PostProcessor", "Florence2Processor"])
_import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
_import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
_import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
@@ -2127,10 +2127,10 @@
)
_import_structure["models.florence"].extend(
[
+ "Florence2ForConditionalGeneration",
"Florence2PreTrainedModel",
"Florence2VisionModel",
"Florence2VisionModelWithProjection",
- "Florence2ForConditionalGeneration",
]
)
_import_structure["models.fnet"].extend(
@@ -5770,7 +5770,7 @@
FlavaImageProcessor,
FlavaProcessor,
)
- from .models.florence import Florence2Processor, Florence2PostProcesser
+ from .models.florence import Florence2PostProcessor, Florence2Processor
from .models.fuyu import FuyuImageProcessor, FuyuProcessor
from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
from .models.grounding_dino import GroundingDinoImageProcessor
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence/__init__.py
index 693c2700e0e5..f3dfbf59eba6 100644
--- a/src/transformers/models/florence/__init__.py
+++ b/src/transformers/models/florence/__init__.py
@@ -27,9 +27,7 @@
"Florence2LanguageConfig",
"Florence2VisionConfig",
],
- "processing_florence2": [
- "Florence2PostProcesser"
- ],
+ "processing_florence2": ["Florence2PostProcessor"],
}
try:
@@ -62,7 +60,7 @@
Florence2VisionConfig,
)
from .processing_florence2 import (
- Florence2PostProcesser,
+ Florence2PostProcessor,
)
try:
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index 1d04201b324a..3c4568609960 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -163,7 +163,7 @@ def __init__(
"": "What text is in the region {input}?",
}
- self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
+ self.post_processor = Florence2PostProcessor(tokenizer=tokenizer)
super().__init__(image_processor, tokenizer)
@@ -515,7 +515,7 @@ def dequantize(self, coordinates: torch.Tensor, size):
return dequantized_coordinates
-class Florence2PostProcesser(object):
+class Florence2PostProcessor(object):
r"""
Florence-2 post process for converting text prediction to various tasks results.
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 591c46cb1b5b..7355881d81a0 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -240,14 +240,14 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class Florence2Processor(metaclass=DummyObject):
+class Florence2PostProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class Florence2PostProcesser(metaclass=DummyObject):
+class Florence2Processor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
From e9f86d077591aca2c3d19784d30018f08c33ceaf Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:51:33 +0200
Subject: [PATCH 18/35] fix __init__s & dummies
---
src/transformers/__init__.py | 6 ++++++
src/transformers/models/florence/__init__.py | 6 ++++++
src/transformers/utils/dummy_pt_objects.py | 21 ++++++++++++++++++++
3 files changed, 33 insertions(+)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 5733bcda94bc..f08fd93b1916 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2128,6 +2128,9 @@
_import_structure["models.florence"].extend(
[
"Florence2ForConditionalGeneration",
+ "Florence2LanguageForConditionalGeneration",
+ "Florence2LanguageModel",
+ "Florence2LanguagePreTrainedModel",
"Florence2PreTrainedModel",
"Florence2VisionModel",
"Florence2VisionModelWithProjection",
@@ -6615,6 +6618,9 @@
)
from .models.florence import (
Florence2ForConditionalGeneration,
+ Florence2LanguageForConditionalGeneration,
+ Florence2LanguageModel,
+ Florence2LanguagePreTrainedModel,
Florence2PreTrainedModel,
Florence2VisionModel,
Florence2VisionModelWithProjection,
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence/__init__.py
index f3dfbf59eba6..bcde843c77c4 100644
--- a/src/transformers/models/florence/__init__.py
+++ b/src/transformers/models/florence/__init__.py
@@ -50,6 +50,9 @@
"Florence2VisionModel",
"Florence2VisionModelWithProjection",
"Florence2ForConditionalGeneration",
+ "Florence2LanguageModel",
+ "Florence2LanguagePreTrainedModel",
+ "Florence2LanguageForConditionalGeneration",
]
@@ -79,6 +82,9 @@
else:
from .modeling_florence2 import (
Florence2ForConditionalGeneration,
+ Florence2LanguageForConditionalGeneration,
+ Florence2LanguageModel,
+ Florence2LanguagePreTrainedModel,
Florence2PreTrainedModel,
Florence2VisionModel,
Florence2VisionModelWithProjection,
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 5660963802fc..f15adc92395c 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3962,6 +3962,27 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class Florence2LanguageForConditionalGeneration(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Florence2LanguageModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Florence2LanguagePreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class Florence2PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
From 172a1dd0e26fb9115a8b5aed3d29b6ae3bb1d911 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 19:47:40 +0200
Subject: [PATCH 19/35] add tests florence2 processing
---
.../florence/test_processing_florence2.py | 153 ++++++++++++++++++
1 file changed, 153 insertions(+)
create mode 100644 tests/models/florence/test_processing_florence2.py
diff --git a/tests/models/florence/test_processing_florence2.py b/tests/models/florence/test_processing_florence2.py
new file mode 100644
index 000000000000..d114b4ef5f17
--- /dev/null
+++ b/tests/models/florence/test_processing_florence2.py
@@ -0,0 +1,153 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import random
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+import pytest
+
+from transformers import BartTokenizer, BartTokenizerFast
+from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES # TODO can I use this?
+from transformers.testing_utils import require_vision
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import CLIPImageProcessor, Florence2Processor
+
+@require_vision
+class Florence2ProcessorTest(unittest.TestCase):
+ def setUp(self):
+ self.tmpdirname = tempfile.mkdtemp()
+
+ vocab_tokens = [] # TODO: add vocab tokens
+ self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+
+ with open(self.vocab_file, "w", encoding="utf-8") as fp:
+ fp.write("".join([x + "\n" for x in vocab_tokens]))
+
+ self.processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
+
+ def get_tokenizer(self, **kwargs):
+ return BartTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+ def get_rust_tokenizer(self, **kwargs):
+ return BartTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+
+ def get_image_processor(self, **kwargs):
+ return CLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+ def tearDown(self):
+ shutil.rmtree(self.tmpdirname)
+
+ def prepare_image_inputs(self):
+ """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
+ or a list of PyTorch tensors if one specifies torchify=True.
+ """
+ image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
+
+ image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
+
+ return image_inputs
+
+ def test_save_load_pretrained_default(self):
+ tokenizer_slow = self.get_tokenizer()
+ tokenizer_fast = self.get_rust_tokenizer()
+ image_processor = self.get_image_processor()
+
+ processor_slow = Florence2Processor(tokenizer=tokenizer_slow, image_processor=image_processor)
+ processor_slow.save_pretrained(self.tmpdirname)
+ processor_slow = Florence2Processor.from_pretrained(self.tmpdirname)
+
+ processor_fast = Florence2Processor(tokenizer=tokenizer_fast, image_processor=image_processor)
+ processor_fast.save_pretrained(self.tmpdirname)
+ processor_fast = Florence2Processor.from_pretrained(self.tmpdirname)
+
+ self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab())
+ self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab())
+ self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab())
+ self.assertIsInstance(processor_slow.tokenizer, BartTokenizer)
+ self.assertIsInstance(processor_fast.tokenizer, BartTokenizerFast)
+
+ self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
+ self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
+ self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor)
+ self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor)
+
+ def test_save_load_pretrained_additional_features(self):
+ processor = Florence2Processor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
+ processor.save_pretrained(self.tmpdirname)
+
+ tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
+ image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
+
+ processor = Florence2Processor.from_pretrained(
+ self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
+ )
+
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
+ self.assertIsInstance(processor.tokenizer, BartTokenizer)
+
+ self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
+ self.assertIsInstance(processor.image_processor, CLIPImageProcessor)
+
+ def test_image_processor(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+ processor = Florence2Processor(tokenizer=tokenizer, image_processor=image_processor)
+
+ image_inputs = self.prepare_image_inputs()
+
+ input_feat_extract = image_processor(image_inputs, return_tensors="np")
+ input_processor = processor(images=image_inputs, return_tensors="np")
+
+ for key in input_feat_extract.keys():
+ self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
+
+ def test_tokenizer(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+ processor = Florence2Processor(tokenizer=tokenizer, image_processor=image_processor)
+
+ input_str = "huggingface is cool"
+
+ encoded_processor = processor(text=input_str)
+ encoded_tok = tokenizer(input_str)
+ for key in encoded_tok.keys():
+ self.assertListEqual(encoded_processor[key], encoded_tok[key])
+
+
+ def test_processor(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+
+ processor = Florence2Processor(tokenizer=tokenizer, image_processor=image_processor)
+
+ input_str = "huggingface is cool"
+ image_inputs = self.prepare_image_inputs()
+
+ encoded_processor = processor(text=input_str, images=image_inputs)
+
+ self.assertListEqual(list(encoded_processor.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
+
+ # test if it raises when no input is passed
+ with pytest.raises(ValueError):
+ processor()
+
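One gap worth noting in the fixture above: BartTokenizer.from_pretrained expects a GPT-2/BART-style vocab.json plus a merges.txt, while setUp currently writes newline-separated tokens and leaves vocab_tokens empty. A hedged sketch of what the fixture would typically seed (token set and merges are placeholders, not the real Florence-2 vocabulary):

    import json
    import os
    import tempfile

    from transformers.models.bart.tokenization_bart import VOCAB_FILES_NAMES

    tmpdirname = tempfile.mkdtemp()
    vocab_tokens = ["<s>", "<pad>", "</s>", "<unk>", "l", "o", "w", "e", "r", "lo", "low", "er", "lower"]
    merges = ["#version: 0.2", "l o", "lo w", "e r"]

    with open(os.path.join(tmpdirname, VOCAB_FILES_NAMES["vocab_file"]), "w", encoding="utf-8") as fp:
        json.dump(dict(zip(vocab_tokens, range(len(vocab_tokens)))), fp)
    with open(os.path.join(tmpdirname, VOCAB_FILES_NAMES["merges_file"]), "w", encoding="utf-8") as fp:
        fp.write("\n".join(merges))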
From 087ef0e9e05a49d284ae5d3ed3aba104e601989d Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 23:53:50 +0200
Subject: [PATCH 20/35] add tests florence2 modeling
---
.../florence/test_modeling_florence2.py | 234 ++++++++++++++++++
1 file changed, 234 insertions(+)
create mode 100644 tests/models/florence/test_modeling_florence2.py
diff --git a/tests/models/florence/test_modeling_florence2.py b/tests/models/florence/test_modeling_florence2.py
new file mode 100644
index 000000000000..6823b4918120
--- /dev/null
+++ b/tests/models/florence/test_modeling_florence2.py
@@ -0,0 +1,234 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import random
+import requests
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+import pytest
+
+from transformers import (
+ Florence2VisionConfig,
+ Florence2LanguageConfig,
+ Florence2Config,
+)
+
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+ ModelTesterMixin,
+)
+
+
+if is_torch_available():
+ import torch
+ from torch import nn
+
+ from transformers import (
+ Florence2LanguageModel,
+ Florence2VisionModelWithProjection,
+ Florence2LanguageForConditionalGeneration,
+ Florence2VisionModel,
+ Florence2ForConditionalGeneration,
+ Florence2LanguagePreTrainedModel,
+ Florence2PreTrainedModel,
+ )
+else:
+ torch = {}
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import Florence2Processor
+
+MODEL_ID = "microsoft/Florence-2-base-ft"
+
+class Florence2LanguageModelTester:
+ def __init__(self):
+ pass
+
+@require_torch
+class Florence2LanguageModelTest(ModelTesterMixin, unittest.TestCase):
+ def setUp(self):
+ self.model_tester = Florence2LanguageModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=Florence2LanguageConfig)
+
+ @slow
+ def test_model_from_pretrained(self):
+ model = Florence2LanguageModel.from_pretrained(MODEL_ID)
+ self.assertIsNotNone(model)
+
+
+class Florence2VisionModelTester:
+ def __init__(self):
+ pass
+
+@require_torch
+class Florence2VisionModelTest(ModelTesterMixin, unittest.TestCase):
+ def setUp(self):
+ self.model_tester = Florence2VisionModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=Florence2VisionConfig)
+
+ @slow
+ def test_model_from_pretrained(self):
+ model = Florence2VisionModel.from_pretrained(MODEL_ID)
+ self.assertIsNotNone(model)
+
+
+class Florence2VisionModelWithProjectionTester:
+ def __init__(self):
+ pass
+
+@require_torch
+class Florence2VisionModelWithProjectionTest(ModelTesterMixin, unittest.TestCase):
+ def setUp(self):
+ self.model_tester = Florence2VisionModelWithProjectionTester(self)
+ self.config_tester = ConfigTester(self, config_class=Florence2VisionConfig)
+
+ @slow
+ def test_model_from_pretrained(self):
+ model = Florence2VisionModelWithProjection.from_pretrained(MODEL_ID)
+ self.assertIsNotNone(model)
+
+
+class Florence2ForConditionalGenerationTester:
+ def __init__(self):
+ pass
+
+@require_torch
+class Florence2ForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
+ def setUp(self):
+ self.model_tester = Florence2ForConditionalGenerationTester(self)
+ self.config_tester = ConfigTester(self, config_class=Florence2Config)
+
+ @slow
+ def test_model_from_pretrained(self):
+ model = Florence2ForConditionalGeneration.from_pretrained(MODEL_ID)
+ self.assertIsNotNone(model)
+
+
+class Florence2LanguageForConditionalGenerationTester:
+ def __init__(self):
+ pass
+
+@require_torch
+class Florence2LanguageForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
+ def setUp(self):
+ self.model_tester = Florence2LanguageForConditionalGenerationTester(self)
+ self.config_tester = ConfigTester(self, config_class=Florence2LanguageConfig)
+
+ @slow
+ def test_model_from_pretrained(self):
+ model = Florence2LanguageForConditionalGeneration.from_pretrained(MODEL_ID)
+ self.assertIsNotNone(model)
+
+
+def prepare_img():
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
+ im = Image.open(requests.get(url, stream=True).raw)
+ return im
+
+
+@require_vision
+@require_torch
+class Florence2LanguageModelIntegrationTest(unittest.TestCase):
+ @slow
+ def test_inference(self):
+ model = Florence2LanguageModel.from_pretrained(MODEL_ID).to(torch_device)
+ processor = Florence2Processor.from_pretrained(MODEL_ID)
+
+ img = prepare_img()
+ inputs = processor(img, return_tensors="pt").to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # TODO: write test test condition
+ #self.assertEqual(outputs.last_hidden_state.shape, (1, 1, 768))
+
+
+@require_vision
+@require_torch
+class Florence2VisionModelIntegrationTest(unittest.TestCase):
+ @slow
+ def test_inference(self):
+ model = Florence2VisionModel.from_pretrained(MODEL_ID).to(torch_device)
+ processor = Florence2Processor.from_pretrained(MODEL_ID)
+
+ img = prepare_img()
+ inputs = processor(img, return_tensors="pt").to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # TODO: write test condition
+
+
+@require_vision
+@require_torch
+class Florence2VisionModelWithProjectionIntegrationTest(unittest.TestCase):
+ @slow
+ def test_inference(self):
+ model = Florence2VisionModelWithProjection.from_pretrained(MODEL_ID).to(torch_device)
+ processor = Florence2Processor.from_pretrained(MODEL_ID)
+
+ img = prepare_img()
+ inputs = processor(img, return_tensors="pt").to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # TODO: write test condition
+
+
+@require_vision
+@require_torch
+class Florence2ForConditionalGenerationIntegrationTest(unittest.TestCase):
+ @slow
+ def test_inference(self):
+ model = Florence2ForConditionalGeneration.from_pretrained(MODEL_ID).to(torch_device)
+ processor = Florence2Processor.from_pretrained(MODEL_ID)
+
+ img = prepare_img()
+ inputs = processor(img, return_tensors="pt").to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # TODO: write test condition
+
+
+@require_vision
+@require_torch
+class Florence2LanguageForConditionalGenerationIntegrationTest(unittest.TestCase):
+ @slow
+ def test_inference(self):
+ model = Florence2LanguageForConditionalGeneration.from_pretrained(MODEL_ID).to(torch_device)
+ processor = Florence2Processor.from_pretrained(MODEL_ID)
+
+ img = prepare_img()
+ inputs = processor(img, return_tensors="pt").to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # TODO: write test condition
+
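Note: the integration tests above still carry "# TODO: write test condition". As a point of reference, the body of test_inference in Florence2ForConditionalGenerationIntegrationTest could be extended along these lines; this is only a sketch that assumes the prompt-based interface documented on the Florence-2 model card (task token, generate with pixel_values, batch_decode), not code from this patch series:

    img = prepare_img()
    task_prompt = "<OD>"  # object-detection task token; assumption taken from the model card
    inputs = processor(text=task_prompt, images=img, return_tensors="pt").to(torch_device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=128,
            num_beams=3,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    self.assertIsInstance(generated_text, str)
    self.assertGreater(len(generated_text), 0)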
From fb234ef24d8b20f2b6b9bba92a89fd095e6511bd Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Thu, 20 Jun 2024 23:58:16 +0200
Subject: [PATCH 21/35] fix tests florence2 processing vocabfile
---
tests/models/florence/test_processing_florence2.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/models/florence/test_processing_florence2.py b/tests/models/florence/test_processing_florence2.py
index d114b4ef5f17..a9dba89aed5a 100644
--- a/tests/models/florence/test_processing_florence2.py
+++ b/tests/models/florence/test_processing_florence2.py
@@ -23,7 +23,7 @@
import pytest
from transformers import BartTokenizer, BartTokenizerFast
-from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES # TODO can I use this?
+from transformers.models.bart.tokenization_bart import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
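Note: the swap above works because the BART and RoBERTa tokenizers share the same byte-level BPE file layout; assuming the standard constants in transformers, the imported mapping is simply:

    from transformers.models.bart.tokenization_bart import VOCAB_FILES_NAMES

    # Assumed contents (byte-level BPE tokenizers in transformers use this layout):
    # {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
    print(VOCAB_FILES_NAMES["vocab_file"])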
From 0a615baf4870450829d93f4a794b88169b20510f Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Fri, 21 Jun 2024 03:33:10 +0200
Subject: [PATCH 22/35] fix: remove Florence2PostProcessor import
---
src/transformers/__init__.py | 4 ++--
src/transformers/models/florence/__init__.py | 6 +-----
src/transformers/utils/dummy_vision_objects.py | 7 -------
3 files changed, 3 insertions(+), 14 deletions(-)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index f08fd93b1916..77bf3bdb211a 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1135,7 +1135,7 @@
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
- _import_structure["models.florence"].extend(["Florence2PostProcessor", "Florence2Processor"])
+ _import_structure["models.florence"].extend(["Florence2Processor"])
_import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
_import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
_import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
@@ -5773,7 +5773,7 @@
FlavaImageProcessor,
FlavaProcessor,
)
- from .models.florence import Florence2PostProcessor, Florence2Processor
+ from .models.florence import Florence2Processor
from .models.fuyu import FuyuImageProcessor, FuyuProcessor
from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
from .models.grounding_dino import GroundingDinoImageProcessor
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence/__init__.py
index bcde843c77c4..0ce98bda9450 100644
--- a/src/transformers/models/florence/__init__.py
+++ b/src/transformers/models/florence/__init__.py
@@ -27,7 +27,6 @@
"Florence2LanguageConfig",
"Florence2VisionConfig",
],
- "processing_florence2": ["Florence2PostProcessor"],
}
try:
@@ -36,7 +35,7 @@
except OptionalDependencyNotAvailable:
pass
else:
- _import_structure["processing_florence2"].append("Florence2Processor")
+ _import_structure["processing_florence2"] = ["Florence2Processor"]
try:
@@ -62,9 +61,6 @@
Florence2LanguageConfig,
Florence2VisionConfig,
)
- from .processing_florence2 import (
- Florence2PostProcessor,
- )
try:
if not is_vision_available():
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 7355881d81a0..04002cae9b0b 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -240,13 +240,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class Florence2PostProcessor(metaclass=DummyObject):
- _backends = ["vision"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["vision"])
-
-
class Florence2Processor(metaclass=DummyObject):
_backends = ["vision"]
From 26e1c23eff0129bea76d394095e401b3d0e63ef0 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Fri, 21 Jun 2024 03:51:55 +0200
Subject: [PATCH 23/35] fix: remove unnecessary imports
---
src/transformers/__init__.py | 8 --
src/transformers/models/florence/__init__.py | 10 +-
src/transformers/utils/dummy_pt_objects.py | 21 -----
.../florence/test_modeling_florence2.py | 94 +++----------------
4 files changed, 12 insertions(+), 121 deletions(-)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 77bf3bdb211a..8b31ac0260a3 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -425,7 +425,6 @@
],
"models.florence": [
"Florence2Config",
- "Florence2LanguageConfig",
"Florence2VisionConfig",
],
"models.fnet": ["FNetConfig"],
@@ -2128,9 +2127,6 @@
_import_structure["models.florence"].extend(
[
"Florence2ForConditionalGeneration",
- "Florence2LanguageForConditionalGeneration",
- "Florence2LanguageModel",
- "Florence2LanguagePreTrainedModel",
"Florence2PreTrainedModel",
"Florence2VisionModel",
"Florence2VisionModelWithProjection",
@@ -5032,7 +5028,6 @@
)
from .models.florence import (
Florence2Config,
- Florence2LanguageConfig,
Florence2VisionConfig,
)
from .models.fnet import FNetConfig
@@ -6618,9 +6613,6 @@
)
from .models.florence import (
Florence2ForConditionalGeneration,
- Florence2LanguageForConditionalGeneration,
- Florence2LanguageModel,
- Florence2LanguagePreTrainedModel,
Florence2PreTrainedModel,
Florence2VisionModel,
Florence2VisionModelWithProjection,
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence/__init__.py
index 0ce98bda9450..c2a08b783125 100644
--- a/src/transformers/models/florence/__init__.py
+++ b/src/transformers/models/florence/__init__.py
@@ -24,7 +24,6 @@
_import_structure = {
"configuration_florence2": [
"Florence2Config",
- "Florence2LanguageConfig",
"Florence2VisionConfig",
],
}
@@ -45,20 +44,16 @@
pass
else:
_import_structure["modeling_florence2"] = [
+ "Florence2ForConditionalGeneration",
"Florence2PreTrainedModel",
"Florence2VisionModel",
"Florence2VisionModelWithProjection",
- "Florence2ForConditionalGeneration",
- "Florence2LanguageModel",
- "Florence2LanguagePreTrainedModel",
- "Florence2LanguageForConditionalGeneration",
]
if TYPE_CHECKING:
from .configuration_florence2 import (
Florence2Config,
- Florence2LanguageConfig,
Florence2VisionConfig,
)
@@ -78,9 +73,6 @@
else:
from .modeling_florence2 import (
Florence2ForConditionalGeneration,
- Florence2LanguageForConditionalGeneration,
- Florence2LanguageModel,
- Florence2LanguagePreTrainedModel,
Florence2PreTrainedModel,
Florence2VisionModel,
Florence2VisionModelWithProjection,
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index f15adc92395c..5660963802fc 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3962,27 +3962,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class Florence2LanguageForConditionalGeneration(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class Florence2LanguageModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class Florence2LanguagePreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class Florence2PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/tests/models/florence/test_modeling_florence2.py b/tests/models/florence/test_modeling_florence2.py
index 6823b4918120..1d2929ca6f95 100644
--- a/tests/models/florence/test_modeling_florence2.py
+++ b/tests/models/florence/test_modeling_florence2.py
@@ -25,7 +25,6 @@
from transformers import (
Florence2VisionConfig,
- Florence2LanguageConfig,
Florence2Config,
)
@@ -43,13 +42,9 @@
from torch import nn
from transformers import (
- Florence2LanguageModel,
- Florence2VisionModelWithProjection,
- Florence2LanguageForConditionalGeneration,
- Florence2VisionModel,
Florence2ForConditionalGeneration,
- Florence2LanguagePreTrainedModel,
- Florence2PreTrainedModel,
+ Florence2VisionModel,
+ Florence2VisionModelWithProjection,
)
else:
torch = {}
@@ -61,19 +56,20 @@
MODEL_ID = "microsoft/Florence-2-base-ft"
-class Florence2LanguageModelTester:
+
+class Florence2ForConditionalGenerationTester:
def __init__(self):
pass
@require_torch
-class Florence2LanguageModelTest(ModelTesterMixin, unittest.TestCase):
+class Florence2ForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
def setUp(self):
- self.model_tester = Florence2LanguageModelTester(self)
- self.config_tester = ConfigTester(self, config_class=Florence2LanguageConfig)
+ self.model_tester = Florence2ForConditionalGenerationTester(self)
+ self.config_tester = ConfigTester(self, config_class=Florence2Config)
@slow
def test_model_from_pretrained(self):
- model = Florence2LanguageModel.from_pretrained(MODEL_ID)
+ model = Florence2ForConditionalGeneration.from_pretrained(MODEL_ID)
self.assertIsNotNone(model)
@@ -108,39 +104,6 @@ def test_model_from_pretrained(self):
model = Florence2VisionModelWithProjection.from_pretrained(MODEL_ID)
self.assertIsNotNone(model)
-
-class Florence2ForConditionalGenerationTester:
- def __init__(self):
- pass
-
-@require_torch
-class Florence2ForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
- def setUp(self):
- self.model_tester = Florence2ForConditionalGenerationTester(self)
- self.config_tester = ConfigTester(self, config_class=Florence2Config)
-
- @slow
- def test_model_from_pretrained(self):
- model = Florence2ForConditionalGeneration.from_pretrained(MODEL_ID)
- self.assertIsNotNone(model)
-
-
-class Florence2LanguageForConditionalGenerationTester:
- def __init__(self):
- pass
-
-@require_torch
-class Florence2LanguageForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
- def setUp(self):
- self.model_tester = Florence2LanguageForConditionalGenerationTester(self)
- self.config_tester = ConfigTester(self, config_class=Florence2LanguageConfig)
-
- @slow
- def test_model_from_pretrained(self):
- model = Florence2LanguageForConditionalGeneration.from_pretrained(MODEL_ID)
- self.assertIsNotNone(model)
-
-
def prepare_img():
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
im = Image.open(requests.get(url, stream=True).raw)
@@ -149,10 +112,10 @@ def prepare_img():
@require_vision
@require_torch
-class Florence2LanguageModelIntegrationTest(unittest.TestCase):
+class Florence2ForConditionalGenerationIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
- model = Florence2LanguageModel.from_pretrained(MODEL_ID).to(torch_device)
+ model = Florence2ForConditionalGeneration.from_pretrained(MODEL_ID).to(torch_device)
processor = Florence2Processor.from_pretrained(MODEL_ID)
img = prepare_img()
@@ -161,8 +124,7 @@ def test_inference(self):
with torch.no_grad():
outputs = model(**inputs)
- # TODO: write test test condition
- #self.assertEqual(outputs.last_hidden_state.shape, (1, 1, 768))
+ # TODO: write test condition
@require_vision
@@ -198,37 +160,3 @@ def test_inference(self):
# TODO: write test condition
-
-@require_vision
-@require_torch
-class Florence2ForConditionalGenerationIntegrationTest(unittest.TestCase):
- @slow
- def test_inference(self):
- model = Florence2ForConditionalGeneration.from_pretrained(MODEL_ID).to(torch_device)
- processor = Florence2Processor.from_pretrained(MODEL_ID)
-
- img = prepare_img()
- inputs = processor(img, return_tensors="pt").to(torch_device)
-
- with torch.no_grad():
- outputs = model(**inputs)
-
- # TODO: write test condition
-
-
-@require_vision
-@require_torch
-class Florence2LanguageForConditionalGenerationIntegrationTest(unittest.TestCase):
- @slow
- def test_inference(self):
- model = Florence2LanguageForConditionalGeneration.from_pretrained(MODEL_ID).to(torch_device)
- processor = Florence2Processor.from_pretrained(MODEL_ID)
-
- img = prepare_img()
- inputs = processor(img, return_tensors="pt").to(torch_device)
-
- with torch.no_grad():
- outputs = model(**inputs)
-
- # TODO: write test condition
-
From f11c993ed5e80bc2c4c427a9049757b8933de2fd Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Fri, 21 Jun 2024 14:18:53 +0200
Subject: [PATCH 24/35] fix processing PILImageResampling
---
src/transformers/models/florence/processing_florence2.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence/processing_florence2.py
index 3c4568609960..abedf0a6d06d 100644
--- a/src/transformers/models/florence/processing_florence2.py
+++ b/src/transformers/models/florence/processing_florence2.py
@@ -43,7 +43,8 @@
if is_vision_available():
- from ...image_utils import PILImageResampling
+ import PIL.Image
+ from PIL.Image import Image as PILImage
logger = logging.getLogger(__name__)
@@ -200,7 +201,7 @@ def __call__(
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = "channels_first",
input_data_format: Optional[Union[str, ChannelDimension]] = None,
- resample: PILImageResampling = None, # noqa: F821
+ resample: Optional[Union[PILImage, PIL.Image.Resampling]] = None, # noqa: F821
do_convert_rgb: bool = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
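Note: with Pillow >= 9.1 the resampling filters are exposed on the PIL.Image.Resampling enum that the new annotation refers to; a minimal usage sketch (not part of the patch):

    import PIL.Image

    image = PIL.Image.new("RGB", (640, 480))
    # Pillow >= 9.1: filters live on the Resampling enum (older releases used
    # module-level constants such as PIL.Image.BICUBIC).
    resized = image.resize((320, 240), resample=PIL.Image.Resampling.BICUBIC)
    print(resized.size)  # (320, 240)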
From 311da9c247bb45bd0081d0c865b50f9218153b18 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Fri, 21 Jun 2024 14:44:16 +0200
Subject: [PATCH 25/35] fix make repo-consistency error
---
utils/check_repo.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/utils/check_repo.py b/utils/check_repo.py
index dcb1374d8e0b..4232b5b0ff09 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -81,6 +81,9 @@
"SeamlessM4Tv2TextToUnitModel",
"SeamlessM4Tv2CodeHifiGan",
"SeamlessM4Tv2TextToUnitForConditionalGeneration",
+ "Florence2LanguageForConditionalGeneration",
+ "Florence2LanguageModel",
+ "Florence2LanguagePreTrainedModel",
]
# Update this list for models that are not tested with a comment explaining the reason it should not be.
From b4298146e6e492b9a7f67a307f8806e2376a3656 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Sat, 22 Jun 2024 14:38:04 +0200
Subject: [PATCH 26/35] update tests
---
tests/models/florence/__init__.py | 0
.../florence/test_modeling_florence2.py | 42 ++++++++++++-------
.../florence/test_processing_florence2.py | 25 +++++------
3 files changed, 38 insertions(+), 29 deletions(-)
create mode 100644 tests/models/florence/__init__.py
diff --git a/tests/models/florence/__init__.py b/tests/models/florence/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/models/florence/test_modeling_florence2.py b/tests/models/florence/test_modeling_florence2.py
index 1d2929ca6f95..b0bd628aa972 100644
--- a/tests/models/florence/test_modeling_florence2.py
+++ b/tests/models/florence/test_modeling_florence2.py
@@ -12,22 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import json
-import os
-import random
-import requests
-import shutil
-import tempfile
import unittest
-
-import numpy as np
-import pytest
+import requests
from transformers import (
- Florence2VisionConfig,
Florence2Config,
+ Florence2VisionConfig,
)
-
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
@@ -39,7 +30,6 @@
if is_torch_available():
import torch
- from torch import nn
from transformers import (
Florence2ForConditionalGeneration,
@@ -58,11 +48,17 @@
class Florence2ForConditionalGenerationTester:
- def __init__(self):
+ def __init__(self, parent):
+ pass
+
+ def prepare_config_and_inputs_for_common(self):
pass
+
@require_torch
class Florence2ForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
+ all_model_classes = (Florence2ForConditionalGeneration,) if is_torch_available() else ()
+
def setUp(self):
self.model_tester = Florence2ForConditionalGenerationTester(self)
self.config_tester = ConfigTester(self, config_class=Florence2Config)
@@ -74,11 +70,17 @@ def test_model_from_pretrained(self):
class Florence2VisionModelTester:
- def __init__(self):
+ def __init__(self, parent):
pass
+ def prepare_config_and_inputs_for_common(self):
+ pass
+
+
@require_torch
class Florence2VisionModelTest(ModelTesterMixin, unittest.TestCase):
+ all_model_classes = (Florence2VisionModel,) if is_torch_available() else ()
+
def setUp(self):
self.model_tester = Florence2VisionModelTester(self)
self.config_tester = ConfigTester(self, config_class=Florence2VisionConfig)
@@ -90,11 +92,17 @@ def test_model_from_pretrained(self):
class Florence2VisionModelWithProjectionTester:
- def __init__(self):
+ def __init__(self, parent):
pass
+ def prepare_config_and_inputs_for_common(self):
+ pass
+
+
@require_torch
class Florence2VisionModelWithProjectionTest(ModelTesterMixin, unittest.TestCase):
+ all_model_classes = (Florence2VisionModelWithProjection,) if is_torch_available() else ()
+
def setUp(self):
self.model_tester = Florence2VisionModelWithProjectionTester(self)
self.config_tester = ConfigTester(self, config_class=Florence2VisionConfig)
@@ -104,6 +112,7 @@ def test_model_from_pretrained(self):
model = Florence2VisionModelWithProjection.from_pretrained(MODEL_ID)
self.assertIsNotNone(model)
+
def prepare_img():
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
im = Image.open(requests.get(url, stream=True).raw)
@@ -125,6 +134,7 @@ def test_inference(self):
outputs = model(**inputs)
# TODO: write test condition
+ self.assertIsNotNone(outputs)
@require_vision
@@ -142,6 +152,7 @@ def test_inference(self):
outputs = model(**inputs)
# TODO: write test condition
+ self.assertIsNotNone(outputs)
@require_vision
@@ -159,4 +170,5 @@ def test_inference(self):
outputs = model(**inputs)
# TODO: write test condition
+ self.assertIsNotNone(outputs)
diff --git a/tests/models/florence/test_processing_florence2.py b/tests/models/florence/test_processing_florence2.py
index a9dba89aed5a..ba5de9f65e38 100644
--- a/tests/models/florence/test_processing_florence2.py
+++ b/tests/models/florence/test_processing_florence2.py
@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import json
import os
-import random
import shutil
import tempfile
import unittest
@@ -27,42 +25,41 @@
from transformers.testing_utils import require_vision
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
+
if is_vision_available():
from PIL import Image
from transformers import CLIPImageProcessor, Florence2Processor
+
@require_vision
class Florence2ProcessorTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
- vocab_tokens = [] # TODO: add vocab tokens
+ vocab_tokens = [] # TODO: add vocab tokens
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write("".join([x + "\n" for x in vocab_tokens]))
-
+
self.processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
def get_tokenizer(self, **kwargs):
return BartTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
+
def get_rust_tokenizer(self, **kwargs):
return BartTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-
+
def get_image_processor(self, **kwargs):
return CLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
-
+
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def prepare_image_inputs(self):
- """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
- or a list of PyTorch tensors if one specifies torchify=True.
- """
+ """This function prepares a list of PIL images"""
image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
-
image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
return image_inputs
@@ -133,7 +130,6 @@ def test_tokenizer(self):
for key in encoded_tok.keys():
self.assertListEqual(encoded_processor[key], encoded_tok[key])
-
def test_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
@@ -145,9 +141,10 @@ def test_processor(self):
encoded_processor = processor(text=input_str, images=image_inputs)
- self.assertListEqual(list(encoded_processor.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
+ self.assertListEqual(
+ list(encoded_processor.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"]
+ )
# test if it raises when no input is passed
with pytest.raises(ValueError):
processor()
-
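Note: the prepare_image_inputs helper builds channel-first random arrays and moves the channel axis last before handing them to PIL, which expects height x width x channels; a small standalone illustration of that conversion:

    import numpy as np
    from PIL import Image

    arr = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)  # (C, H, W)
    img = Image.fromarray(np.moveaxis(arr, 0, -1))                   # -> (H, W, C)
    print(img.size)  # (400, 30): PIL reports (width, height)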
From 3d67bebc5c7bbac47445f619cb0d1edd453abb93 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Sat, 22 Jun 2024 14:39:18 +0200
Subject: [PATCH 27/35] rename folders
---
src/transformers/__init__.py | 6 +++---
src/transformers/models/__init__.py | 2 +-
src/transformers/models/{florence => florence2}/__init__.py | 0
.../{florence => florence2}/configuration_florence2.py | 0
.../models/{florence => florence2}/modeling_florence2.py | 0
.../models/{florence => florence2}/processing_florence2.py | 0
tests/models/{florence => florence2}/__init__.py | 0
.../{florence => florence2}/test_modeling_florence2.py | 0
.../{florence => florence2}/test_processing_florence2.py | 0
9 files changed, 4 insertions(+), 4 deletions(-)
rename src/transformers/models/{florence => florence2}/__init__.py (100%)
rename src/transformers/models/{florence => florence2}/configuration_florence2.py (100%)
rename src/transformers/models/{florence => florence2}/modeling_florence2.py (100%)
rename src/transformers/models/{florence => florence2}/processing_florence2.py (100%)
rename tests/models/{florence => florence2}/__init__.py (100%)
rename tests/models/{florence => florence2}/test_modeling_florence2.py (100%)
rename tests/models/{florence => florence2}/test_processing_florence2.py (100%)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 8b31ac0260a3..fea2b07e7d74 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -5026,7 +5026,7 @@
FlavaMultimodalConfig,
FlavaTextConfig,
)
- from .models.florence import (
+ from .models.florence2 import (
Florence2Config,
Florence2VisionConfig,
)
@@ -5768,7 +5768,7 @@
FlavaImageProcessor,
FlavaProcessor,
)
- from .models.florence import Florence2Processor
+ from .models.florence2 import Florence2Processor
from .models.fuyu import FuyuImageProcessor, FuyuProcessor
from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
from .models.grounding_dino import GroundingDinoImageProcessor
@@ -6611,7 +6611,7 @@
FlavaPreTrainedModel,
FlavaTextModel,
)
- from .models.florence import (
+ from .models.florence2 import (
Florence2ForConditionalGeneration,
Florence2PreTrainedModel,
Florence2VisionModel,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 83f6427d8710..ee6abde95e9c 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -86,7 +86,7 @@
fastspeech2_conformer,
flaubert,
flava,
- florence,
+ florence2,
fnet,
focalnet,
fsmt,
diff --git a/src/transformers/models/florence/__init__.py b/src/transformers/models/florence2/__init__.py
similarity index 100%
rename from src/transformers/models/florence/__init__.py
rename to src/transformers/models/florence2/__init__.py
diff --git a/src/transformers/models/florence/configuration_florence2.py b/src/transformers/models/florence2/configuration_florence2.py
similarity index 100%
rename from src/transformers/models/florence/configuration_florence2.py
rename to src/transformers/models/florence2/configuration_florence2.py
diff --git a/src/transformers/models/florence/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
similarity index 100%
rename from src/transformers/models/florence/modeling_florence2.py
rename to src/transformers/models/florence2/modeling_florence2.py
diff --git a/src/transformers/models/florence/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py
similarity index 100%
rename from src/transformers/models/florence/processing_florence2.py
rename to src/transformers/models/florence2/processing_florence2.py
diff --git a/tests/models/florence/__init__.py b/tests/models/florence2/__init__.py
similarity index 100%
rename from tests/models/florence/__init__.py
rename to tests/models/florence2/__init__.py
diff --git a/tests/models/florence/test_modeling_florence2.py b/tests/models/florence2/test_modeling_florence2.py
similarity index 100%
rename from tests/models/florence/test_modeling_florence2.py
rename to tests/models/florence2/test_modeling_florence2.py
diff --git a/tests/models/florence/test_processing_florence2.py b/tests/models/florence2/test_processing_florence2.py
similarity index 100%
rename from tests/models/florence/test_processing_florence2.py
rename to tests/models/florence2/test_processing_florence2.py
From 8b88ca743da68aefa9cb28a9202d3334a40c5c8f Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Sat, 22 Jun 2024 14:49:56 +0200
Subject: [PATCH 28/35] fix folder renaming
---
src/transformers/__init__.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index fea2b07e7d74..0f436e11f2dd 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -423,7 +423,7 @@
"FlavaMultimodalConfig",
"FlavaTextConfig",
],
- "models.florence": [
+ "models.florence2": [
"Florence2Config",
"Florence2VisionConfig",
],
@@ -1134,7 +1134,7 @@
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
- _import_structure["models.florence"].extend(["Florence2Processor"])
+ _import_structure["models.florence2"].extend(["Florence2Processor"])
_import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
_import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
_import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
@@ -2124,7 +2124,7 @@
"FlavaTextModel",
]
)
- _import_structure["models.florence"].extend(
+ _import_structure["models.florence2"].extend(
[
"Florence2ForConditionalGeneration",
"Florence2PreTrainedModel",
From 299670b512dedbd0c9823ee8789c2ea511e12996 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Mon, 24 Jun 2024 01:19:55 +0200
Subject: [PATCH 29/35] fix florence2 modeling, copy DropPath from beit
---
.../models/florence2/modeling_florence2.py | 49 ++++++++++++-------
1 file changed, 31 insertions(+), 18 deletions(-)
diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index a4a7a5d0aafb..e70cee3ea3cb 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -328,28 +328,41 @@ def forward(self, x, size):
x = self.proj(x)
return x, size
+# TODO verify behavior is same as https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py#L150
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
+
-# https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
-class DropPath(nn.Module):
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Florence2
+class Florence2DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
- def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
- super(DropPath, self).__init__()
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
self.drop_prob = drop_prob
- self.scale_by_keep = scale_by_keep
- def forward(self, x):
- if self.drop_prob == 0.0 or not self.training:
- return x
- keep_prob = 1 - self.drop_prob
- shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
- random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
- if keep_prob > 0.0 and self.scale_by_keep:
- random_tensor.div_(keep_prob)
- return x * random_tensor
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
- def extra_repr(self):
- return f"drop_prob={round(self.drop_prob,3):0.3f}"
+ def extra_repr(self) -> str:
+ return "p={}".format(self.drop_prob)
# TODO: can this be replaced with torch.nn.init.trunc_normal_?
@@ -426,7 +439,7 @@ def __init__(
):
super().__init__()
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ drop_path = Florence2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
self.channel_attn = PreNorm(
From 689ad98c1ea55cbe48e9d0c2caa551628c10fb30 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Tue, 25 Jun 2024 22:42:08 +0200
Subject: [PATCH 30/35] update Florence2DropPath
---
.../models/florence2/modeling_florence2.py | 37 +++++++++----------
1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index e70cee3ea3cb..f19b21fe248a 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -328,33 +328,32 @@ def forward(self, x, size):
x = self.proj(x)
return x, size
-# TODO verify behavior is same as https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py#L150
-# Copied from transformers.models.beit.modeling_beit.drop_path
-def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
- """
- Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+# https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py#L150
+def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+ 'survival rate' as the argument.
- Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
- however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
- See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
- layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
- argument.
"""
- if drop_prob == 0.0 or not training:
- return input
+ if drop_prob == 0. or not training:
+ return x
keep_prob = 1 - drop_prob
- shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
- random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
- random_tensor.floor_() # binarize
- output = input.div(keep_prob) * random_tensor
- return output
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0 and scale_by_keep:
+ random_tensor.div_(keep_prob)
+ return x * random_tensor
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Florence2
class Florence2DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
- def __init__(self, drop_prob: Optional[float] = None) -> None:
+ def __init__(self, drop_prob: float = 0.) -> None:
super().__init__()
self.drop_prob = drop_prob
@@ -550,7 +549,7 @@ def __init__(
):
super().__init__()
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ drop_path = Florence2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
self.window_attn = PreNorm(
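Note: for intuition, drop_path as defined above is the identity outside training and a per-sample Bernoulli mask (rescaled by 1 / keep_prob when scale_by_keep is set) during training; a minimal numeric sketch assuming the function from this patch:

    import torch

    x = torch.ones(4, 3)  # a batch of 4 samples
    assert torch.equal(drop_path(x, drop_prob=0.2, training=False), x)  # no-op at eval time

    torch.manual_seed(0)
    y = drop_path(x, drop_prob=0.2, training=True)
    # Each row is either all zeros (its residual path was dropped) or all 1 / 0.8 = 1.25
    # (kept and rescaled), so the expectation over samples matches the input.
    print(y)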
From 6b7feb6ac27d7f2bdfb625d15334995a6669e5cb Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Tue, 25 Jun 2024 22:48:58 +0200
Subject: [PATCH 31/35] update Florence2Sequential
---
src/transformers/models/florence2/modeling_florence2.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index f19b21fe248a..d724a592430b 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -190,7 +190,7 @@ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
return pos_embeds
-class MySequential(nn.Sequential):
+class Florence2Sequential(nn.Sequential):
def forward(self, *inputs):
for module in self._modules.values():
if type(inputs) == tuple:
@@ -364,7 +364,6 @@ def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
-# TODO: can this be replaced with torch.nn.init.trunc_normal_?
# https://github.com/huggingface/pytorch-image-models/blob/b28945ff056d454b174f0fb8682e362b87150141/timm/layers/weight_init.py
def trunc_normal_(tensor: torch.Tensor, mean=0.0, std=1.0, a=-2.0, b=2.0) -> torch.Tensor:
r"""Fills the input Tensor with values drawn from a truncated
@@ -643,9 +642,9 @@ def __init__(
)
convs.append(conv_embed)
- block = MySequential(
+ block = Florence2Sequential(
*[
- MySequential(
+ Florence2Sequential(
OrderedDict(
[
(
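Note: the renamed Sequential exists because the DaViT stages pass a (tensor, size) tuple from block to block, which a plain nn.Sequential cannot thread through; a minimal sketch of the pattern with toy modules (the toy classes are illustrative, not from the patch):

    import torch
    from torch import nn

    class TupleSequential(nn.Sequential):
        # same idea as Florence2Sequential: unpack tuple outputs into the next module
        def forward(self, *inputs):
            for module in self._modules.values():
                if type(inputs) == tuple:
                    inputs = module(*inputs)
                else:
                    inputs = module(inputs)
            return inputs

    class AddOne(nn.Module):
        def forward(self, x, size):
            return x + 1, size  # returns a (tensor, size) tuple, like the DaViT blocks

    seq = TupleSequential(AddOne(), AddOne())
    out, size = seq(torch.zeros(2, 4), (8, 8))
    print(out.mean().item(), size)  # 2.0 (8, 8)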
From 3addc028a2e97bc6d5616119ba82cddd34efdbd2 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Tue, 25 Jun 2024 23:07:28 +0200
Subject: [PATCH 32/35] update modeling_florence2, add model-prefix
---
.../models/florence2/modeling_florence2.py | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index d724a592430b..09d2a21111f8 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -66,7 +66,7 @@
_CONFIG_FOR_DOC = "Florence2Config"
-class LearnedAbsolutePositionEmbedding2D(nn.Module):
+class Florence2LearnedAbsolutePositionEmbedding2D(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
@@ -100,7 +100,7 @@ def forward(self, pixel_values):
return pos
-class PositionalEmbeddingCosine1D(nn.Module):
+class Florence2PositionalEmbeddingCosine1D(nn.Module):
"""
This class implements a very simple positional encoding. It follows closely
the encoder from the link below:
@@ -113,7 +113,7 @@ class PositionalEmbeddingCosine1D(nn.Module):
"""
def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
- super(PositionalEmbeddingCosine1D, self).__init__()
+ super(Florence2PositionalEmbeddingCosine1D, self).__init__()
self.embed_dim = embed_dim
self.max_seq_len = max_seq_len
# Generate the sinusoidal arrays.
@@ -152,7 +152,7 @@ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
return pos_embeds
-class LearnedAbsolutePositionEmbedding1D(nn.Module):
+class Florence2LearnedAbsolutePositionEmbedding1D(nn.Module):
"""
Learnable absolute positional embeddings for 1D sequences.
@@ -162,7 +162,7 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
"""
def __init__(self, embedding_dim: int = 512, num_pos: int = 1024) -> None:
- super(LearnedAbsolutePositionEmbedding1D, self).__init__()
+ super(Florence2LearnedAbsolutePositionEmbedding1D, self).__init__()
self.embeddings = nn.Embedding(num_pos, embedding_dim)
self.num_pos = num_pos
@@ -2528,7 +2528,7 @@ def _build_image_projection_layers(self, config):
self.image_proj_norm = nn.LayerNorm(dim_projection)
image_pos_embed_config = config.image_pos_embed
if image_pos_embed_config["type"] == "learned_abs_2d":
- self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
+ self.image_pos_embed = Florence2LearnedAbsolutePositionEmbedding2D(
embedding_dim=image_dim_out, num_pos=image_pos_embed_config["max_pos_embeddings"]
)
else:
@@ -2539,7 +2539,7 @@ def _build_image_projection_layers(self, config):
# temporal embedding
visual_temporal_embedding_config = config.visual_temporal_embedding
if visual_temporal_embedding_config["type"] == "COSINE":
- self.visual_temporal_embed = PositionalEmbeddingCosine1D(
+ self.visual_temporal_embed = Florence2PositionalEmbeddingCosine1D(
embed_dim=image_dim_out, max_seq_len=visual_temporal_embedding_config["max_temporal_embeddings"]
)
else:
@@ -2626,7 +2626,7 @@ def _build_image_projection_layers(self, config):
self.image_proj_norm = nn.LayerNorm(dim_projection)
image_pos_embed_config = config.vision_config.image_pos_embed
if image_pos_embed_config["type"] == "learned_abs_2d":
- self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
+ self.image_pos_embed = Florence2LearnedAbsolutePositionEmbedding2D(
embedding_dim=image_dim_out, num_pos=image_pos_embed_config["max_pos_embeddings"]
)
else:
@@ -2637,7 +2637,7 @@ def _build_image_projection_layers(self, config):
# temporal embedding
visual_temporal_embedding_config = config.vision_config.visual_temporal_embedding
if visual_temporal_embedding_config["type"] == "COSINE":
- self.visual_temporal_embed = PositionalEmbeddingCosine1D(
+ self.visual_temporal_embed = Florence2PositionalEmbeddingCosine1D(
embed_dim=image_dim_out, max_seq_len=visual_temporal_embedding_config["max_temporal_embeddings"]
)
else:
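Note: the cosine positional embedding renamed above follows the standard sinusoidal scheme from "Attention Is All You Need"; the sketch below illustrates that general formula (it is not a copy of the class in this patch):

    import torch

    def sinusoidal_positions(max_seq_len: int, embed_dim: int) -> torch.Tensor:
        # pe[p, 2i]   = sin(p / 10000^(2i / d))
        # pe[p, 2i+1] = cos(p / 10000^(2i / d))
        position = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2, dtype=torch.float32)
            * (-torch.log(torch.tensor(10000.0)) / embed_dim)
        )
        pe = torch.zeros(max_seq_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe

    pe = sinusoidal_positions(1024, 512)
    print(pe.shape)  # torch.Size([1024, 512])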
From 0f624acb955a5641776d0e23dd12f93f5d21bfce Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Wed, 26 Jun 2024 00:05:17 +0200
Subject: [PATCH 33/35] update modeling_florence2, add DaViT prefixes
---
.../models/florence2/modeling_florence2.py | 73 +++++++++----------
1 file changed, 34 insertions(+), 39 deletions(-)
diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index 09d2a21111f8..700d1d5ac2de 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -59,6 +59,7 @@
if is_flash_attn_2_available():
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
logger = logging.get_logger(__name__)
@@ -190,7 +191,7 @@ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
return pos_embeds
-class Florence2Sequential(nn.Sequential):
+class Florence2DaViTSequential(nn.Sequential):
def forward(self, *inputs):
for module in self._modules.values():
if type(inputs) == tuple:
@@ -200,7 +201,7 @@ def forward(self, *inputs):
return inputs
-class PreNorm(nn.Module):
+class Florence2DaViTPreNorm(nn.Module):
def __init__(self, norm, fn, drop_path=None):
super().__init__()
self.norm = norm
@@ -222,7 +223,7 @@ def forward(self, x, *args, **kwargs):
return x, size
-class Mlp(nn.Module):
+class Florence2DaViTMlp(nn.Module):
def __init__(
self,
in_features,
@@ -247,7 +248,7 @@ def forward(self, x, size):
return self.net(x), size
-class DepthWiseConv2d(nn.Module):
+class Florence2DaViTDepthWiseConv2d(nn.Module):
def __init__(
self,
dim_in,
@@ -272,7 +273,7 @@ def forward(self, x, size):
return x, size
-class ConvEmbed(nn.Module):
+class Florence2DaViTConvEmbed(nn.Module):
"""Image to Patch Embedding"""
def __init__(self, patch_size=7, in_chans=3, embed_dim=64, stride=4, padding=2, norm_layer=None, pre_norm=True):
@@ -306,7 +307,7 @@ def forward(self, x, size):
return x, (H, W)
-class ChannelAttention(nn.Module):
+class Florence2DaViTChannelAttention(nn.Module):
def __init__(self, dim, groups=8, qkv_bias=True):
super().__init__()
@@ -349,8 +350,7 @@ def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: b
return x * random_tensor
-# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Florence2
-class Florence2DropPath(nn.Module):
+class Florence2DaViTDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: float = 0.) -> None:
@@ -422,7 +422,7 @@ def norm_cdf(x):
return tensor
-class ChannelBlock(nn.Module):
+class Florence2DaViTChannelBlock(nn.Module):
def __init__(
self,
dim,
@@ -437,15 +437,15 @@ def __init__(
):
super().__init__()
- drop_path = Florence2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ drop_path = Florence2DaViTDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
- self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
- self.channel_attn = PreNorm(
- norm_layer(dim), ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias), drop_path
+ self.conv1 = Florence2DaViTPreNorm(None, Florence2DaViTDepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
+ self.channel_attn = Florence2DaViTPreNorm(
+ norm_layer(dim), Florence2DaViTChannelAttention(dim, groups=groups, qkv_bias=qkv_bias), drop_path
)
- self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
- self.ffn = PreNorm(
- norm_layer(dim), Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer), drop_path
+ self.conv2 = Florence2DaViTPreNorm(None, Florence2DaViTDepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
+ self.ffn = Florence2DaViTPreNorm(
+ norm_layer(dim), Florence2DaViTMlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer), drop_path
)
def forward(self, x, size):
@@ -476,7 +476,7 @@ def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
return x
-class WindowAttention(nn.Module):
+class Florence2DaViTWindowAttention(nn.Module):
def __init__(self, dim, num_heads, window_size, qkv_bias=True):
super().__init__()
self.dim = dim
@@ -532,7 +532,7 @@ def forward(self, x, size):
return x, size
-class SpatialBlock(nn.Module):
+class Florence2DaViTSpatialBlock(nn.Module):
def __init__(
self,
dim,
@@ -548,15 +548,15 @@ def __init__(
):
super().__init__()
- drop_path = Florence2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ drop_path = Florence2DaViTDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
- self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
- self.window_attn = PreNorm(
- norm_layer(dim), WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias), drop_path
+ self.conv1 = Florence2DaViTPreNorm(None, Florence2DaViTDepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
+ self.window_attn = Florence2DaViTPreNorm(
+ norm_layer(dim), Florence2DaViTWindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias), drop_path
)
- self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
- self.ffn = PreNorm(
- norm_layer(dim), Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer), drop_path
+ self.conv2 = Florence2DaViTPreNorm(None, Florence2DaViTDepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
+ self.ffn = Florence2DaViTPreNorm(
+ norm_layer(dim), Florence2DaViTMlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer), drop_path
)
def forward(self, x, size):
@@ -570,7 +570,7 @@ def forward(self, x, size):
return x, size
-class DaViT(nn.Module):
+class Florence2DaViT(nn.Module):
"""DaViT: Dual-Attention Transformer
Args:
@@ -631,7 +631,7 @@ def __init__(
convs = []
blocks = []
for i in range(num_stages):
- conv_embed = ConvEmbed(
+ conv_embed = Florence2DaViTConvEmbed(
patch_size=patch_size[i],
stride=patch_stride[i],
padding=patch_padding[i],
@@ -642,14 +642,14 @@ def __init__(
)
convs.append(conv_embed)
- block = Florence2Sequential(
+ block = Florence2DaViTSequential(
*[
- Florence2Sequential(
+ Florence2DaViTSequential(
OrderedDict(
[
(
"spatial_block",
- SpatialBlock(
+ Florence2DaViTSpatialBlock(
embed_dims[i],
num_heads[i],
window_size,
@@ -662,7 +662,7 @@ def __init__(
),
(
"channel_block",
- ChannelBlock(
+ Florence2DaViTChannelBlock(
embed_dims[i],
num_groups[i],
drop_path_rate=dpr[depth_offset + j * 2 + 1],
@@ -758,11 +758,6 @@ def from_config(cls, config):
)
-if is_flash_attn_2_available():
- from flash_attn import flash_attn_func, flash_attn_varlen_func
- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
-
-
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -2495,7 +2490,7 @@ class Florence2VisionModel(Florence2PreTrainedModel):
def __init__(self, config: Florence2VisionConfig):
super().__init__(config)
assert config.model_type == "davit", "only DaViT is supported for now"
- self.vision_tower = DaViT.from_config(config=config)
+ self.vision_tower = Florence2DaViT.from_config(config=config)
self.post_init()
@@ -2515,7 +2510,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
def __init__(self, config: Florence2VisionConfig):
super().__init__(config)
assert config.model_type == "davit", "only DaViT is supported for now"
- self.vision_tower = DaViT.from_config(config=config)
+ self.vision_tower = Florence2DaViT.from_config(config=config)
self._build_image_projection_layers(config)
@@ -2601,7 +2596,7 @@ def __init__(self, config: Florence2Config):
super().__init__(config)
assert config.vision_config.model_type == "davit", "only DaViT is supported for now"
del config.vision_config.model_type
- self.vision_tower = DaViT.from_config(config=config.vision_config)
+ self.vision_tower = Florence2DaViT.from_config(config=config.vision_config)
# remove unused layers
del self.vision_tower.head
del self.vision_tower.norms
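Note: the spatial blocks renamed above attend within fixed windows; the sketch below illustrates the window partition / reverse shape bookkeeping that window_reverse undoes (a generic Swin-style illustration, not a copy of the model code):

    import torch

    def window_partition(x: torch.Tensor, window_size: int) -> torch.Tensor:
        # (B, H, W, C) -> (B * num_windows, window_size, window_size, C)
        B, H, W, C = x.shape
        x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
        return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)

    def window_reverse(windows: torch.Tensor, batch_size: int, window_size: int, H: int, W: int) -> torch.Tensor:
        # inverse of window_partition
        x = windows.view(batch_size, H // window_size, W // window_size, window_size, window_size, -1)
        return x.permute(0, 1, 3, 2, 4, 5).reshape(batch_size, H, W, -1)

    x = torch.randn(2, 24, 24, 8)
    assert torch.equal(window_reverse(window_partition(x, 12), 2, 12, 24, 24), x)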
From 880811f2f50de704508f8c9451c43c795d40f0a3 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Wed, 26 Jun 2024 01:27:00 +0200
Subject: [PATCH 34/35] fix base_model_prefix
---
src/transformers/models/florence2/modeling_florence2.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index 700d1d5ac2de..58035118ba71 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -1504,7 +1504,7 @@ def forward(
class Florence2LanguagePreTrainedModel(PreTrainedModel):
config_class = Florence2LanguageConfig
- base_model_prefix = "model"
+ base_model_prefix = "florence2"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
_no_split_modules = [r"Florence2EncoderLayer", r"Florence2DecoderLayer"]
@@ -2132,7 +2132,6 @@ def forward(
class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel):
- base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
@@ -2395,7 +2394,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
)
class Florence2PreTrainedModel(PreTrainedModel):
config_class = Florence2Config
- base_model_prefix = "model"
+ base_model_prefix = "florence2"
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
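Note: base_model_prefix names the attribute under which head models expose the bare backbone, and from_pretrained uses it to add or strip that prefix when matching checkpoint keys; a minimal sketch of the convention (all names illustrative, not from the patch):

    from torch import nn
    from transformers import PretrainedConfig, PreTrainedModel

    class ToyConfig(PretrainedConfig):
        model_type = "toy"

    class ToyPreTrainedModel(PreTrainedModel):
        config_class = ToyConfig
        base_model_prefix = "toy"

    class ToyModel(ToyPreTrainedModel):
        def __init__(self, config):
            super().__init__(config)
            self.embed = nn.Embedding(10, 4)

    class ToyForSomething(ToyPreTrainedModel):
        def __init__(self, config):
            super().__init__(config)
            self.toy = ToyModel(config)  # attribute name matches base_model_prefix
            self.head = nn.Linear(4, 2)

    model = ToyForSomething(ToyConfig())
    print(model.base_model is model.toy)  # True: resolved via base_model_prefix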
From 4069145e4fb79764618ec89d5043bf282a168e9b Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Wed, 26 Jun 2024 16:26:36 +0200
Subject: [PATCH 35/35] fix missing variables
---
.../models/florence2/modeling_florence2.py | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index 58035118ba71..ce91d2050384 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -2312,6 +2312,10 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
decoding.
Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
@@ -2360,7 +2364,8 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
image_hidden_states of the model produced by the vision encoder
"""
-
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@@ -2369,6 +2374,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+ image_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
FLORENCE2_START_DOCSTRING = r"""
@@ -2594,7 +2600,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
def __init__(self, config: Florence2Config):
super().__init__(config)
assert config.vision_config.model_type == "davit", "only DaViT is supported for now"
- del config.vision_config.model_type
self.vision_tower = Florence2DaViT.from_config(config=config.vision_config)
# remove unused layers
del self.vision_tower.head
@@ -2793,7 +2798,8 @@ def forward(
image_features, inputs_embeds
)
- attention_mask = attention_mask.to(inputs_embeds.dtype)
+ if inputs_embeds is not None:
+ attention_mask = attention_mask.to(inputs_embeds.dtype)
outputs = self.language_model(
attention_mask=attention_mask,
labels=labels,