diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 8a2272081801..0a4fd16e0744 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -19,8 +19,6 @@ # limitations under the License. """ Idefics model configuration""" import copy -import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -34,180 +32,32 @@ } -# copied from transformers.models.clip.configuration_clip.CLIPTextConfig -class CLIPTextConfig(PretrainedConfig): +class IdeficsVisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP - text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the text encoder of the CLIP - [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`CLIPModel`]. - hidden_size (`int`, *optional*, defaults to 512): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 8): - Number of attention heads for each attention layer in the Transformer encoder. - max_position_embeddings (`int`, *optional*, defaults to 77): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). 
- - Example: - - ```python - >>> from transformers import CLIPTextConfig, CLIPTextModel - - >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration - >>> configuration = CLIPTextConfig() - - >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration - >>> model = CLIPTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "clip_text_model" - - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - # This differs from `CLIPTokenizer`'s default and from openai/clip - # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - + This is the configuration class to store the configuration of a [`~IdeficsModel`]. It is used to instantiate an + Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Idefics-9B. -# copied from transformers.models.clip.configuration_clip.CLIPVisionConfig -class CLIPVisionConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a - CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP - [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b) Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the documentation from [`PretrainedConfig`] for more information. - - Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 32): - The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). - - Example: - - ```python - >>> from transformers import CLIPVisionConfig, CLIPVisionModel - - >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration - >>> configuration = CLIPVisionConfig() - - >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration - >>> model = CLIPVisionModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "clip_vision_model" + """ + model_type = "idefics" + attribute_map = { + "hidden_size": "embed_dim", + } def __init__( self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, + vision_model_name="google/vit-base-patch16-224", + embed_dim=768, image_size=224, - patch_size=32, + intermediate_size=5120, + patch_size=14, + num_hidden_layers=32, + num_attention_heads=16, + num_channels=3, hidden_act="quick_gelu", layer_norm_eps=1e-5, attention_dropout=0.0, @@ -215,204 +65,55 @@ def __init__( initializer_factor=1.0, **kwargs, ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size + self.vision_model_name = vision_model_name + self.embed_dim = embed_dim + self.image_size = image_size self.intermediate_size = intermediate_size - self.projection_dim = projection_dim + self.patch_size = patch_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size + self.layer_norm_eps = layer_norm_eps + self.attention_dropout = attention_dropout self.initializer_range = initializer_range self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], 
**kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) + super().__init__(**kwargs) -# copied from transformers.models.clip.configuration_clip.CLIPConfig -class CLIPConfig(PretrainedConfig): +class IdeficsPerceiverConfig(PretrainedConfig): r""" - [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate - a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating - a configuration with the defaults will yield a similar configuration to that of the CLIP - [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + This is the configuration class to store the configuration of a [`~IdeficsModel`]. It is used to instantiate an + Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Idefics-9B. + + e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b) Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - - Args: - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`CLIPTextConfig`]. - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. - projection_dim (`int`, *optional*, defaults to 512): - Dimentionality of text and vision projection layers. - logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation. - kwargs (*optional*): - Dictionary of keyword arguments. 
- - Example: - - ```python - >>> from transformers import CLIPConfig, CLIPModel - - >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration - >>> configuration = CLIPConfig() - - >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration - >>> model = CLIPModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - - >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig - >>> from transformers import CLIPTextConfig, CLIPVisionConfig - - >>> # Initializing a CLIPText and CLIPVision configuration - >>> config_text = CLIPTextConfig() - >>> config_vision = CLIPVisionConfig() - - >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision) - ```""" - - model_type = "clip" - is_composition = True + """ + model_type = "idefics" def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + self, + use_resampler=False, + resampler_n_latents=64, + resampler_depth=6, + resampler_n_heads=16, + resampler_head_dim=96, + qk_layer_norms_perceiver=False, + **kwargs, ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + self.use_resampler = use_resampler + self.resampler_n_latents = resampler_n_latents + self.resampler_depth = resampler_depth + self.resampler_n_heads = resampler_n_heads + self.resampler_head_dim = resampler_head_dim + self.qk_layer_norms_perceiver = qk_layer_norms_perceiver super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in - # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most - # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} - - # This is the complete result when using `text_config_dict`. - _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict() - - # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. - for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: - # If specified in `text_config_dict` - if key in text_config_dict: - message = ( - f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " - f'The value `text_config_dict["{key}"]` will be used instead.' - ) - # If inferred from default argument values (just to be super careful) - else: - message = ( - f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The " - f'value `text_config["{key}"]` will be overriden.' - ) - logger.warning(message) - - # Update all values in `text_config` with the ones in `_text_config_dict`. - text_config.update(_text_config_dict) - - if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - - # This is the complete result when using `vision_config_dict`. 
- _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict() - # convert keys to string instead of integer - if "id2label" in _vision_config_dict: - _vision_config_dict["id2label"] = { - str(key): value for key, value in _vision_config_dict["id2label"].items() - } - - # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. - for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: - # If specified in `vision_config_dict` - if key in vision_config_dict: - message = ( - f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " - f'values. The value `vision_config_dict["{key}"]` will be used instead.' - ) - # If inferred from default argument values (just to be super careful) - else: - message = ( - f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. " - f'The value `vision_config["{key}"]` will be overriden.' - ) - logger.warning(message) - - # Update all values in `vision_config` with the ones in `_vision_config_dict`. - vision_config.update(_vision_config_dict) - - if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.") - - if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.") - - self.text_config = CLIPTextConfig(**text_config) - self.vision_config = CLIPVisionConfig(**vision_config) - - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - - @classmethod - def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs): - r""" - Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model - configuration. - - Returns: - [`CLIPConfig`]: An instance of a configuration object - """ - - return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 
- - Returns: - `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = copy.deepcopy(self.__dict__) - output["text_config"] = self.text_config.to_dict() - output["vision_config"] = self.vision_config.to_dict() - output["model_type"] = self.__class__.model_type - return output - class IdeficsConfig(PretrainedConfig): r""" @@ -464,6 +165,7 @@ class IdeficsConfig(PretrainedConfig): >>> configuration = model.config ```""" model_type = "idefics" + is_composition = True def __init__( self, @@ -492,18 +194,9 @@ def __init__( freeze_lm_head=False, freeze_vision_layers=True, freeze_vision_module_exceptions=[], - vision_model_name="google/vit-base-patch16-224", - vision_embed_dim=768, - vision_image_size=224, - vision_intermediate_size=5120, - vision_patch_size=14, - vision_num_hidden_layers=32, - vision_num_attention_heads=16, use_resampler=False, - resampler_n_latents=64, - resampler_depth=6, - resampler_n_heads=16, - resampler_head_dim=96, + vision_config=None, + perceiver_config=None, **kwargs, ): self.vocab_size = vocab_size @@ -524,32 +217,29 @@ def __init__( self.cross_layer_interval = cross_layer_interval self.qk_layer_norms = qk_layer_norms self.freeze_vision_layers = freeze_vision_layers - self.vision_model_name = vision_model_name self.freeze_text_layers = freeze_text_layers self.freeze_text_module_exceptions = freeze_text_module_exceptions self.freeze_vision_module_exceptions = freeze_vision_module_exceptions self.freeze_lm_head = freeze_lm_head - self.vision_embed_dim = vision_embed_dim - self.vision_image_size = vision_image_size - self.vision_intermediate_size = vision_intermediate_size - self.vision_patch_size = vision_patch_size + self.use_resampler = use_resampler - self.vision_config_dict = {} - self.vision_config_dict["hidden_size"] = vision_embed_dim - self.vision_config_dict["image_size"] = vision_image_size - self.vision_config_dict["num_attention_heads"] = vision_num_attention_heads - self.vision_config_dict["num_hidden_layers"] = vision_num_hidden_layers - self.vision_config_dict["intermediate_size"] = vision_intermediate_size - self.vision_config_dict["patch_size"] = vision_patch_size + if perceiver_config is None: + self.perceiver_config = IdeficsPerceiverConfig() + elif isinstance(perceiver_config, dict): + self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config) + elif isinstance(perceiver_config, IdeficsPerceiverConfig): + self.perceiver_config = perceiver_config - # Resampler params - self.use_resampler = use_resampler - self.resampler_n_latents = resampler_n_latents - self.resampler_depth = resampler_depth - self.resampler_n_heads = resampler_n_heads - self.resampler_head_dim = resampler_head_dim + if vision_config is None: + self.vision_config = IdeficsVisionConfig() + elif isinstance(vision_config, dict): + self.vision_config = IdeficsVisionConfig(**vision_config) + elif isinstance(vision_config, IdeficsVisionConfig): + self.vision_config = vision_config + + self.vision_embed_dim = self.vision_config.embed_dim super().__init__( pad_token_id=pad_token_id, @@ -573,5 +263,9 @@ def to_dict(self): `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ output = copy.deepcopy(self.__dict__) - output["vision_config_dict"] = self.vision_config_dict + + output["vision_config"] = self.vision_config.to_dict() + output["perceiver_config"] = self.perceiver_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git 
a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 6e4d614d46b5..24c131746c07 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -32,15 +32,14 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from ...modeling_utils import PretrainedConfig from ...utils import ( - ContextManagers, add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, ) -from .clip import CLIPVisionTransformer -from .configuration_idefics import CLIPVisionConfig, IdeficsConfig +from .configuration_idefics import IdeficsConfig from .perceiver import IdeficsPerceiverResampler +from .vision import IdeficsVisionTransformer logger = logging.get_logger(__name__) @@ -887,46 +886,18 @@ class IdeficsPreTrainedModel(PreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): - def init_a_linear(module, mean=0.0, std=self.config.initializer_range): - with ContextManagers(deepspeed_gathered_parameters_context_manager(module.weight, modify=True)): - module.weight.data.normal_(mean=mean, std=std) - if module.bias is not None: - with ContextManagers(deepspeed_gathered_parameters_context_manager(module.bias, modify=True)): - module.bias.data.zero_() - module._is_hf_initialized = True - - if isinstance(module, IdeficsGatedCrossAttentionLayer): - for sub_module_name, sub_module in module.named_modules(): - if isinstance(sub_module, nn.Linear): - if "down_proj" in sub_module_name: - factor = 2 * self.config.num_hidden_layers - else: - factor = 1.0 - init_a_linear(sub_module, std=(0.4 / (sub_module.in_features * factor)) ** 0.5) - sub_module._is_hf_initialized = True - elif isinstance(module, IdeficsPerceiverResampler): - with ContextManagers(deepspeed_gathered_parameters_context_manager(module.latents, modify=True)): - module.latents.data.normal_(mean=0.0, std=(1.0 / self.config.vision_embed_dim) ** 0.5) - module._is_hf_initialized = True - for sub_module_name, sub_module in module.named_modules(): - if isinstance(sub_module, nn.Linear): - if "c_proj" in sub_module_name: - factor = 2 * self.config.num_hidden_layers - else: - factor = 1.0 - init_a_linear(sub_module, std=(0.4 / (self.config.vision_embed_dim * factor)) ** 0.5) - sub_module._is_hf_initialized = True + # important: this ported version of Idefics isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the m4 code + # base should be used for training from scratch and it contains the correct code. 
+ std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() elif isinstance(module, nn.Embedding): - with ContextManagers(deepspeed_gathered_parameters_context_manager(module.weight, modify=True)): - module.weight.data.normal_(mean=0.0, std=(1.0 / self.config.hidden_size) ** 0.5) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - module._is_hf_initialized = True - - elif isinstance(module, IdeficsDecoupledLinear): - if hasattr(module, "additional_fc"): - init_a_linear(module.additional_fc, std=(1.0 / (module.additional_fc.in_features)) ** 0.5) - module._is_hf_initialized = True + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, IdeficsModel): @@ -1029,21 +1000,20 @@ def __init__(self, config: IdeficsConfig): ) # complain that it's not used - self.image_size = config.vision_image_size - - self.vision_config_dict = config.vision_config_dict - clip_vision_config = CLIPVisionConfig(**self.vision_config_dict) - self.vision_model = CLIPVisionTransformer(clip_vision_config) + self.image_size = config.vision_config.image_size + self.vision_config = config.vision_config + self.vision_model = IdeficsVisionTransformer(config.vision_config) # Perceiver Resampler if config.use_resampler: + perceiver_config = config.perceiver_config self.perceiver_resampler = IdeficsPerceiverResampler( - self.config, - self.config.vision_embed_dim, - config.resampler_depth, - config.resampler_n_heads, - config.resampler_head_dim, - config.resampler_n_latents, + config, + config.vision_embed_dim, + perceiver_config.resampler_depth, + perceiver_config.resampler_n_heads, + perceiver_config.resampler_head_dim, + perceiver_config.resampler_n_latents, ) self.layers = nn.ModuleList([IdeficsDecoderLayer(config) for _ in range(config.num_hidden_layers)]) diff --git a/src/transformers/models/idefics/perceiver.py b/src/transformers/models/idefics/perceiver.py index 43b8a2118285..4ee0b1ca2087 100644 --- a/src/transformers/models/idefics/perceiver.py +++ b/src/transformers/models/idefics/perceiver.py @@ -32,7 +32,7 @@ def __init__(self, config, embed_dim: int, depth: int, n_heads: int, head_dim: i """ super().__init__() self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents - self.qk_layer_norms = config.qk_layer_norms_perceiver + self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver # Create Latents for Perceiver self.latents = nn.Parameter(torch.randn(self.n_latents, self.embed_dim), requires_grad=True) diff --git a/src/transformers/models/idefics/clip.py b/src/transformers/models/idefics/vision.py similarity index 73% rename from src/transformers/models/idefics/clip.py rename to src/transformers/models/idefics/vision.py index 3c4976899007..2f51d3c03be8 100644 --- a/src/transformers/models/idefics/clip.py +++ b/src/transformers/models/idefics/vision.py @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch CLIP model.""" +""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union +from typing import Optional, Tuple, Union import torch import torch.utils.checkpoint @@ -26,48 +26,16 @@ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...utils import ( ModelOutput, - add_start_docstrings_to_model_forward, logging, - replace_return_docstrings, ) -from .configuration_idefics import CLIPConfig, CLIPVisionConfig +from .configuration_idefics import IdeficsVisionConfig logger = logging.get_logger(__name__) -CLIP_VISION_INPUTS_DOCSTRING = "" - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -# contrastive loss function, adapted from -# https://sachinruk.github.io/blog/2021-03-07-clip.html -def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: - return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) - - -def clip_loss(similarity: torch.Tensor) -> torch.Tensor: - caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.t()) - return (caption_loss + image_loss) / 2.0 - - @dataclass -class CLIPVisionModelOutput(ModelOutput): +class IdeficsVisionModelOutput(ModelOutput): """ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. @@ -95,75 +63,9 @@ class CLIPVisionModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -@dataclass -class CLIPTextModelOutput(ModelOutput): - """ - Base class for text model's outputs that also contains a pooling of the last hidden states. - - Args: - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The text embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - - text_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class CLIPOutput(ModelOutput): - """ - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): - Contrastive loss for image-text similarity. - logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): - The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text - similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): - The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image - similarity scores. - text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. - text_model_output(`BaseModelOutputWithPooling`): - The output of the [`CLIPTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): - The output of the [`CLIPVisionModel`]. - """ - - loss: Optional[torch.FloatTensor] = None - logits_per_image: torch.FloatTensor = None - logits_per_text: torch.FloatTensor = None - text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None - text_model_output: BaseModelOutputWithPooling = None - vision_model_output: BaseModelOutputWithPooling = None - - def to_tuple(self) -> Tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) - - -# copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings -class CLIPVisionEmbeddings(nn.Module): - def __init__(self, config: CLIPVisionConfig): +# Adapted from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings +class IdeficsVisionEmbeddings(nn.Module): + def __init__(self, config: IdeficsVisionConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -197,8 +99,8 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: return embeddings -# copied from transformers.models.clip.modeling_clip.CLIPAttention -class CLIPAttention(nn.Module): +# Adapted from transformers.models.clip.modeling_clip.CLIPAttention +class IdeficsVisionAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config): @@ -302,8 +204,8 @@ def forward( return attn_output, attn_weights_reshaped -# copied from transformers.models.clip.modeling_clip.CLIPMLP -class CLIPMLP(nn.Module): +# Adapted from transformers.models.clip.modeling_clip.CLIPMLP +class IdeficsVisionMLP(nn.Module): def __init__(self, config): super().__init__() self.config = config @@ -318,14 +220,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer -class CLIPEncoderLayer(nn.Module): - def __init__(self, config: CLIPConfig): +# Adapted from transformers.models.clip.modeling_clip.CLIPEncoderLayer +class IdeficsVisionEncoderLayer(nn.Module): + def __init__(self, config: IdeficsVisionConfig): super().__init__() self.embed_dim = config.hidden_size - 
self.self_attn = CLIPAttention(config) + self.self_attn = IdeficsVisionAttention(config) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = CLIPMLP(config) + self.mlp = IdeficsVisionMLP(config) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) def forward( @@ -369,20 +271,20 @@ def forward( return outputs -# copied from transformers.models.clip.modeling_clip.CLIPEncoder -class CLIPEncoder(nn.Module): +# Adapted from transformers.models.clip.modeling_clip.CLIPEncoder +class IdeficsVisionEncoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`CLIPEncoderLayer`]. + [`IdeficsVisionEncoderLayer`]. Args: - config: CLIPConfig + config: IdeficsVisionConfig """ - def __init__(self, config: CLIPConfig): + def __init__(self, config: IdeficsVisionConfig): super().__init__() self.config = config - self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -473,20 +375,18 @@ def custom_forward(*inputs): ) -# copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer -class CLIPVisionTransformer(nn.Module): - def __init__(self, config: CLIPVisionConfig): +class IdeficsVisionTransformer(nn.Module): + def __init__(self, config: IdeficsVisionConfig): super().__init__() self.config = config embed_dim = config.hidden_size - self.embeddings = CLIPVisionEmbeddings(config) + self.embeddings = IdeficsVisionEmbeddings(config) self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.encoder = CLIPEncoder(config) + self.encoder = IdeficsVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig) + # copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward def forward( self, pixel_values: Optional[torch.FloatTensor] = None, diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 2978ea5aa4d3..2e47f5d0b7e0 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -29,6 +29,7 @@ import torch from transformers import IdeficsForVisionText2Text, IdeficsModel + from transformers.models.idefics.configuration_idefics import IdeficsVisionConfig from transformers.models.idefics.modeling_idefics import IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_0 else: @@ -112,6 +113,15 @@ def __init__( self.vision_num_hidden_layers = vision_num_hidden_layers self.vision_intermediate_size = vision_intermediate_size + self.vision_config = IdeficsVisionConfig( + embed_dim=self.vision_embed_dim, + patch_size=self.vision_patch_size, + image_size=self.vision_image_size, + num_attention_heads=self.vision_num_attention_heads, + num_hidden_layers=self.vision_num_hidden_layers, + intermediate_size=self.vision_intermediate_size, + ) + # we set the expected sequence length (which is used in several tests) # this is equal to the seq length of the text tokens + number of image patches + 1 for the CLS token self.expected_seq_len = self.seq_length + (self.image_size // self.patch_size) ** 2 + 1 @@ 
-155,12 +165,7 @@ def get_config(self): num_labels=self.num_labels, modality_type_vocab_size=self.modality_type_vocab_size, num_images=self.num_images, - vision_embed_dim=self.vision_embed_dim, - vision_intermediate_size=self.vision_intermediate_size, - vision_num_attention_heads=self.vision_num_attention_heads, - vision_image_size=self.vision_image_size, - vision_patch_size=self.vision_patch_size, - vision_num_hidden_layers=self.vision_num_hidden_layers, + vision_config=self.vision_config, ) def create_and_check_model( @@ -206,13 +211,7 @@ def prepare_pixel_values(self): @require_torch class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else () - pipeline_model_mapping = ( - { - "visual-question-answering": IdeficsForVisionText2Text, - } - if is_torch_available() - else {} - ) + pipeline_model_mapping = {} test_pruning = False test_headmasking = False test_torchscript = False diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index a2043b17bc2f..e0d4a01695fa 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -122,6 +122,8 @@ # TODO: @Younes (for `is_decoder`) "Pix2StructTextConfig": True, "IdeficsConfig": True, + "IdeficsVisionConfig": True, + "IdeficsPerceiverConfig": True, } )
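
A quick usage sketch of the refactored configuration API this diff introduces (an illustration for context, not part of the patch): `IdeficsConfig` now composes an `IdeficsVisionConfig` and an `IdeficsPerceiverConfig` instead of carrying flat `vision_*` / `resampler_*` kwargs. The import path follows the test file above; class and argument names are taken from the new `__init__` signatures in `configuration_idefics.py`.

```python
# Minimal sketch of the composed-config API shown in this diff.
# Import path mirrors tests/models/idefics/test_modeling_idefics.py.
from transformers.models.idefics.configuration_idefics import (
    IdeficsConfig,
    IdeficsPerceiverConfig,
    IdeficsVisionConfig,
)

# The old flat vision_* / resampler_* kwargs are replaced by two sub-configs.
vision_config = IdeficsVisionConfig(embed_dim=768, image_size=224, patch_size=14)
perceiver_config = IdeficsPerceiverConfig(use_resampler=True, resampler_n_latents=64)

# vision_config / perceiver_config accept a config object, a plain dict, or
# None (which falls back to the defaults defined in each sub-config class).
config = IdeficsConfig(
    use_resampler=True,
    vision_config=vision_config,
    perceiver_config=perceiver_config,
)

# attribute_map exposes hidden_size as an alias of embed_dim, and IdeficsConfig
# mirrors it as vision_embed_dim for the modeling code.
assert config.vision_config.hidden_size == config.vision_embed_dim == 768

# to_dict() now nests the serialized sub-configs instead of a vision_config_dict.
serialized = config.to_dict()
assert "vision_config" in serialized and "perceiver_config" in serialized
```

Passing dicts works the same way, since `IdeficsConfig.__init__` re-wraps them in the corresponding sub-config class before storing them.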