From d3cd1e97ded3305a8d5f37ce3c8f1cd57efecfaf Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 25 Oct 2024 12:32:36 +0200 Subject: [PATCH 01/10] save/load sub-configs --- src/transformers/configuration_utils.py | 20 ++++- .../models/align/configuration_align.py | 44 ++-------- .../models/altclip/configuration_altclip.py | 25 +----- .../models/bark/configuration_bark.py | 41 ++------- .../models/blip/configuration_blip.py | 44 ++-------- .../models/blip_2/configuration_blip_2.py | 45 ++-------- .../bridgetower/configuration_bridgetower.py | 38 ++------- .../chameleon/configuration_chameleon.py | 3 + .../configuration_chinese_clip.py | 41 +-------- .../models/clap/configuration_clap.py | 44 ++-------- .../models/clip/configuration_clip.py | 44 ++-------- .../models/clipseg/configuration_clipseg.py | 44 ++-------- .../models/clvp/configuration_clvp.py | 27 ++---- .../models/dbrx/configuration_dbrx.py | 41 ++------- .../configuration_encoder_decoder.py | 3 + .../configuration_fastspeech2_conformer.py | 4 +- .../models/flava/configuration_flava.py | 84 +++---------------- .../models/git/configuration_git.py | 23 +---- .../models/groupvit/configuration_groupvit.py | 44 ++-------- .../models/idefics/configuration_idefics.py | 7 +- .../models/idefics2/configuration_idefics2.py | 31 ++----- .../models/idefics3/configuration_idefics3.py | 30 ++----- .../configuration_instructblip.py | 45 ++-------- .../configuration_instructblipvideo.py | 44 ++-------- .../models/kosmos2/configuration_kosmos2.py | 45 ++-------- .../models/llava/configuration_llava.py | 4 +- .../llava_next/configuration_llava_next.py | 3 + .../configuration_llava_next_video.py | 4 +- .../modular_llava_next_video.py | 4 +- .../configuration_llava_onevision.py | 3 + .../models/mllama/configuration_mllama.py | 43 ++-------- .../models/moshi/configuration_moshi.py | 3 +- .../models/mpt/configuration_mpt.py | 21 +---- .../models/musicgen/configuration_musicgen.py | 5 ++ .../configuration_musicgen_melody.py | 5 ++ 
.../models/owlv2/configuration_owlv2.py | 58 ++----------- .../models/owlvit/configuration_owlvit.py | 58 ++----------- .../models/qwen2_vl/configuration_qwen2_vl.py | 23 +---- .../models/siglip/configuration_siglip.py | 44 ++-------- .../configuration_speech_encoder_decoder.py | 3 + .../video_llava/configuration_video_llava.py | 3 + .../models/vipllava/configuration_vipllava.py | 3 + .../configuration_vision_encoder_decoder.py | 3 + .../configuration_vision_text_dual_encoder.py | 3 + .../models/x_clip/configuration_x_clip.py | 44 ++-------- tests/models/align/test_modeling_align.py | 10 +++ tests/models/altclip/test_modeling_altclip.py | 10 +++ tests/models/blip/test_modeling_blip.py | 8 ++ tests/models/blip_2/test_modeling_blip_2.py | 7 ++ .../bridgetower/test_modeling_bridgetower.py | 1 + .../chameleon/test_modeling_chameleon.py | 1 + tests/models/clap/test_modeling_clap.py | 8 ++ tests/models/clip/test_modeling_clip.py | 8 ++ tests/models/clipseg/test_modeling_clipseg.py | 8 ++ tests/models/clvp/test_modeling_clvp.py | 9 +- tests/models/flava/test_modeling_flava.py | 8 ++ tests/models/git/test_modeling_git.py | 1 + .../models/groupvit/test_modeling_groupvit.py | 8 ++ .../test_modeling_instructblip.py | 9 ++ .../test_modeling_instructblipvideo.py | 7 ++ tests/models/llava/test_modeling_llava.py | 9 +- .../llava_next/test_modeling_llava_next.py | 9 +- .../test_modeling_llava_next_video.py | 9 +- .../test_modeling_llava_onevision.py | 9 +- .../test_modeling_musicgen_melody.py | 5 -- tests/models/owlv2/test_modeling_owlv2.py | 8 ++ tests/models/owlvit/test_modeling_owlvit.py | 8 ++ tests/models/siglip/test_modeling_siglip.py | 4 + .../video_llava/test_modeling_video_llava.py | 9 +- .../models/vipllava/test_modeling_vipllava.py | 9 +- tests/models/x_clip/test_modeling_x_clip.py | 8 ++ tests/test_configuration_common.py | 35 +++++++- 72 files changed, 434 insertions(+), 989 deletions(-) diff --git a/src/transformers/configuration_utils.py 
b/src/transformers/configuration_utils.py index 1d892c49a231..1afdab4e6940 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -190,6 +190,7 @@ class PretrainedConfig(PushToHubMixin): """ model_type: str = "" + base_config_key: str = "" is_composition: bool = False attribute_map: Dict[str, str] = {} _auto_class: Optional[str] = None @@ -543,11 +544,22 @@ def from_pretrained( cls._set_token_in_kwargs(kwargs, token) config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + if cls.base_config_key and cls.base_config_key in config_dict: + config_dict = config_dict[cls.base_config_key] + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + # sometimes the config has no `base_config_key` if the config is used in several composite models + # e.g. LlamaConfig. In that case we try to see if there is match in `model_type` before raising a warning + for k, v in config_dict.items(): + if isinstance(v, dict) and v.get("model_type") == cls.model_type: + config_dict = v + + # raise warning only if we still can't see a match in `model_type` + if config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) return cls.from_dict(config_dict, **kwargs) diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 99fa81b4a935..e3922be387c8 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -14,8 +14,7 @@ # limitations under the License. """ALIGN model configuration""" -import os -from typing import TYPE_CHECKING, List, Union +from typing import TYPE_CHECKING, List if TYPE_CHECKING: @@ -95,6 +94,7 @@ class AlignTextConfig(PretrainedConfig): ```""" model_type = "align_text_model" + base_config_key = "text_config" def __init__( self, @@ -133,24 +133,6 @@ def __init__( self.use_cache = use_cache self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class AlignVisionConfig(PretrainedConfig): r""" @@ -223,6 +205,7 @@ class AlignVisionConfig(PretrainedConfig): ```""" model_type = "align_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -272,24 +255,6 @@ def __init__( self.drop_connect_rate = drop_connect_rate self.num_hidden_layers = sum(num_block_repeats) * 4 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AlignConfig(PretrainedConfig): r""" @@ -340,6 +305,9 @@ class AlignConfig(PretrainedConfig): ```""" model_type = "align" + sub_configs = ["text_config", "vision_config"] + text_config_class = "AlignTextConfig" + vision_config_class = "AlignVisionConfig" def __init__( self, diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 7333fa63a352..924d9bda5b09 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""AltCLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -199,6 +196,7 @@ class AltCLIPVisionConfig(PretrainedConfig): ```""" model_type = "altclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -233,24 +231,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AltCLIPConfig - if config_dict.get("model_type") == "altclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AltCLIPConfig(PretrainedConfig): r""" @@ -298,6 +278,9 @@ class AltCLIPConfig(PretrainedConfig): ```""" model_type = "altclip" + sub_configs = ["text_config", "vision_config"] + text_config_class = "AltCLIPTextConfig" + vision_config_class = "AltCLIPVisionConfig" def __init__( self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index 6dd08b65e89e..3c8659588511 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -14,8 +14,7 @@ # limitations under the License. 
"""BARK model configuration""" -import os -from typing import Dict, Optional, Union +from typing import Dict from ...configuration_utils import PretrainedConfig from ...utils import add_start_docstrings, logging @@ -64,7 +63,6 @@ class BarkSubModelConfig(PretrainedConfig): - model_type = "bark_module" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { @@ -101,38 +99,6 @@ def __init__( super().__init__(**kwargs) - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - local_files_only: bool = False, - token: Optional[Union[str, bool]] = None, - revision: str = "main", - **kwargs, - ) -> "PretrainedConfig": - kwargs["cache_dir"] = cache_dir - kwargs["force_download"] = force_download - kwargs["local_files_only"] = local_files_only - kwargs["revision"] = revision - - cls._set_token_in_kwargs(kwargs, token) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the config dict if we are loading from Bark - if config_dict.get("model_type") == "bark": - config_dict = config_dict[f"{cls.model_type}_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - @add_start_docstrings( BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"), @@ -265,6 +231,11 @@ class BarkConfig(PretrainedConfig): """ model_type = "bark" + sub_configs = ["semantic_config", "coarse_acoustics_config", "fine_acoustics_config", "codec_config"] + semantic_config_class = "BarkSemanticConfig" + coarse_acoustics_config_class = "BarkCoarseConfig" + fine_acoustics_config_class = "BarkFineConfig" + codec_config_class = "AutoConfig" def __init__( self, diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 4772738be103..2d97b4b19314 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -14,9 +14,6 @@ # limitations under the License. """Blip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -96,6 +93,7 @@ class BlipTextConfig(PretrainedConfig): ```""" model_type = "blip_text_model" + base_config_key = "text_config" def __init__( self, @@ -146,24 +144,6 @@ def __init__( self.use_cache = use_cache self.label_smoothing = label_smoothing - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class BlipVisionConfig(PretrainedConfig): r""" @@ -215,6 +195,7 @@ class BlipVisionConfig(PretrainedConfig): ```""" model_type = "blip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -245,24 +226,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class BlipConfig(PretrainedConfig): r""" @@ -316,6 +279,9 @@ class BlipConfig(PretrainedConfig): ```""" model_type = "blip" + sub_configs = ["text_config", "vision_config"] + text_config_class = "BlipTextConfig" + vision_config_class = "BlipVisionConfig" def __init__( self, diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 16fa4aec3849..20acce294852 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -14,8 +14,7 @@ # limitations under the License. 
"""BLIP-2 model configuration""" -import os -from typing import Optional, Union +from typing import Optional from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES @@ -76,6 +75,7 @@ class Blip2VisionConfig(PretrainedConfig): ```""" model_type = "blip_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -106,24 +106,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2QFormerConfig(PretrainedConfig): r""" @@ -190,6 +172,7 @@ class Blip2QFormerConfig(PretrainedConfig): ```""" model_type = "blip_2_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +212,6 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size self.use_qformer_text_input = use_qformer_text_input - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2Config(PretrainedConfig): r""" @@ -306,6 +271,10 @@ class Blip2Config(PretrainedConfig): ```""" model_type = "blip-2" + sub_configs = ["text_config", "qformer_config", "vision_config"] + text_config_class = "AutoConfig" + qformer_config_class = "Blip2QFormerConfig" + vision_config_class = "Blip2VisionConfig" def __init__( self, diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 4985b6ef89fe..b60d2cdc5559 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""BridgeTower model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -68,6 +65,7 @@ class BridgeTowerVisionConfig(PretrainedConfig): ```""" model_type = "bridgetower_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -95,21 +93,6 @@ def __init__( self.share_layernorm = share_layernorm self.remove_last_layer = remove_last_layer - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerTextConfig(PretrainedConfig): r""" @@ -175,6 +158,7 @@ class BridgeTowerTextConfig(PretrainedConfig): ```""" model_type = "bridgetower_text_model" + base_config_key = "text_config" def __init__( self, @@ -217,21 +201,6 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerConfig(PretrainedConfig): r""" @@ -288,6 +257,9 @@ class BridgeTowerConfig(PretrainedConfig): ```""" model_type = "bridgetower" + sub_configs = ["text_config", "vision_config"] + text_config_class = "BridgeTowerTextConfig" + vision_config_class = "BridgeTowerVisionConfig" def __init__( self, diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 67de37f2d01b..cfc2a10af69f 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -62,6 +62,7 @@ class ChameleonVQVAEConfig(PretrainedConfig): """ model_type = "chameleon_vqgan" + base_config_key = "vq_config" def __init__( self, @@ -187,6 +188,8 @@ class ChameleonConfig(PretrainedConfig): ```""" model_type = "chameleon" + sub_configs = ["vq_config"] + vq_config_class = "ChameleonVQVAEConfig" keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 5b37044fab50..11f1560f3f13 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""Chinese-CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -102,6 +101,7 @@ class ChineseCLIPTextConfig(PretrainedConfig): ```""" model_type = "chinese_clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -141,24 +141,6 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPVisionConfig(PretrainedConfig): r""" @@ -215,6 +197,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig): ```""" model_type = "chinese_clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -249,24 +232,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPConfig(PretrainedConfig): r""" diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 1425e2a86289..aed814258f4e 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLAP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -94,6 +91,7 @@ class ClapTextConfig(PretrainedConfig): ```""" model_type = "clap_text_model" + base_config_key = "text_config" def __init__( self, @@ -137,24 +135,6 @@ def __init__( self.projection_hidden_act = projection_hidden_act self.projection_dim = projection_dim - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class ClapAudioConfig(PretrainedConfig): r""" @@ -245,6 +225,7 @@ class ClapAudioConfig(PretrainedConfig): ```""" model_type = "clap_audio_model" + base_config_key = "audio_config" def __init__( self, @@ -307,24 +288,6 @@ def __init__( self.initializer_factor = initializer_factor self.projection_hidden_act = projection_hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the audio config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["audio_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClapConfig(PretrainedConfig): r""" @@ -377,6 +340,9 @@ class ClapConfig(PretrainedConfig): ```""" model_type = "clap" + sub_configs = ["text_config", "audio_config"] + text_config_class = "ClapTextConfig" + audio_config_class = "ClapAudioConfig" def __init__( self, diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 8e027f5c3f01..f19f2df136fa 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -93,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig): ```""" model_type = "clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -130,24 +130,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPVisionConfig(PretrainedConfig): r""" @@ -205,6 +187,7 @@ class CLIPVisionConfig(PretrainedConfig): ```""" model_type = "clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +222,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPConfig(PretrainedConfig): r""" @@ -305,6 +270,9 @@ class CLIPConfig(PretrainedConfig): ```""" model_type = "clip" + sub_configs = ["text_config", "vision_config"] + text_config_class = "CLIPTextConfig" + vision_config_class = "CLIPVisionConfig" def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 0ac8196fc7f5..9a9fec755d95 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLIPSeg model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -84,6 +81,7 @@ class CLIPSegTextConfig(PretrainedConfig): ```""" model_type = "clipseg_text_model" + base_config_key = "text_config" def __init__( self, @@ -117,24 +115,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegVisionConfig(PretrainedConfig): r""" @@ -190,6 +170,7 @@ class CLIPSegVisionConfig(PretrainedConfig): ```""" model_type = "clipseg_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -222,24 +203,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegConfig(PretrainedConfig): r""" @@ -306,6 +269,9 @@ class CLIPSegConfig(PretrainedConfig): ```""" model_type = "clipseg" + sub_configs = ["text_config", "vision_config"] + text_config_class = "CLIPSegTextConfig" + vision_config_class = "CLIPSegVisionConfig" def __init__( self, diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index d17a04c861bf..a46de9337c55 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -91,6 +91,7 @@ class ClvpEncoderConfig(PretrainedConfig): ```""" model_type = "clvp_encoder" + base_config_key = ["text_config", "speech_config"] def __init__( self, @@ -141,7 +142,7 @@ def from_pretrained( # make sure to have the config_type be either "text_config" or "speech_config" # this is to make sure that we can load only text or speech configs from the nested ClvpConfig. - if config_type not in ["text_config", "speech_config"]: + if config_type not in cls.base_config_key: raise ValueError( f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}" ) @@ -253,6 +254,7 @@ class ClvpDecoderConfig(PretrainedConfig): ```""" model_type = "clvp_decoder" + base_config_key = "decoder_config" def __init__( self, @@ -314,24 +316,6 @@ def __init__( super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the speech config dict if we are loading from ClvpConfig - if config_dict.get("model_type") == "clvp": - config_dict = config_dict["decoder_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and 
config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClvpConfig(PretrainedConfig): r""" @@ -386,7 +370,10 @@ class ClvpConfig(PretrainedConfig): ```""" model_type = "clvp" - is_composition = True + sub_configs = ["text_config", "speech_config", "decoder_config"] + text_config_class = "ClvpEncoderConfig" + speech_config_class = "ClvpEncoderConfig" + decoder_config_class = "ClvpDecoderConfig" def __init__( self, diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index dde5232ae5cc..0d7836b98e35 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -41,6 +41,8 @@ class DbrxAttentionConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope. """ + base_config_key = "attn_config" + def __init__( self, attn_pdrop: float = 0.0, @@ -61,23 +63,6 @@ def __init__( if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxFFNConfig(PretrainedConfig): """Configuration class for Dbrx FFN. @@ -100,6 +85,8 @@ class DbrxFFNConfig(PretrainedConfig): moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights. """ + base_config_key = "ffn_config" + def __init__( self, ffn_act_fn: dict = None, @@ -128,23 +115,6 @@ def __init__( if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["ffn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxConfig(PretrainedConfig): r""" @@ -202,6 +172,9 @@ class DbrxConfig(PretrainedConfig): """ model_type = "dbrx" + sub_configs = ["attn_config", "ffn_config"] + attn_config_class = "DbrxAttentionConfig" + ffn_config_class = "DbrxFFNConfig" attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index ab5d49b32fea..6cacc9cd736e 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -70,6 +70,9 @@ class EncoderDecoderConfig(PretrainedConfig): ```""" model_type = "encoder-decoder" + sub_configs = ["encoder_config", "decoder_config"] + encoder_config_class = "AutoConfig" + decoder_config_class = "AutoConfig" is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index ade5b8b26675..5d6b968de708 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -453,7 +453,9 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): """ model_type = "fastspeech2_conformer_with_hifigan" - is_composition = True + sub_configs = ["model_config", "vocoder_config"] + model_config_class = "FastSpeech2ConformerConfig" + vocoder_config_class = "FastSpeech2ConformerHifiGanConfig" def __init__( self, diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index b6349361c0dd..0dc37fbeaa17 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ 
b/src/transformers/models/flava/configuration_flava.py @@ -14,8 +14,7 @@ # limitations under the License. """FLAVA model configurations""" -import os -from typing import Any, Dict, Union +from typing import Any, Dict from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -86,6 +85,7 @@ class FlavaImageConfig(PretrainedConfig): ```""" model_type = "flava_image_model" + base_config_key = "image_config" def __init__( self, @@ -124,24 +124,6 @@ def __init__( self.mask_token = mask_token self.vocab_size = vocab_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaTextConfig(PretrainedConfig): r""" @@ -216,6 +198,7 @@ class FlavaTextConfig(PretrainedConfig): ```""" model_type = "flava_text_model" + base_config_key = "text_config" def __init__( self, @@ -254,24 +237,6 @@ def __init__( self.qkv_bias = qkv_bias self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaMultimodalConfig(PretrainedConfig): r""" @@ -327,6 +292,7 @@ class FlavaMultimodalConfig(PretrainedConfig): ```""" model_type = "flava_multimodal_model" + base_config_key = "multimodal_config" def __init__( self, @@ -357,27 +323,10 @@ def __init__( self.qkv_bias = qkv_bias self.use_cls_token = use_cls_token - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the multimodal config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["multimodal_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaImageCodebookConfig(PretrainedConfig): model_type = "flava_image_codebook" + base_config_key = "image_codebook_config" r""" [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. 
It @@ -442,24 +391,6 @@ def __init__( self.freeze = freeze self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image codebook config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_codebook_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaConfig(PretrainedConfig): r""" @@ -532,6 +463,11 @@ class FlavaConfig(PretrainedConfig): """ model_type = "flava" + sub_configs = ["text_config", "image_config", "multimodal_config", "image_codebook_config"] + text_config_class = "FlavaTextConfig" + image_config_class = "FlavaImageConfig" + multimodal_config_class = "FlavaMultimodalConfig" + image_codebook_config_class = "FlavaImageCodebookConfig" def __init__( self, diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index ecaea17ff946..5cd7f33315e7 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -72,6 +70,7 @@ class GitVisionConfig(PretrainedConfig): ```""" model_type = "git_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -102,24 +101,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GITConfig - if config_dict.get("model_type") == "git": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GitConfig(PretrainedConfig): r""" @@ -186,6 +167,8 @@ class GitConfig(PretrainedConfig): ```""" model_type = "git" + sub_configs = ["vision_config"] + vision_config_class = "GitVisionConfig" def __init__( self, diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index e608fbcdbe9c..0497712b83c6 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""GroupViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig @@ -86,6 +85,7 @@ class GroupViTTextConfig(PretrainedConfig): ```""" model_type = "groupvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -121,24 +121,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTVisionConfig(PretrainedConfig): r""" @@ -197,6 +179,7 @@ class GroupViTVisionConfig(PretrainedConfig): ```""" model_type = "groupvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -246,24 +229,6 @@ def __init__( self.assign_eps = assign_eps self.assign_mlp_ratio = assign_mlp_ratio - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTConfig(PretrainedConfig): r""" @@ -292,6 +257,9 @@ class GroupViTConfig(PretrainedConfig): """ model_type = "groupvit" + sub_configs = ["text_config", "vision_config"] + text_config_class = "GroupViTTextConfig" + vision_config_class = "GroupViTVisionConfig" def __init__( self, diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 56b6025a8e89..1aac36a497a3 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -68,7 +68,7 @@ class IdeficsVisionConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
""" - model_type = "idefics" + model_type = "idefics_vision" attribute_map = { "hidden_size": "embed_dim", } @@ -131,7 +131,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Whether or not to use qk layer norms in perceiver """ - model_type = "idefics" + model_type = "idefics_perciever" def __init__( self, @@ -235,6 +235,9 @@ class IdeficsConfig(PretrainedConfig): ```""" model_type = "idefics" + sub_configs = ["perceiver_config", "vision_config"] + perceiver_config_class = "IdeficsPerceiverConfig" + vision_config_class = "IdeficsVisionConfig" is_composition = False def __init__( diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py index 64743d1cd470..44c093b405e4 100644 --- a/src/transformers/models/idefics2/configuration_idefics2.py +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -13,9 +13,6 @@ # limitations under the License. """Idefics2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging from ..auto import CONFIG_MAPPING @@ -76,7 +73,8 @@ class Idefics2VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics2" + model_type = "idefics2_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +105,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics2Config - if config_dict.get("model_type") == "idefics2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - 
logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics2PerceiverConfig(PretrainedConfig): r""" @@ -152,7 +132,7 @@ class Idefics2PerceiverConfig(PretrainedConfig): The dropout ratio for the attention probabilities. """ - model_type = "idefics2" + model_type = "idefics2_perceiver" def __init__( self, @@ -220,7 +200,10 @@ class Idefics2Config(PretrainedConfig): ```""" model_type = "idefics2" - is_composition = True + sub_configs = ["text_config", "perceiver_config", "vision_config"] + text_config_class = "AutoConfig" + perceiver_config_class = "Idefics2PerceiverConfig" + vision_config_class = "Idefics2VisionConfig" def __init__( self, diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py index 45afe685f520..c5336b90a3eb 100644 --- a/src/transformers/models/idefics3/configuration_idefics3.py +++ b/src/transformers/models/idefics3/configuration_idefics3.py @@ -13,9 +13,6 @@ # limitations under the License. 
"""Idefics3 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging from ..auto import CONFIG_MAPPING @@ -76,7 +73,8 @@ class Idefics3VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics3" + model_type = "idefics3_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +105,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics3Config - if config_dict.get("model_type") == "idefics3": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics3Config(PretrainedConfig): r""" @@ -165,7 +145,9 @@ class Idefics3Config(PretrainedConfig): ```""" model_type = "idefics3" - is_composition = True + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "Idefics3VisionConfig" def __init__( self, @@ -204,4 +186,4 @@ def __init__( self.text_config = text_config self.scale_factor = scale_factor - super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings) + super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings) diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index a274212a945e..212786d74b7c 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""InstructBLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging @@ -78,6 +75,7 @@ class InstructBlipVisionConfig(PretrainedConfig): ```""" model_type = "instructblip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -108,24 +106,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipConfig - if config_dict.get("model_type") == "instructblip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipQFormerConfig(PretrainedConfig): r""" @@ -192,6 +172,7 @@ class InstructBlipQFormerConfig(PretrainedConfig): ```""" model_type = "instructblip_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +210,6 @@ def __init__( self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipConfig - if config_dict.get("model_type") == "instructblip": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipConfig(PretrainedConfig): r""" @@ -305,6 +268,10 @@ class InstructBlipConfig(PretrainedConfig): ```""" model_type = "instructblip" + sub_configs = ["text_config", "qformer_config", "vision_config"] + text_config_class = "AutoConfig" + qformer_config_class = "InstructBlipQFormerConfig" + vision_config_class = "InstructBlipVisionConfig" def __init__( self, diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index e7c8eeccef98..edfa5cc58bd4 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -19,8 +19,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES @@ -83,6 +81,7 @@ class InstructBlipVideoVisionConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -113,24 +112,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to 
instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoQFormerConfig(PretrainedConfig): r""" @@ -197,6 +178,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -234,24 +216,6 @@ def __init__( self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoConfig(PretrainedConfig): r""" @@ -310,6 +274,10 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = ["text_config", "qformer_config", "vision_config"] + text_config_class = "AutoConfig" + qformer_config_class = "InstructBlipVideoQFormerConfig" + vision_config_class = "InstructBlipVideoVisionConfig" def __init__( self, diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index e49074f8061b..40142b2431e5 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -14,9 +14,6 @@ # limitations under the License. """KOSMOS-2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -72,6 +69,7 @@ class Kosmos2TextConfig(PretrainedConfig): ```""" model_type = "kosmos_2_text_model" + base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { "num_attention_heads": "attention_heads", @@ -124,24 +122,6 @@ def __init__( self.scale_embedding = scale_embedding self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2VisionConfig(PretrainedConfig): r""" @@ -183,6 +163,7 @@ class Kosmos2VisionConfig(PretrainedConfig): ```""" model_type = "kosmos_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -215,24 +196,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2Config(PretrainedConfig): r""" @@ -267,7 +230,9 @@ class Kosmos2Config(PretrainedConfig): ```""" model_type = "kosmos-2" - is_composition = True + sub_configs = ["text_config", "vision_config"] + text_config_class = "Kosmos2TextConfig" + vision_config_class = "Kosmos2VisionConfig" def __init__( self, diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 3a4cb09855f0..1aec99bd2864 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -73,7 +73,9 @@ class LlavaConfig(PretrainedConfig): ```""" model_type = "llava" - is_composition = True + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" def __init__( self, diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index e8768dde8572..0bab95d646ab 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -78,6 +78,9 @@ class LlavaNextConfig(PretrainedConfig): ```""" model_type = "llava_next" + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" is_composition = False def __init__( diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 0e4e39b4b3ab..0341d2f009a7 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -86,7 +86,9 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = ["text_config", 
"vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" def __init__( self, diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index c1ed7571941b..5c117ba6a4b7 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -100,7 +100,9 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" def __init__( self, diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index eef86c6c8c01..bcb15c4fa484 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -81,6 +81,9 @@ class LlavaOnevisionConfig(PretrainedConfig): ```""" model_type = "llava_onevision" + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" is_composition = False def __init__( diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 539fc61ba4ed..b7c87210e615 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -13,8 +13,7 @@ # limitations under the License. 
"""Mllama model configuration""" -import os -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation @@ -88,6 +87,7 @@ class MllamaVisionConfig(PretrainedConfig): ```""" model_type = "mllama_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -137,23 +137,6 @@ def __init__( def max_aspect_ratio_id(self) -> int: return len(self.supported_aspect_ratios) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaTextConfig(PretrainedConfig): r""" @@ -259,6 +242,7 @@ class MllamaTextConfig(PretrainedConfig): ```""" model_type = "mllama_text_model" + base_config_key = "text_config" def __init__( self, @@ -311,23 +295,6 @@ def __init__( **kwargs, ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaConfig(PretrainedConfig): r""" @@ -370,7 +337,9 @@ class MllamaConfig(PretrainedConfig): ```""" model_type = "mllama" - is_composition = True + sub_configs = ["text_config", "vision_config"] + text_config_class = "MllamaTextConfig" + vision_config_class = "MllamaVisionConfig" def __init__( self, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 654e4e82a491..a0a3848fede5 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -235,8 +235,9 @@ class MoshiConfig(PretrainedConfig): ```""" model_type = "moshi" - is_composition = True keys_to_ignore_at_inference = ["past_key_values"] + sub_configs = ["audio_encoder_config"] + audio_encoder_config_class = "AutoConfig" def __init__( self, diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py 
index ed822c813ba2..2e0fd0f848a5 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -66,6 +66,8 @@ class MptAttentionConfig(PretrainedConfig): The maximum value of the alibi bias. """ + base_config_key = "attn_config" + def __init__( self, attn_type="multihead_attention", @@ -97,23 +99,6 @@ def __init__( f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}" ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mpt": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class MptConfig(PretrainedConfig): """ @@ -188,6 +173,8 @@ class MptConfig(PretrainedConfig): """ model_type = "mpt" + sub_configs = ["attn_config"] + attn_config_class = "MptAttentionConfig" attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 0d282355defa..b7a8eb9de72c 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -76,6 +76,7 @@ class MusicgenDecoderConfig(PretrainedConfig): """ model_type = "musicgen_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -189,6 +190,10 @@ class MusicgenConfig(PretrainedConfig): ```""" model_type = "musicgen" + sub_configs = ["text_encoder_config", "audio_encoder_config", "decoder_config"] + text_encoder_config_class = "AutoConfig" + audio_encoder_config_class = "AutoConfig" + decoder_config_class = "MusicgenDecoderConfig" is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 8a77cea02522..94ceddce00ea 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -78,6 +78,7 @@ class MusicgenMelodyDecoderConfig(PretrainedConfig): """ model_type = "musicgen_melody_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -195,6 +196,10 @@ class MusicgenMelodyConfig(PretrainedConfig): ```""" model_type = "musicgen_melody" + sub_configs = ["text_encoder_config", "audio_encoder_config", "decoder_config"] + text_encoder_config_class = "AutoConfig" + audio_encoder_config_class = 
"AutoConfig" + decoder_config_class = "MusicgenMelodyDecoderConfig" is_composition = True def __init__( diff --git a/src/transformers/models/owlv2/configuration_owlv2.py b/src/transformers/models/owlv2/configuration_owlv2.py index 43019553c5c6..82d8c7e3f0f1 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -14,8 +14,7 @@ # limitations under the License. """OWLv2 model configuration""" -import os -from typing import TYPE_CHECKING, Dict, Union +from typing import TYPE_CHECKING, Dict if TYPE_CHECKING: @@ -90,6 +89,7 @@ class Owlv2TextConfig(PretrainedConfig): ```""" model_type = "owlv2_text_model" + base_config_key = "text_config" def __init__( self, @@ -123,24 +123,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16 class Owlv2VisionConfig(PretrainedConfig): @@ -197,6 +179,7 @@ class Owlv2VisionConfig(PretrainedConfig): ```""" model_type = "owlv2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -229,24 +212,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2 class Owlv2Config(PretrainedConfig): @@ -276,6 +241,9 @@ class Owlv2Config(PretrainedConfig): """ model_type = "owlv2" + sub_configs = ["text_config", "vision_config"] + text_config_class = "Owlv2TextConfig" + vision_config_class = "Owlv2VisionConfig" def __init__( self, @@ -304,20 +272,6 @@ def __init__( self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 877b348f32c1..fec49c732453 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""OWL-ViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional if TYPE_CHECKING: @@ -92,6 +91,7 @@ class OwlViTTextConfig(PretrainedConfig): ```""" model_type = "owlvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -125,24 +125,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTVisionConfig(PretrainedConfig): r""" @@ -198,6 +180,7 @@ class OwlViTVisionConfig(PretrainedConfig): ```""" model_type = "owlvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -230,24 +213,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTConfig(PretrainedConfig): r""" @@ -276,6 +241,9 @@ class OwlViTConfig(PretrainedConfig): """ model_type = "owlvit" + sub_configs = ["text_config", "vision_config"] + text_config_class = "OwlViTTextConfig" + vision_config_class = "OwlViTVisionConfig" def __init__( self, @@ -304,20 +272,6 @@ def __init__( self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 1349006e768c..3f466c542dd3 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Qwen2VL model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation from ...utils import logging @@ -27,6 +24,7 @@ class Qwen2VLVisionConfig(PretrainedConfig): model_type = "qwen2_vl" + base_config_key = "vision_config" def __init__( self, @@ -55,23 +53,6 @@ def __init__( self.spatial_merge_size = spatial_merge_size self.temporal_patch_size = temporal_patch_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "qwen2_vl": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Qwen2VLConfig(PretrainedConfig): r""" @@ -180,6 +161,8 @@ class Qwen2VLConfig(PretrainedConfig): ```""" model_type = "qwen2_vl" + sub_configs = ["vision_config"] + vision_config_class = "Qwen2VLVisionConfig" keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index 73622373cbab..b63988c05483 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Siglip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class SiglipTextConfig(PretrainedConfig): ```""" model_type = "siglip_text_model" + base_config_key = "text_config" def __init__( self, @@ -110,24 +108,6 @@ def __init__( self.hidden_act = hidden_act self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipVisionConfig(PretrainedConfig): r""" @@ -178,6 +158,7 @@ class SiglipVisionConfig(PretrainedConfig): ```""" model_type = "siglip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -206,24 +187,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipConfig(PretrainedConfig): r""" @@ -268,6 +231,9 @@ class SiglipConfig(PretrainedConfig): ```""" model_type = "siglip" + sub_configs = ["text_config", "vision_config"] + text_config_class = "SiglipTextConfig" + vision_config_class = "SiglipVisionConfig" def __init__(self, text_config=None, vision_config=None, **kwargs): super().__init__(**kwargs) diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py index 32a58ec5589e..c7820708e397 100644 --- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py @@ -71,6 +71,9 @@ class SpeechEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "speech-encoder-decoder" + sub_configs = ["encoder_config", "decoder_config"] + encoder_config_class = "AutoConfig" + decoder_config_class = "AutoConfig" is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index 8738a02585e0..54376162af29 100644 --- a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -78,6 +78,9 @@ class VideoLlavaConfig(PretrainedConfig): ```""" model_type = "video_llava" + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" is_composition = False def __init__( diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index f88be5adfba0..2cbefb70a47b 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ 
b/src/transformers/models/vipllava/configuration_vipllava.py @@ -72,6 +72,9 @@ class VipLlavaConfig(PretrainedConfig): ```""" model_type = "vipllava" + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" is_composition = False def __init__( diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index a4aa663f9852..21c17652254a 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -78,6 +78,9 @@ class VisionEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "vision-encoder-decoder" + sub_configs = ["encoder_config", "decoder_config"] + encoder_config_class = "AutoConfig" + decoder_config_class = "AutoConfig" is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index 4cea34ca2313..3973e090f9af 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -75,6 +75,9 @@ class VisionTextDualEncoderConfig(PretrainedConfig): ```""" model_type = "vision-text-dual-encoder" + sub_configs = ["text_config", "vision_config"] + text_config_class = "AutoConfig" + vision_config_class = "AutoConfig" is_composition = True def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 827046b6c353..80df301cf1b5 100644 --- 
a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -14,9 +14,6 @@ # limitations under the License. """X-CLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class XCLIPTextConfig(PretrainedConfig): ```""" model_type = "xclip_text_model" + base_config_key = "text_config" def __init__( self, @@ -112,24 +110,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPVisionConfig(PretrainedConfig): r""" @@ -195,6 +175,7 @@ class XCLIPVisionConfig(PretrainedConfig): ```""" model_type = "xclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +220,6 @@ def __init__( self.hidden_act = hidden_act self.drop_path_rate = drop_path_rate - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPConfig(PretrainedConfig): r""" @@ -295,6 +258,9 @@ class XCLIPConfig(PretrainedConfig): """ model_type = "xclip" + sub_configs = ["text_config", "vision_config"] + text_config_class = "XCLIPTextConfig" + vision_config_class = "XCLIPVisionConfig" def __init__( self, diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index ddeb585a757d..b0181aec8b63 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -457,11 +457,21 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AlignModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AlignConfig, + has_text_modality=False, + common_properties=["projection_dim", "temperature_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="Start to fail after using torch `cu118`.") def test_multi_gpu_data_parallel_forward(self): super().test_multi_gpu_data_parallel_forward() diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 0175e562eda6..c492f991ad2a 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -452,11 +452,21 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = AltCLIPModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AltCLIPConfig, + has_text_modality=False, + common_properties=["projection_dim", "logit_scale_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() 
self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 2f8ee3229ff2..d0839132e1a8 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -448,11 +448,19 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BlipModelTester(self) + common_properties = ["logit_scale_init_value", "image_text_hidden_size", "projection_dim", "label_smoothing"] + self.config_tester = ConfigTester( + self, config_class=BlipConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index e5d04bd85a34..3d915e61e0be 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -461,6 +461,13 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT def setUp(self): self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["image_token_index", "num_query_tokens", "image_text_hidden_size"] + self.config_tester = ConfigTester( + self, config_class=Blip2Config, has_text_modality=False, 
common_properties=common_properties + ) + + def test_config(self): + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index cceeee4912dc..db2ec1a3aeb3 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -340,6 +340,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index aad26ef147e8..9eaac111d63d 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -293,6 +293,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 9f8cc62d2e0f..84724e580353 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -515,11 +515,19 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = ClapModelTester(self) + common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"] + self.config_tester = ConfigTester( + self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = 
self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index a7c8c8ef8410..b43cbdf5f49f 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -745,11 +745,19 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = CLIPModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index c5edf7cb757b..80e97d96dde8 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -472,11 +472,19 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = CLIPSegModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties + ) def 
test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + def test_model_for_image_segmentation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs) diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 0cf89a745233..22baf799525e 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -416,7 +416,14 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = ClvpModelForConditionalGenerationTester(self) - self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.clvp_config_tester = ConfigTester( + self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32 + ) + + def test_config(self): + self.clvp_config_tester.run_common_tests() + self.clvp_config_tester.create_and_test_config_from_and_save_pretrained_composite() def tearDown(self): super().tearDown() diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index d8c8f385e9ce..5cac14d634ca 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -931,11 +931,19 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = self.class_for_tester(self) + common_properties = ["projection_dim", "logit_scale_init_value", "init_codebook"] + self.config_tester = ConfigTester( + self, config_class=FlavaConfig, has_text_modality=False, 
common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 33da9e26cba0..95eaa5e2ea12 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -427,6 +427,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index ce31bc44a611..2e411d4cd7f0 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -559,11 +559,19 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = GroupViTModelTester(self) + common_properties = ["projection_dim", "projection_intermediate_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=GroupViTConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="hidden_states are tested in individual model tests") def 
test_hidden_states_output(self): pass diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index a33be021353f..aba89119a9ed 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -465,6 +465,15 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene def setUp(self): self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=InstructBlipConfig, + has_text_modality=False, + common_properties=["num_query_tokens", "image_token_index"], + ) + + def test_config(self): + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 298c7a8d7ff4..59c091d009f2 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -486,11 +486,18 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( def setUp(self): self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["num_query_tokens", "video_token_index"] + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties + ) def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + def test_config(self): + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + @unittest.skip(reason="Hidden_states is 
tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 405fad1bd31c..7934ddb8bee7 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -190,7 +190,14 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM def setUp(self): self.model_tester = LlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 6589bf14d24c..40283ec07643 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -222,7 +222,14 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes def setUp(self): self.model_tester = LlavaNextVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def 
test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 05fc8a49e1e9..d0f6b14bf7b4 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -240,7 +240,14 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaNextVideoVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextVideoConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 0a33898b6307..862d09a68a1c 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -223,7 +223,14 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaOnevisionVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaOnevisionConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer"] + self.config_tester = ConfigTester( + self, 
config_class=LlavaOnevisionConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index f53fc21ba80c..389dd2ce5a1f 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -47,7 +47,6 @@ from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -187,10 +186,6 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes def setUp(self): self.model_tester = MusicgenMelodyDecoderTester(self) - self.config_tester = ConfigTester(self, config_class=MusicgenMelodyDecoderConfig, hidden_size=16) - - def test_config(self): - self.config_tester.run_common_tests() # special case for labels # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest._prepare_for_class diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 48070c7bb86c..abd68ecabbc0 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -447,6 +447,14 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Owlv2ModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + 
self.config_tester = ConfigTester( + self, config_class=Owlv2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index a08fae0bc6d1..f41d0efc3cec 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -442,6 +442,14 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = OwlViTModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=OwlViTConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 2fe06b1511a4..0a2b09103853 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -670,6 +670,10 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip def setUp(self): self.model_tester = SiglipModelTester(self) + self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model def 
test_model(self): diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 1bd01843981d..6dcc5b0f6cc8 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -210,7 +210,14 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe def setUp(self): self.model_tester = VideoLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VideoLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index 2c241c23f261..b6dcfa3e2042 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -172,7 +172,14 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest def setUp(self): self.model_tester = VipLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layers", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VipLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + 
self.config_tester.create_and_test_config_from_and_save_pretrained_composite() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 8b91019bae18..a2e2d4ad5c98 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -547,6 +547,14 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = XCLIPModelTester(self) + common_properties = ["projection_dim", "prompt_layers", "prompt_num_attention_heads"] + self.config_tester = ConfigTester( + self, config_class=XCLIPConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 81c6a008b133..abb3d6c51bbc 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -17,12 +17,17 @@ import json import os import tempfile +from pathlib import Path -from transformers import is_torch_available +from transformers import AutoConfig, is_torch_available +from transformers.utils import direct_transformers_import from .utils.test_configuration_utils import config_common_kwargs +transformers_module = direct_transformers_import(Path(__file__).parent) + + class ConfigTester: def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): self.parent = parent @@ -110,6 +115,34 @@ def create_and_test_config_from_and_save_pretrained_subfolder(self): self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + def 
create_and_test_config_from_and_save_pretrained_composite(self): + config = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + config.save_pretrained(tmpdirname) + general_config_loaded = self.config_class.from_pretrained(tmpdirname) + general_config_dict = config.to_dict() + + sub_configs = self.config_class.sub_configs + for sub_config_key in sub_configs: + class_name = getattr(self.config_class, f"{sub_config_key}_class") + if class_name != "AutoConfig": + sub_config_class = getattr(transformers_module, class_name) + sub_config_loaded = sub_config_class.from_pretrained(tmpdirname) + else: + sub_class = AutoConfig.for_model(**general_config_dict[sub_config_key]) + sub_config_loaded = sub_class.__class__.from_pretrained(tmpdirname) + + # Pop `transformers_version`, it never exists when a config is part of a general composite config + # Verify that loading with subconfig class results in same dict as if we loaded with general composite config class + sub_config_loaded_dict = sub_config_loaded.to_dict() + sub_config_loaded_dict.pop("transformers_version", None) + self.parent.assertEqual(sub_config_loaded_dict, general_config_dict[sub_config_key]) + + # Verify that the loaded config type is same as in the general config + type_from_general_config = type(getattr(general_config_loaded, sub_config_key)) + self.parent.assertTrue(isinstance(sub_config_loaded, type_from_general_config)) + def create_and_test_config_with_num_labels(self): config = self.config_class(**self.inputs_dict, num_labels=5) self.parent.assertEqual(len(config.id2label), 5) From bc9177a4c979a61242be956f446bf34435bf6a3a Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 25 Oct 2024 12:40:35 +0200 Subject: [PATCH 02/10] nit forgot these --- src/transformers/configuration_utils.py | 1 + src/transformers/models/bark/configuration_bark.py | 3 +++ .../configuration_fastspeech2_conformer.py | 2 ++ 3 files changed, 6 insertions(+) diff --git 
a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 1afdab4e6940..defd097bbcca 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -191,6 +191,7 @@ class PretrainedConfig(PushToHubMixin): model_type: str = "" base_config_key: str = "" + sub_configs: List[str] = [] is_composition: bool = False attribute_map: Dict[str, str] = {} _auto_class: Optional[str] = None diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index 3c8659588511..b51a5f959ffa 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -120,6 +120,7 @@ def __init__( ) class BarkSemanticConfig(BarkSubModelConfig): model_type = "semantic" + base_config_key = "semantic_config" @add_start_docstrings( @@ -142,6 +143,7 @@ class BarkSemanticConfig(BarkSubModelConfig): ) class BarkCoarseConfig(BarkSubModelConfig): model_type = "coarse_acoustics" + base_config_key = "coarse_acoustics_config" @add_start_docstrings( @@ -169,6 +171,7 @@ class BarkCoarseConfig(BarkSubModelConfig): ) class BarkFineConfig(BarkSubModelConfig): model_type = "fine_acoustics" + base_config_key = "fine_acoustics_config" def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs): self.n_codes_total = n_codes_total diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index 5d6b968de708..ded21b135040 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -164,6 +164,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig): ```""" model_type = "fastspeech2_conformer" + base_config_key = "model_config" attribute_map = 
{"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} def __init__( @@ -377,6 +378,7 @@ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig): ```""" model_type = "hifigan" + base_config_key = "vocoder_config" def __init__( self, From 532048e4229b0b5e62ecb65cd799ca06cf51e720 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 25 Oct 2024 13:02:32 +0200 Subject: [PATCH 03/10] fix copies --- .../models/idefics/configuration_idefics.py | 10 ++++------ .../models/idefics3/configuration_idefics3.py | 3 +-- .../instructblipvideo/modular_instructblipvideo.py | 4 ++++ .../models/kosmos2/configuration_kosmos2.py | 12 +++++++++--- .../models/mllama/configuration_mllama.py | 6 +++--- src/transformers/models/mpt/configuration_mpt.py | 8 ++++---- tests/models/siglip/test_modeling_siglip.py | 1 - 7 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 1aac36a497a3..06e7dedc1f50 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -38,7 +38,7 @@ class IdeficsVisionConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): + embed_dim (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`) image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. @@ -50,12 +50,12 @@ class IdeficsVisionConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. - image_num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): Number of image channels. 
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. @@ -64,8 +64,6 @@ class IdeficsVisionConfig(PretrainedConfig): initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization testing). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. """ model_type = "idefics_vision" @@ -119,7 +117,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Args: use_resampler (`bool`, *optional*, defaults to `False`): Whether or not to use the resampler - resampler_n_latents (`int`, *optional*, defaults to ): + resampler_n_latents (`int`, *optional*, defaults to 64): Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). resampler_depth (`int`, *optional*, defaults to 6): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3). diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py index c5336b90a3eb..39a4e4ad4ea2 100644 --- a/src/transformers/models/idefics3/configuration_idefics3.py +++ b/src/transformers/models/idefics3/configuration_idefics3.py @@ -54,8 +54,7 @@ class Idefics3VisionConfig(PretrainedConfig): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
- intializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation for initializing all weight matrices in the model. + initializer_range (``, *optional*, defaults to 0.02): Example: diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 2128f25df662..ead50a34ebcd 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -103,6 +103,10 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = ["text_config", "qformer_config", "vision_config"] + text_config_class = "AutoConfig" + qformer_config_class = "InstructBlipVideoQFormerConfig" + vision_config_class = "InstructBlipVideoVisionConfig" def __init__( self, diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index 40142b2431e5..aecb9100ae4d 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -58,7 +58,7 @@ class Kosmos2TextConfig(PretrainedConfig): layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -66,6 +66,12 @@ class Kosmos2TextConfig(PretrainedConfig): Scale embeddings by diving by sqrt(embed_dim). 
use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). + pad_token_id (`int`, *optional*, defaults to 1): + Token id used for padding. + bos_token_id (`int`, *optional*, defaults to 0): + Token id used for beginning of string. + eos_token_id (`int`, *optional*, defaults to 2): + Token id used for end of string. ```""" model_type = "kosmos_2_text_model" @@ -151,13 +157,13 @@ class Kosmos2VisionConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). ```""" diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index b7c87210e615..e29d1f2d309d 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -58,7 +58,7 @@ class MllamaVisionConfig(PretrainedConfig): The size (resolution) of each image *tile*. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. 
- norm_eps (`float`, *optional*, defaults to 1e-5): + norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. max_num_tiles (`int`, *optional*, defaults to 4): Maximum number of tiles for image splitting. @@ -161,12 +161,12 @@ class MllamaTextConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): + num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If not specified, will default to `num_attention_heads`. intermediate_size (`int`, *optional*, defaults to 14336): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - rope_theta (`float`, *optional*, defaults to 500000.0): + rope_theta (`float`, *optional*, defaults to `500000.0`): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index 2e0fd0f848a5..163c6f20c318 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -41,22 +41,22 @@ class MptAttentionConfig(PretrainedConfig): Args: attn_type (`str`, *optional*, defaults to `"multihead_attention"`): type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`. - attn_pdrop (`float`, *optional*, defaults to 0.0): + attn_pdrop (`float`, *optional*, defaults to `0.0`): The dropout probability for the attention layers. attn_impl (`str`, *optional*, defaults to `"torch"`): The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`. 
clip_qkv (`float`, *optional*): If not `None`, clip the queries, keys, and values in the attention layer to this value. - softmax_scale (`float`, *optional*, defaults to `None`): + softmax_scale (`float`, *optional*): If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to `1/sqrt(hidden_size)`. - prefix_lm (`bool`, *optional*, defaults to `False`)): + prefix_lm (`bool`, *optional*, defaults to `False`): Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another bi-directionally. Tokens outside the prefix use causal attention. qk_ln (`bool`, *optional*, defaults to `False`): Whether to apply layer normalization to the queries and keys in the attention layer. - attn_uses_sequence_id (`bool`, *optional*, defaults to `False`)): + attn_uses_sequence_id (`bool`, *optional*, defaults to `False`): Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train` mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored. 
diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 0a2b09103853..c4907c2227d5 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -667,7 +667,6 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test test_disk_offload_bin = False _is_composite = True - # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip def setUp(self): self.model_tester = SiglipModelTester(self) self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False) From c74e9a0a865be3ee596a5399c26c97a85505cffb Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 28 Oct 2024 13:17:59 +0100 Subject: [PATCH 04/10] move test to common --- tests/models/align/test_modeling_align.py | 1 - tests/models/altclip/test_modeling_altclip.py | 1 - tests/models/blip/test_modeling_blip.py | 1 - tests/models/blip_2/test_modeling_blip_2.py | 2 +- .../bridgetower/test_modeling_bridgetower.py | 1 - tests/models/chameleon/test_modeling_chameleon.py | 1 - tests/models/clap/test_modeling_clap.py | 1 - tests/models/clip/test_modeling_clip.py | 1 - tests/models/clipseg/test_modeling_clipseg.py | 1 - tests/models/clvp/test_modeling_clvp.py | 1 - tests/models/flava/test_modeling_flava.py | 1 - tests/models/git/test_modeling_git.py | 1 - tests/models/groupvit/test_modeling_groupvit.py | 1 - tests/models/idefics2/test_modeling_idefics2.py | 7 ++++++- tests/models/idefics3/test_modeling_idefics3.py | 7 ++++++- .../instructblip/test_modeling_instructblip.py | 2 +- .../test_modeling_instructblipvideo.py | 2 +- tests/models/kosmos2/test_modeling_kosmos2.py | 7 ++++++- tests/models/llava/test_modeling_llava.py | 1 - .../models/llava_next/test_modeling_llava_next.py | 1 - .../test_modeling_llava_next_video.py | 1 - .../test_modeling_llava_onevision.py | 1 - tests/models/mllama/test_modeling_mllama.py | 7 ++++++- 
.../test_modeling_musicgen_melody.py | 5 +++++ tests/models/owlv2/test_modeling_owlv2.py | 1 - tests/models/owlvit/test_modeling_owlvit.py | 1 - tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 3 +++ tests/models/siglip/test_modeling_siglip.py | 2 +- .../video_llava/test_modeling_video_llava.py | 1 - tests/models/vipllava/test_modeling_vipllava.py | 1 - tests/models/x_clip/test_modeling_x_clip.py | 1 - tests/test_configuration_common.py | 15 +++++++++++---- 32 files changed, 47 insertions(+), 33 deletions(-) diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index b0181aec8b63..3c7e679686f6 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -470,7 +470,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip(reason="Start to fail after using torch `cu118`.") def test_multi_gpu_data_parallel_forward(self): diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index c492f991ad2a..658e2e38d9ad 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -465,7 +465,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index d0839132e1a8..bf2412ff8734 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -459,7 +459,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - 
self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 3d915e61e0be..ae89dae7e0f6 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -467,7 +467,7 @@ def setUp(self): ) def test_config(self): - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index db2ec1a3aeb3..cceeee4912dc 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -340,7 +340,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index 9eaac111d63d..aad26ef147e8 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -293,7 +293,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 84724e580353..60b77d0efa4b 100644 --- a/tests/models/clap/test_modeling_clap.py +++ 
b/tests/models/clap/test_modeling_clap.py @@ -526,7 +526,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index b43cbdf5f49f..fa5de84e0620 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -756,7 +756,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 80e97d96dde8..70951d63eee5 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -483,7 +483,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model_for_image_segmentation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 22baf799525e..c8942f08b630 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -423,7 +423,6 @@ def setUp(self): def test_config(self): self.clvp_config_tester.run_common_tests() - self.clvp_config_tester.create_and_test_config_from_and_save_pretrained_composite() def tearDown(self): super().tearDown() diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 5cac14d634ca..1c35fd705ccd 
100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -942,7 +942,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip(reason="tested in individual model tests") def test_hidden_states_output(self): diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 95eaa5e2ea12..33da9e26cba0 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -427,7 +427,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 2e411d4cd7f0..88b55ec56d82 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -570,7 +570,6 @@ def test_model(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip(reason="hidden_states are tested in individual model tests") def test_hidden_states_output(self): diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 854b8b934578..2a9ebf997d30 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -186,7 +186,12 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Idefics2VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, 
config_class=Idefics2Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index f0366e7b539a..21e56eb742e3 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -170,7 +170,12 @@ class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Idefics3VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index aba89119a9ed..3f98debc8a1e 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -473,7 +473,7 @@ def setUp(self): ) def test_config(self): - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 59c091d009f2..77152b233f90 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ 
b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -496,7 +496,7 @@ def test_for_conditional_generation(self): self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) def test_config(self): - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + self.config_tester.run_common_tests() @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index de6c0b15d661..728703980c97 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -296,7 +296,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = Kosmos2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=Kosmos2Config, has_text_modality=False, common_properties=["latent_query_num"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite from common to skip `image_to_text_projection.latent_query` def test_initialization(self): diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 7934ddb8bee7..d022ce2c7683 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -197,7 +197,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 40283ec07643..a5bad0c59dee 100644 --- 
a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -229,7 +229,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index d0f6b14bf7b4..1271a0cef327 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -247,7 +247,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 862d09a68a1c..7f96daf3c054 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -230,7 +230,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index fafa2f71331b..c2f269dbb003 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -278,7 +278,12 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester def setUp(self): self.model_tester = 
MllamaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=MllamaConfig, has_text_modality=False, common_properties=["image_token_index"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 389dd2ce5a1f..f53fc21ba80c 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -47,6 +47,7 @@ from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -186,6 +187,10 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes def setUp(self): self.model_tester = MusicgenMelodyDecoderTester(self) + self.config_tester = ConfigTester(self, config_class=MusicgenMelodyDecoderConfig, hidden_size=16) + + def test_config(self): + self.config_tester.run_common_tests() # special case for labels # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest._prepare_for_class diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index abd68ecabbc0..df763aed48c7 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -454,7 +454,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - 
self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index f41d0efc3cec..e0599a50fb98 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -449,7 +449,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 956243dccebe..78ccc4e85375 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -231,6 +231,9 @@ def setUp(self): self.model_tester = Qwen2VLVisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False) + def test_config(self): + self.config_tester.run_common_tests() + def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index c4907c2227d5..61ac78f10299 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -672,7 +672,7 @@ def setUp(self): self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False) def test_config(self): - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() + self.config_tester.run_common_tests() # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model def test_model(self): diff --git a/tests/models/video_llava/test_modeling_video_llava.py 
b/tests/models/video_llava/test_modeling_video_llava.py index 6dcc5b0f6cc8..55bb5276379b 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -217,7 +217,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index b6dcfa3e2042..a823057f4af4 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -179,7 +179,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index a2e2d4ad5c98..04dd2d9d2968 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -554,7 +554,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - self.config_tester.create_and_test_config_from_and_save_pretrained_composite() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index abb3d6c51bbc..5c57ae92f6b5 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -40,9 +40,10 @@ def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) common_properties = ( ["hidden_size", "num_attention_heads", 
"num_hidden_layers"] - if self.common_properties is None + if self.common_properties is None and not self.config_class.sub_configs else self.common_properties ) + common_properties = [] if common_properties is None else common_properties # Add common fields for text models if self.has_text_modality: @@ -126,12 +127,14 @@ def create_and_test_config_from_and_save_pretrained_composite(self): sub_configs = self.config_class.sub_configs for sub_config_key in sub_configs: class_name = getattr(self.config_class, f"{sub_config_key}_class") - if class_name != "AutoConfig": + if class_name == "AutoConfig": + sub_class = AutoConfig.for_model(**general_config_dict[sub_config_key]) + sub_config_loaded = sub_class.__class__.from_pretrained(tmpdirname) + elif hasattr(transformers_module, class_name): sub_config_class = getattr(transformers_module, class_name) sub_config_loaded = sub_config_class.from_pretrained(tmpdirname) else: - sub_class = AutoConfig.for_model(**general_config_dict[sub_config_key]) - sub_config_loaded = sub_class.__class__.from_pretrained(tmpdirname) + continue # Pop `transformers_version`, it never exists when a config is part of a general composite config # Verify that loading with subconfig class results in same dict as if we loaded with general composite config class @@ -161,6 +164,9 @@ def check_config_can_be_init_without_params(self): self.parent.assertIsNotNone(config) def check_config_arguments_init(self): + if self.config_class.sub_configs: + return # TODO: @raushan composite models are not consistent in how they set general params + kwargs = copy.deepcopy(config_common_kwargs) config = self.config_class(**kwargs) wrong_values = [] @@ -186,6 +192,7 @@ def run_common_tests(self): self.create_and_test_config_to_json_file() self.create_and_test_config_from_and_save_pretrained() self.create_and_test_config_from_and_save_pretrained_subfolder() + self.create_and_test_config_from_and_save_pretrained_composite() self.create_and_test_config_with_num_labels() 
self.check_config_can_be_init_without_params() self.check_config_arguments_init() From 3d64e743e32c3e891ab4fe903c07c9750bd2f20a Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 28 Oct 2024 14:19:58 +0100 Subject: [PATCH 05/10] use dict for sub-configs --- src/transformers/configuration_utils.py | 2 +- .../models/align/configuration_align.py | 4 +--- .../models/altclip/configuration_altclip.py | 4 +--- src/transformers/models/bark/configuration_bark.py | 13 +++++++------ src/transformers/models/blip/configuration_blip.py | 4 +--- .../models/blip_2/configuration_blip_2.py | 7 ++----- .../bridgetower/configuration_bridgetower.py | 4 +--- .../models/chameleon/configuration_chameleon.py | 3 +-- .../chinese_clip/configuration_chinese_clip.py | 1 + src/transformers/models/clap/configuration_clap.py | 4 +--- src/transformers/models/clip/configuration_clip.py | 4 +--- .../models/clipseg/configuration_clipseg.py | 4 +--- src/transformers/models/clvp/configuration_clvp.py | 9 +++++---- src/transformers/models/dbrx/configuration_dbrx.py | 8 +++----- .../configuration_encoder_decoder.py | 7 ++----- .../configuration_fastspeech2_conformer.py | 4 +--- .../models/flava/configuration_flava.py | 11 ++++++----- src/transformers/models/git/configuration_git.py | 3 +-- .../models/groupvit/configuration_groupvit.py | 4 +--- .../models/idefics/configuration_idefics.py | 5 +---- .../models/idefics2/configuration_idefics2.py | 11 ++++++----- .../models/idefics3/configuration_idefics3.py | 6 ++---- .../instructblip/configuration_instructblip.py | 11 ++++++----- .../configuration_instructblipvideo.py | 11 ++++++----- .../instructblipvideo/modular_instructblipvideo.py | 11 ++++++----- .../models/kosmos2/configuration_kosmos2.py | 4 +--- .../models/llava/configuration_llava.py | 6 ++---- .../models/llava_next/configuration_llava_next.py | 7 ++----- .../configuration_llava_next_video.py | 6 ++---- .../llava_next_video/modular_llava_next_video.py | 6 ++---- .../configuration_llava_onevision.py 
| 7 ++----- .../models/mllama/configuration_mllama.py | 4 +--- .../models/moshi/configuration_moshi.py | 3 +-- src/transformers/models/mpt/configuration_mpt.py | 3 +-- .../models/musicgen/configuration_musicgen.py | 9 +++++---- .../configuration_musicgen_melody.py | 9 +++++---- .../models/owlv2/configuration_owlv2.py | 4 +--- .../models/owlvit/configuration_owlvit.py | 4 +--- .../models/qwen2_vl/configuration_qwen2_vl.py | 3 +-- .../models/siglip/configuration_siglip.py | 4 +--- .../configuration_speech_encoder_decoder.py | 4 +--- .../video_llava/configuration_video_llava.py | 7 ++----- .../models/vipllava/configuration_vipllava.py | 7 ++----- .../configuration_vision_encoder_decoder.py | 4 +--- .../configuration_vision_text_dual_encoder.py | 4 +--- .../models/x_clip/configuration_x_clip.py | 4 +--- tests/test_configuration_common.py | 14 +++++--------- 47 files changed, 106 insertions(+), 172 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index defd097bbcca..60f9f34cf861 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -191,7 +191,7 @@ class PretrainedConfig(PushToHubMixin): model_type: str = "" base_config_key: str = "" - sub_configs: List[str] = [] + sub_configs: Dict[str, "PretrainedConfig"] = {} is_composition: bool = False attribute_map: Dict[str, str] = {} _auto_class: Optional[str] = None diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index e3922be387c8..a22ab1dc40f8 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -305,9 +305,7 @@ class AlignConfig(PretrainedConfig): ```""" model_type = "align" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AlignTextConfig" - vision_config_class = "AlignVisionConfig" + sub_configs = {"text_config": AlignTextConfig, "vision_config": 
AlignVisionConfig} def __init__( self, diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 924d9bda5b09..3c8e91bd4735 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -278,9 +278,7 @@ class AltCLIPConfig(PretrainedConfig): ```""" model_type = "altclip" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AltCLIPTextConfig" - vision_config_class = "AltCLIPVisionConfig" + sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index b51a5f959ffa..a498d1dd1937 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -18,7 +18,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import add_start_docstrings, logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -234,11 +234,12 @@ class BarkConfig(PretrainedConfig): """ model_type = "bark" - sub_configs = ["semantic_config", "coarse_acoustics_config", "fine_acoustics_config", "codec_config"] - semantic_config_class = "BarkSemanticConfig" - coarse_acoustics_config_class = "BarkCoarseConfig" - fine_acoustics_config_class = "BarkFineConfig" - codec_config_class = "AutoConfig" + sub_configs = { + "semantic_config": BarkSemanticConfig, + "coarse_acoustics_config": BarkCoarseConfig, + "fine_acoustics_config": BarkFineConfig, + "codec_config": AutoConfig, + } def __init__( self, diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 
2d97b4b19314..18db71eb1489 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -279,9 +279,7 @@ class BlipConfig(PretrainedConfig): ```""" model_type = "blip" - sub_configs = ["text_config", "vision_config"] - text_config_class = "BlipTextConfig" - vision_config_class = "BlipVisionConfig" + sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} def __init__( self, diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 20acce294852..d690d22338a6 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -19,7 +19,7 @@ from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -271,10 +271,7 @@ class Blip2Config(PretrainedConfig): ```""" model_type = "blip-2" - sub_configs = ["text_config", "qformer_config", "vision_config"] - text_config_class = "AutoConfig" - qformer_config_class = "Blip2QFormerConfig" - vision_config_class = "Blip2VisionConfig" + sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig} def __init__( self, diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index b60d2cdc5559..de49283493b6 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -257,9 +257,7 @@ class BridgeTowerConfig(PretrainedConfig): ```""" model_type = "bridgetower" - sub_configs = ["text_config", "vision_config"] - text_config_class = "BridgeTowerTextConfig" - 
vision_config_class = "BridgeTowerVisionConfig" + sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig} def __init__( self, diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index cfc2a10af69f..9842127e7bb4 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -188,8 +188,7 @@ class ChameleonConfig(PretrainedConfig): ```""" model_type = "chameleon" - sub_configs = ["vq_config"] - vq_config_class = "ChameleonVQVAEConfig" + sub_configs = {"vq_config": ChameleonVQVAEConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 11f1560f3f13..d50d6c842b31 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -281,6 +281,7 @@ class ChineseCLIPConfig(PretrainedConfig): ```""" model_type = "chinese_clip" + sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index aed814258f4e..b2added7f0e0 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -340,9 +340,7 @@ class ClapConfig(PretrainedConfig): ```""" model_type = "clap" - sub_configs = ["text_config", "audio_config"] - text_config_class = "ClapTextConfig" - audio_config_class = "ClapAudioConfig" + sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig} def __init__( self, diff --git 
a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index f19f2df136fa..2e1f2deede00 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -270,9 +270,7 @@ class CLIPConfig(PretrainedConfig): ```""" model_type = "clip" - sub_configs = ["text_config", "vision_config"] - text_config_class = "CLIPTextConfig" - vision_config_class = "CLIPVisionConfig" + sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 9a9fec755d95..5474840f357a 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -269,9 +269,7 @@ class CLIPSegConfig(PretrainedConfig): ```""" model_type = "clipseg" - sub_configs = ["text_config", "vision_config"] - text_config_class = "CLIPSegTextConfig" - vision_config_class = "CLIPSegVisionConfig" + sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig} def __init__( self, diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index a46de9337c55..8fd0e150801a 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -370,10 +370,11 @@ class ClvpConfig(PretrainedConfig): ```""" model_type = "clvp" - sub_configs = ["text_config", "speech_config", "decoder_config"] - text_config_class = "ClvpEncoderConfig" - speech_config_class = "ClvpEncoderConfig" - decoder_config_class = "ClvpDecoderConfig" + sub_configs = { + "text_config": ClvpEncoderConfig, + "speech_config": ClvpEncoderConfig, + "decoder_config": ClvpDecoderConfig, + } def 
__init__( self, diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 0d7836b98e35..99c7adfc6f3a 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -57,7 +57,7 @@ def __init__( self.kv_n_heads = kv_n_heads self.rope_theta = rope_theta - for k in ["model_type"]: + for k in ["model_type", "attn_implementation"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: @@ -109,7 +109,7 @@ def __init__( self.moe_loss_weight = moe_loss_weight self.moe_normalize_expert_weights = moe_normalize_expert_weights - for k in ["model_type"]: + for k in ["model_type", "attn_implementation"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: @@ -172,9 +172,7 @@ class DbrxConfig(PretrainedConfig): """ model_type = "dbrx" - sub_configs = ["attn_config", "ffn_config"] - attn_config_class = "DbrxAttentionConfig" - ffn_config_class = "DbrxFFNConfig" + sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index 6cacc9cd736e..767da5a0b827 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -17,6 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..auto import AutoConfig logger = logging.get_logger(__name__) @@ -70,9 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig): ```""" model_type = "encoder-decoder" - sub_configs = ["encoder_config", "decoder_config"] - encoder_config_class = "AutoConfig" - decoder_config_class = "AutoConfig" + sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} is_composition = True def 
__init__(self, **kwargs): @@ -87,8 +86,6 @@ def __init__(self, **kwargs): decoder_config = kwargs.pop("decoder") decoder_model_type = decoder_config.pop("model_type") - from ..auto.configuration_auto import AutoConfig - self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) self.is_encoder_decoder = True diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index ded21b135040..59a1b0297516 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -455,9 +455,7 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): """ model_type = "fastspeech2_conformer_with_hifigan" - sub_configs = ["model_config", "vocoder_config"] - model_config_class = "FastSpeech2ConformerConfig" - vocoder_config_class = "FastSpeech2ConformerHifiGanConfig" + sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig} def __init__( self, diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index 0dc37fbeaa17..47cdb488a2eb 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -463,11 +463,12 @@ class FlavaConfig(PretrainedConfig): """ model_type = "flava" - sub_configs = ["text_config", "image_config", "multimodal_config", "image_codebook_config"] - text_config_class = "FlavaTextConfig" - image_config_class = "FlavaImageConfig" - multimodal_config_class = "FlavaMultimodalConfig" - image_codebook_config_class = "FlavaImageCodebookConfig" + sub_configs = { + "text_config": FlavaTextConfig, + "image_config": FlavaImageConfig, + 
"multimodal_config": FlavaMultimodalConfig, + "image_codebook_config": FlavaImageCodebookConfig, + } def __init__( self, diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index 5cd7f33315e7..1be3e7067bdf 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -167,8 +167,7 @@ class GitConfig(PretrainedConfig): ```""" model_type = "git" - sub_configs = ["vision_config"] - vision_config_class = "GitVisionConfig" + sub_configs = {"vision_config": GitVisionConfig} def __init__( self, diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index 0497712b83c6..e85e4fc91843 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -257,9 +257,7 @@ class GroupViTConfig(PretrainedConfig): """ model_type = "groupvit" - sub_configs = ["text_config", "vision_config"] - text_config_class = "GroupViTTextConfig" - vision_config_class = "GroupViTVisionConfig" + sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig} def __init__( self, diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 06e7dedc1f50..e34a57644001 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -233,10 +233,7 @@ class IdeficsConfig(PretrainedConfig): ```""" model_type = "idefics" - sub_configs = ["perceiver_config", "vision_config"] - perceiver_config_class = "IdeficsPerceiverConfig" - vision_config_class = "IdeficsVisionConfig" - is_composition = False + sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig} def __init__( self, diff --git 
a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py index 44c093b405e4..408d374c77f7 100644 --- a/src/transformers/models/idefics2/configuration_idefics2.py +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -200,10 +200,11 @@ class Idefics2Config(PretrainedConfig): ```""" model_type = "idefics2" - sub_configs = ["text_config", "perceiver_config", "vision_config"] - text_config_class = "AutoConfig" - perceiver_config_class = "Idefics2PerceiverConfig" - vision_config_class = "Idefics2VisionConfig" + sub_configs = { + "text_config": AutoConfig, + "perceiver_config": Idefics2PerceiverConfig, + "vision_config": Idefics2VisionConfig, + } def __init__( self, diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py index 39a4e4ad4ea2..4b10d8d2d03a 100644 --- a/src/transformers/models/idefics3/configuration_idefics3.py +++ b/src/transformers/models/idefics3/configuration_idefics3.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -144,9 +144,7 @@ class Idefics3Config(PretrainedConfig): ```""" model_type = "idefics3" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "Idefics3VisionConfig" + sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig} def __init__( self, diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index 212786d74b7c..6124dba3a08e 
100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -17,7 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -268,10 +268,11 @@ class InstructBlipConfig(PretrainedConfig): ```""" model_type = "instructblip" - sub_configs = ["text_config", "qformer_config", "vision_config"] - text_config_class = "AutoConfig" - qformer_config_class = "InstructBlipQFormerConfig" - vision_config_class = "InstructBlipVisionConfig" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipQFormerConfig, + "vision_config": InstructBlipVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index edfa5cc58bd4..14687a96e54f 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -23,7 +23,7 @@ from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -274,10 +274,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" - sub_configs = ["text_config", "qformer_config", "vision_config"] - text_config_class = "AutoConfig" - qformer_config_class = "InstructBlipVideoQFormerConfig" - vision_config_class = "InstructBlipVideoVisionConfig" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": 
InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index ead50a34ebcd..4482a1fe5354 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -32,7 +32,7 @@ from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -103,10 +103,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" - sub_configs = ["text_config", "qformer_config", "vision_config"] - text_config_class = "AutoConfig" - qformer_config_class = "InstructBlipVideoQFormerConfig" - vision_config_class = "InstructBlipVideoVisionConfig" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index aecb9100ae4d..921ec336c0be 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -236,9 +236,7 @@ class Kosmos2Config(PretrainedConfig): ```""" model_type = "kosmos-2" - sub_configs = ["text_config", "vision_config"] - text_config_class = "Kosmos2TextConfig" - vision_config_class = "Kosmos2VisionConfig" + sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig} def __init__( self, diff --git a/src/transformers/models/llava/configuration_llava.py 
b/src/transformers/models/llava/configuration_llava.py index 1aec99bd2864..05034f5cfcf6 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,9 +73,7 @@ class LlavaConfig(PretrainedConfig): ```""" model_type = "llava" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index 0bab95d646ab..54616edbf96d 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,10 +78,7 @@ class LlavaNextConfig(PretrainedConfig): ```""" model_type = "llava_next" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 0341d2f009a7..2fe889da6033 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -21,7 +21,7 @@ from 
...configuration_utils import PretrainedConfig -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig class LlavaNextVideoConfig(PretrainedConfig): @@ -86,9 +86,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 5c117ba6a4b7..88ec3890b5c1 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -32,7 +32,7 @@ logging, replace_return_docstrings, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -100,9 +100,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index bcb15c4fa484..46b65b35b1a5 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -18,7 +18,7 @@ from ...utils import ( logging, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -81,10 +81,7 @@ class LlavaOnevisionConfig(PretrainedConfig): ```""" model_type = "llava_onevision" - sub_configs = ["text_config", 
"vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index e29d1f2d309d..635ca503205f 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -337,9 +337,7 @@ class MllamaConfig(PretrainedConfig): ```""" model_type = "mllama" - sub_configs = ["text_config", "vision_config"] - text_config_class = "MllamaTextConfig" - vision_config_class = "MllamaVisionConfig" + sub_configs = {"text_config": MllamaTextConfig, "vision_config": MllamaVisionConfig} def __init__( self, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index a0a3848fede5..1b31141f020d 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -236,8 +236,7 @@ class MoshiConfig(PretrainedConfig): model_type = "moshi" keys_to_ignore_at_inference = ["past_key_values"] - sub_configs = ["audio_encoder_config"] - audio_encoder_config_class = "AutoConfig" + sub_configs = {"audio_encoder_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index 163c6f20c318..8ee3f8c0c074 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -173,8 +173,7 @@ class MptConfig(PretrainedConfig): """ model_type = "mpt" - sub_configs = ["attn_config"] - attn_config_class = "MptAttentionConfig" + sub_configs = {"attn_config": MptAttentionConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git 
a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index b7a8eb9de72c..345e7aef8eb4 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -190,10 +190,11 @@ class MusicgenConfig(PretrainedConfig): ```""" model_type = "musicgen" - sub_configs = ["text_encoder_config", "audio_encoder_config", "decoder_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" - decoder_config_class = "MusicgenDecoderConfig" + sub_configs = { + "text_encoder_config": AutoConfig, + "audio_encoder_config": AutoConfig, + "decoder_config": MusicgenDecoderConfig, + } is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 94ceddce00ea..8c36f2bbee39 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -196,10 +196,11 @@ class MusicgenMelodyConfig(PretrainedConfig): ```""" model_type = "musicgen_melody" - sub_configs = ["text_encoder_config", "audio_encoder_config", "decoder_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" - decoder_config_class = "MusicgenMelodyDecoderConfig" + sub_configs = { + "text_encoder_config": AutoConfig, + "audio_encoder_config": AutoConfig, + "decoder_config": MusicgenMelodyDecoderConfig, + } is_composition = True def __init__( diff --git a/src/transformers/models/owlv2/configuration_owlv2.py b/src/transformers/models/owlv2/configuration_owlv2.py index 82d8c7e3f0f1..f9085eaf9c15 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -241,9 +241,7 @@ class Owlv2Config(PretrainedConfig): """ model_type = "owlv2" - 
sub_configs = ["text_config", "vision_config"] - text_config_class = "Owlv2TextConfig" - vision_config_class = "Owlv2VisionConfig" + sub_configs = {"text_config": Owlv2TextConfig, "vision_config": Owlv2VisionConfig} def __init__( self, diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index fec49c732453..8be707ce99a1 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -241,9 +241,7 @@ class OwlViTConfig(PretrainedConfig): """ model_type = "owlvit" - sub_configs = ["text_config", "vision_config"] - text_config_class = "OwlViTTextConfig" - vision_config_class = "OwlViTVisionConfig" + sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 3f466c542dd3..55042327de4e 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -161,8 +161,7 @@ class Qwen2VLConfig(PretrainedConfig): ```""" model_type = "qwen2_vl" - sub_configs = ["vision_config"] - vision_config_class = "Qwen2VLVisionConfig" + sub_configs = {"vision_config": Qwen2VLVisionConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index b63988c05483..cc8fae93cdb2 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -231,9 +231,7 @@ class SiglipConfig(PretrainedConfig): ```""" model_type = "siglip" - sub_configs = ["text_config", "vision_config"] - text_config_class = "SiglipTextConfig" - vision_config_class = "SiglipVisionConfig" + sub_configs = {"text_config": SiglipTextConfig, 
"vision_config": SiglipVisionConfig} def __init__(self, text_config=None, vision_config=None, **kwargs): super().__init__(**kwargs) diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py index c7820708e397..89ea3582f2ca 100644 --- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py @@ -71,9 +71,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "speech-encoder-decoder" - sub_configs = ["encoder_config", "decoder_config"] - encoder_config_class = "AutoConfig" - decoder_config_class = "AutoConfig" + sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index 54376162af29..87d96ca24ffd 100644 --- a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,10 +78,7 @@ class VideoLlavaConfig(PretrainedConfig): ```""" model_type = "video_llava" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index 2cbefb70a47b..f26c2b2f50fb 100644 --- 
a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -72,10 +72,7 @@ class VipLlavaConfig(PretrainedConfig): ```""" model_type = "vipllava" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index 21c17652254a..139cd5bdbccc 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -78,9 +78,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "vision-encoder-decoder" - sub_configs = ["encoder_config", "decoder_config"] - encoder_config_class = "AutoConfig" - decoder_config_class = "AutoConfig" + sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index 3973e090f9af..8e949c95784d 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -75,9 +75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig): ```""" model_type = 
"vision-text-dual-encoder" - sub_configs = ["text_config", "vision_config"] - text_config_class = "AutoConfig" - vision_config_class = "AutoConfig" + sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} is_composition = True def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 80df301cf1b5..3d3b92d2c8c0 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -258,9 +258,7 @@ class XCLIPConfig(PretrainedConfig): """ model_type = "xclip" - sub_configs = ["text_config", "vision_config"] - text_config_class = "XCLIPTextConfig" - vision_config_class = "XCLIPVisionConfig" + sub_configs = {"text_config": XCLIPTextConfig, "vision_config": XCLIPVisionConfig} def __init__( self, diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 5c57ae92f6b5..ac1376a31a10 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -19,7 +19,7 @@ import tempfile from pathlib import Path -from transformers import AutoConfig, is_torch_available +from transformers import is_torch_available from transformers.utils import direct_transformers_import from .utils.test_configuration_utils import config_common_kwargs @@ -125,16 +125,12 @@ def create_and_test_config_from_and_save_pretrained_composite(self): general_config_dict = config.to_dict() sub_configs = self.config_class.sub_configs - for sub_config_key in sub_configs: - class_name = getattr(self.config_class, f"{sub_config_key}_class") - if class_name == "AutoConfig": - sub_class = AutoConfig.for_model(**general_config_dict[sub_config_key]) + for sub_config_key, sub_class in sub_configs.items(): + if sub_class.__name__ == "AutoConfig": + sub_class = sub_class.for_model(**general_config_dict[sub_config_key]) 
sub_config_loaded = sub_class.__class__.from_pretrained(tmpdirname) - elif hasattr(transformers_module, class_name): - sub_config_class = getattr(transformers_module, class_name) - sub_config_loaded = sub_config_class.from_pretrained(tmpdirname) else: - continue + sub_config_loaded = sub_class.from_pretrained(tmpdirname) # Pop `transformers_version`, it never exists when a config is part of a general composite config # Verify that loading with subconfig class results in same dict as if we loaded with general composite config class From 6f2a28d1d8175b53aad06ed78961d7177da0635c Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 28 Oct 2024 14:42:29 +0100 Subject: [PATCH 06/10] add load-save-load test --- .../models/dbrx/configuration_dbrx.py | 4 ++-- tests/test_configuration_common.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 99c7adfc6f3a..302b5e6a5582 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -57,7 +57,7 @@ def __init__( self.kv_n_heads = kv_n_heads self.rope_theta = rope_theta - for k in ["model_type", "attn_implementation"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: @@ -109,7 +109,7 @@ def __init__( self.moe_loss_weight = moe_loss_weight self.moe_normalize_expert_weights = moe_normalize_expert_weights - for k in ["model_type", "attn_implementation"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index ac1376a31a10..4dbbdedbbc2e 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -117,6 +117,11 @@ def
create_and_test_config_from_and_save_pretrained_subfolder(self): self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) def create_and_test_config_from_and_save_pretrained_composite(self): + """ + Tests that composite or nested configs can be loaded and saved correctly. In case the config + has a sub-config, we should be able to call `sub_config.from_pretrained('general_config_file')` + and get a result same as if we loaded the whole config and obtained `config.sub_config` from it. + """ config = self.config_class(**self.inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: @@ -124,11 +129,12 @@ def create_and_test_config_from_and_save_pretrained_composite(self): general_config_loaded = self.config_class.from_pretrained(tmpdirname) general_config_dict = config.to_dict() + # Iterate over all sub_configs if there are any and load them with their own classes sub_configs = self.config_class.sub_configs for sub_config_key, sub_class in sub_configs.items(): if sub_class.__name__ == "AutoConfig": - sub_class = sub_class.for_model(**general_config_dict[sub_config_key]) - sub_config_loaded = sub_class.__class__.from_pretrained(tmpdirname) + sub_class = sub_class.for_model(**general_config_dict[sub_config_key]).__class__ + sub_config_loaded = sub_class.from_pretrained(tmpdirname) else: sub_config_loaded = sub_class.from_pretrained(tmpdirname) @@ -142,6 +148,12 @@ def create_and_test_config_from_and_save_pretrained_composite(self): type_from_general_config = type(getattr(general_config_loaded, sub_config_key)) self.parent.assertTrue(isinstance(sub_config_loaded, type_from_general_config)) + # Now save only the sub-config and load it back to make sure the whole load-save-load pipeline works + with tempfile.TemporaryDirectory() as tmpdirname2: + sub_config_loaded.save_pretrained(tmpdirname2) + sub_config_loaded_2 = sub_class.from_pretrained(tmpdirname2) + self.parent.assertEqual(sub_config_loaded.to_dict(), sub_config_loaded_2.to_dict()) + def
create_and_test_config_with_num_labels(self): config = self.config_class(**self.inputs_dict, num_labels=5) self.parent.assertEqual(len(config.id2label), 5) From ad4fcb788b60ec30b7c16eca4c18aebdb6e7c8cb Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 31 Oct 2024 12:31:46 +0100 Subject: [PATCH 07/10] clean up modeling check --- src/transformers/modeling_utils.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a6fbd7b1a914..e9a07fe97588 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1596,15 +1596,14 @@ def _autoset_attn_implementation( # Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict. # Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)` # If any of sub-modules doesn't support requested attn, an error will be raised. 
See https://github.com/huggingface/transformers/pull/32238 - for key in config: - if isinstance(getattr(config, key), PretrainedConfig): - sub_config = getattr(config, key) - curr_attn_implementation = ( - requested_attn_implementation - if not isinstance(requested_attn_implementation, dict) - else requested_attn_implementation.get(key, None) - ) - sub_config._attn_implementation_internal = curr_attn_implementation + for key in config.sub_configs.keys(): + sub_config = getattr(config, key) + curr_attn_implementation = ( + requested_attn_implementation + if not isinstance(requested_attn_implementation, dict) + else requested_attn_implementation.get(key, None) + ) + sub_config._attn_implementation_internal = curr_attn_implementation if use_flash_attention_2: logger.warning_once( From 2d93388e60c8694b9b22a735d80aaf8284fadb3e Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 31 Oct 2024 12:38:41 +0100 Subject: [PATCH 08/10] oops these are the correct keys --- .../models/encoder_decoder/configuration_encoder_decoder.py | 2 +- .../configuration_speech_encoder_decoder.py | 2 +- .../configuration_vision_encoder_decoder.py | 2 +- .../configuration_vision_text_dual_encoder.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index 767da5a0b827..5190ed51ffd3 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -71,7 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig): ```""" model_type = "encoder-decoder" - sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py index 89ea3582f2ca..d7e0211610b6 100644 --- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py @@ -71,7 +71,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "speech-encoder-decoder" - sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index 139cd5bdbccc..59678f2573ff 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -78,7 +78,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "vision-encoder-decoder" - sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index 8e949c95784d..98a9c9bb19ae 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -75,7 +75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig): ```""" model_type = "vision-text-dual-encoder" - sub_configs = {"encoder_config": AutoConfig, "decoder_config": AutoConfig} + sub_configs = {"encoder": 
AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): From 6398f792d02a3238defce5d93cec247d71d6cbf1 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 4 Nov 2024 17:00:06 +0100 Subject: [PATCH 09/10] fix some tests, missed some composite configs --- .../models/musicgen/configuration_musicgen.py | 6 +++--- .../configuration_musicgen_melody.py | 6 +++--- .../models/paligemma/configuration_paligemma.py | 4 ++-- .../qwen2_audio/configuration_qwen2_audio.py | 4 ++-- tests/test_modeling_common.py | 14 +++++--------- 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 345e7aef8eb4..00c030721980 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -191,9 +191,9 @@ class MusicgenConfig(PretrainedConfig): model_type = "musicgen" sub_configs = { - "text_encoder_config": AutoConfig, - "audio_encoder_config": AutoConfig, - "decoder_config": MusicgenDecoderConfig, + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenDecoderConfig, } is_composition = True diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 8c36f2bbee39..e65ad50021c3 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -197,9 +197,9 @@ class MusicgenMelodyConfig(PretrainedConfig): model_type = "musicgen_melody" sub_configs = { - "text_encoder_config": AutoConfig, - "audio_encoder_config": AutoConfig, - "decoder_config": MusicgenMelodyDecoderConfig, + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenMelodyDecoderConfig, } 
is_composition = True diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index 64598436dbbf..de60c501292b 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -17,7 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,7 +73,7 @@ class PaliGemmaConfig(PretrainedConfig): ```""" model_type = "paligemma" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py index deb276f33472..925aa60a8dc6 100644 --- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -157,7 +157,7 @@ class Qwen2AudioConfig(PretrainedConfig): ```""" model_type = "qwen2_audio" - is_composition = False + sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig} def __init__( self, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 13c4d5155be4..c7a11ff0ac8a 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3802,22 +3802,18 @@ def test_attn_implementation_composite_models(self): self.skipTest("Model is not a composite model.") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - sub_configs = { - key: getattr(config, key) for key in config if 
isinstance(getattr(config, key), PretrainedConfig) - } # set eager as it will be the one supported in all models # we just need to test if passing 'attn_implementation' as a dict fails or not attn_implementation_per_subconfig = {} - for key, sub_config in sub_configs.items(): + for key in config.sub_configs.keys(): attn_implementation_per_subconfig[key] = "eager" config._attn_implementation = attn_implementation_per_subconfig model = model_class(config) - for key in model.config: - if isinstance(getattr(model.config, key), PretrainedConfig): - sub_config = getattr(model.config, key) - self.assertTrue(sub_config._attn_implementation == "eager") + for key in config.sub_configs.keys(): + sub_config = getattr(model.config, key) + self.assertTrue(sub_config._attn_implementation == "eager") for name, submodule in model.named_modules(): class_name = submodule.__class__.__name__ @@ -3826,7 +3822,7 @@ def test_attn_implementation_composite_models(self): or "SdpaSelfAttention" in class_name or "FlashAttention" in class_name ): - raise ValueError("The eager model should not have SDPA/FA2 attention layers") + raise ValueError(f"The eager model should not have SDPA/FA2 attention layers but got {class_name}") @require_torch_sdpa def test_sdpa_can_dispatch_non_composite_models(self): From ceda121879443e1658a70af7289dcd3fc77d3dcb Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 4 Nov 2024 18:03:47 +0100 Subject: [PATCH 10/10] this model was missed --- .../configuration_vision_text_dual_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index 98a9c9bb19ae..0d79720e1aa8 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -75,7 
+75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig): ```""" model_type = "vision-text-dual-encoder" - sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} + sub_configs = {"vision_config": AutoConfig, "text_config": AutoConfig} is_composition = True def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs):