Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions src/transformers/models/clip/configuration_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ class CLIPConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.

Args:
text_config_dict (`dict`, *optional*):
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`CLIPTextConfig`].
vision_config_dict (`dict`, *optional*):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
Expand Down Expand Up @@ -300,25 +300,28 @@ class CLIPConfig(PretrainedConfig):
is_composition = True

def __init__(
self,
text_config_dict=None,
vision_config_dict=None,
projection_dim=512,
logit_scale_init_value=2.6592,
**kwargs
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
):
super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs)
super().__init__(**kwargs)

# If the legacy `*_config_dict` arguments are given, they take precedence, for backward compatibility.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For backward compatibility

text_config_dict = kwargs.pop("text_config_dict", None)
vision_config_dict = kwargs.pop("vision_config_dict", None)
if text_config_dict is not None:
text_config = text_config_dict
if vision_config_dict is not None:
vision_config = vision_config_dict

if text_config_dict is None:
text_config_dict = {}
logger.info("text_config_dict is None. Initializing the CLIPTextConfig with default values.")
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the CLIPTextConfig with default values.")

if vision_config_dict is None:
vision_config_dict = {}
logger.info("vision_config_dict is None. initializing the CLIPVisionConfig with default values.")
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. initializing the CLIPVisionConfig with default values.")

self.text_config = CLIPTextConfig(**text_config_dict)
self.vision_config = CLIPVisionConfig(**vision_config_dict)
self.text_config = CLIPTextConfig(**text_config)
self.vision_config = CLIPVisionConfig(**vision_config)

self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
Expand All @@ -334,7 +337,7 @@ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CL
[`CLIPConfig`]: An instance of a configuration object
"""

return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs)
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

def to_dict(self):
"""
Expand Down
79 changes: 44 additions & 35 deletions src/transformers/models/flava/configuration_flava.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,11 +471,11 @@ class FlavaConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.

Args:
text_config_dict (`dict`, *optional*):
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`FlavaTextConfig`].
image_config_dict (`dict`, *optional*):
image_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`FlavaImageConfig`].
multimodal_config_dict (`dict`, *optional*):
multimodal_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`FlavaMultimodalConfig`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
Expand Down Expand Up @@ -535,10 +535,10 @@ class FlavaConfig(PretrainedConfig):

def __init__(
self,
image_config_dict: Dict[str, Any] = None,
text_config_dict: Dict[str, Any] = None,
multimodal_config_dict: Dict[str, Any] = None,
image_codebook_config_dict: Dict[str, Any] = None,
image_config: Dict[str, Any] = None,
text_config: Dict[str, Any] = None,
multimodal_config: Dict[str, Any] = None,
image_codebook_config: Dict[str, Any] = None,
hidden_size: int = 768,
layer_norm_eps: float = 1e-12,
projection_dim: int = 768,
Expand All @@ -559,33 +559,42 @@ def __init__(
):
super().__init__(**kwargs)

if image_config_dict is None:
image_config_dict = {}
logger.info("image_config_dict is None. initializing the FlavaImageConfig with default values.")

if text_config_dict is None:
text_config_dict = {}
logger.info("text_config_dict is None. Initializing the FlavaTextConfig with default values.")

if multimodal_config_dict is None:
multimodal_config_dict = {}
logger.info("multimodal_config_dict is None. initializing the FlavaMultimodalConfig with default values.")

if image_codebook_config_dict is None:
image_codebook_config_dict = {}
# If the legacy `*_config_dict` arguments are given, they take precedence, for backward compatibility.
text_config_dict = kwargs.pop("text_config_dict", None)
image_config_dict = kwargs.pop("vision_config_dict", None)
multimodal_config_dict = kwargs.pop("multimodal_config_dict", None)
image_codebook_config_dict = kwargs.pop("image_codebook_config_dict", None)
if text_config_dict is not None:
text_config = text_config_dict
if image_config_dict is not None:
image_config = image_config_dict
if multimodal_config_dict is not None:
multimodal_config = multimodal_config_dict
if image_codebook_config_dict is not None:
image_codebook_config = image_codebook_config_dict

if image_config is None:
image_config = {}
logger.info("image_config is None. initializing the FlavaImageConfig with default values.")

if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the FlavaTextConfig with default values.")

if multimodal_config is None:
multimodal_config = {}
logger.info("multimodal_config is None. initializing the FlavaMultimodalConfig with default values.")

if image_codebook_config is None:
image_codebook_config = {}
logger.info(
"image_codebook_config_dict is None. initializing the FlavaImageCodebookConfig with default values."
"image_codebook_config is None. initializing the FlavaImageCodebookConfig with default values."
)

self.image_config_dict = image_config_dict
self.text_config_dict = text_config_dict
self.multimodal_config_dict = multimodal_config_dict
self.image_codebook_config_dict = image_codebook_config_dict

self.image_config = FlavaImageConfig(**self.image_config_dict)
self.text_config = FlavaTextConfig(**self.text_config_dict)
self.multimodal_config = FlavaMultimodalConfig(**self.multimodal_config_dict)
self.image_codebook_config = FlavaImageCodebookConfig(**self.image_codebook_config_dict)
self.image_config = FlavaImageConfig(**image_config)
self.text_config = FlavaTextConfig(**text_config)
self.multimodal_config = FlavaMultimodalConfig(**multimodal_config)
self.image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config)
self.projection_dim = projection_dim
self.init_codebook = init_codebook

Expand Down Expand Up @@ -623,10 +632,10 @@ def from_configs(
"""

return cls(
image_config_dict=image_config.to_dict(),
text_config_dict=text_config.to_dict(),
multimodal_config_dict=multimodal_config.to_dict(),
image_codebook_config_dict=image_codebook_config.to_dict(),
image_config=image_config.to_dict(),
text_config=text_config.to_dict(),
multimodal_config=multimodal_config.to_dict(),
image_codebook_config=image_codebook_config.to_dict(),
**kwargs,
)

Expand Down
36 changes: 22 additions & 14 deletions src/transformers/models/groupvit/configuration_groupvit.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,9 +280,9 @@ class GroupViTConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.

Args:
text_config_dict (`dict`, *optional*):
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`GroupViTTextConfig`].
vision_config_dict (`dict`, *optional*):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`GroupViTVisionConfig`].
projection_dim (`int`, *optional*, defaults to 256):
Dimensionality of text and vision projection layers.
Expand All @@ -300,25 +300,33 @@ class GroupViTConfig(PretrainedConfig):

def __init__(
self,
text_config_dict=None,
vision_config_dict=None,
text_config=None,
vision_config=None,
projection_dim=256,
projection_intermediate_dim=4096,
logit_scale_init_value=2.6592,
**kwargs
):
super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs)
super().__init__(**kwargs)

# If the legacy `*_config_dict` arguments are given, they take precedence, for backward compatibility.
text_config_dict = kwargs.pop("text_config_dict", None)
vision_config_dict = kwargs.pop("vision_config_dict", None)
if text_config_dict is not None:
text_config = text_config_dict
if vision_config_dict is not None:
vision_config = vision_config_dict

if text_config_dict is None:
text_config_dict = {}
logger.info("text_config_dict is None. Initializing the GroupViTTextConfig with default values.")
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the GroupViTTextConfig with default values.")

if vision_config_dict is None:
vision_config_dict = {}
logger.info("vision_config_dict is None. initializing the GroupViTVisionConfig with default values.")
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. initializing the GroupViTVisionConfig with default values.")

self.text_config = GroupViTTextConfig(**text_config_dict)
self.vision_config = GroupViTVisionConfig(**vision_config_dict)
self.text_config = GroupViTTextConfig(**text_config)
self.vision_config = GroupViTVisionConfig(**vision_config)

self.projection_dim = projection_dim
self.projection_intermediate_dim = projection_intermediate_dim
Expand All @@ -337,7 +345,7 @@ def from_text_vision_configs(cls, text_config: GroupViTTextConfig, vision_config
[`GroupViTConfig`]: An instance of a configuration object
"""

return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs)
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

def to_dict(self):
"""
Expand Down
10 changes: 5 additions & 5 deletions src/transformers/models/owlvit/configuration_owlvit.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,9 @@ class OwlViTConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.

Args:
text_config_dict (`dict`, *optional*):
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`OwlViTTextConfig`].
vision_config_dict (`dict`, *optional*):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`OwlViTVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
Expand All @@ -285,15 +285,15 @@ def __init__(
return_dict=True,
**kwargs
):
super().__init__(text_config=text_config, vision_config=vision_config, **kwargs)
super().__init__(**kwargs)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need to pass text/vision config to super, as we will set self.text_config and self.vision_config below


if text_config is None:
text_config = {}
logger.info("text_config_dict is None. Initializing the OwlViTTextConfig with default values.")
logger.info("text_config is None. Initializing the OwlViTTextConfig with default values.")

if vision_config is None:
vision_config = {}
logger.info("vision_config_dict is None. initializing the OwlViTVisionConfig with default values.")
logger.info("vision_config is None. initializing the OwlViTVisionConfig with default values.")

self.text_config = OwlViTTextConfig(**text_config)
self.vision_config = OwlViTVisionConfig(**vision_config)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.

Args:
text_config_dict (`dict`):
text_config (`dict`):
Dictionary of configuration options that defines text model config.
vision_config_dict (`dict`):
vision_config (`dict`):
Dictionary of configuration options that define the vision model config.
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
Expand Down
36 changes: 22 additions & 14 deletions src/transformers/models/x_clip/configuration_x_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,9 @@ class XCLIPConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.

Args:
text_config_dict (`dict`, *optional*):
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`XCLIPTextConfig`].
vision_config_dict (`dict`, *optional*):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`XCLIPVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
Expand Down Expand Up @@ -309,8 +309,8 @@ class XCLIPConfig(PretrainedConfig):

def __init__(
self,
text_config_dict=None,
vision_config_dict=None,
text_config=None,
vision_config=None,
projection_dim=512,
prompt_layers=2,
prompt_alpha=0.1,
Expand All @@ -321,18 +321,26 @@ def __init__(
logit_scale_init_value=2.6592,
**kwargs
):
super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs)
super().__init__(**kwargs)

# If the legacy `*_config_dict` arguments are given, they take precedence, for backward compatibility.
text_config_dict = kwargs.pop("text_config_dict", None)
vision_config_dict = kwargs.pop("vision_config_dict", None)
if text_config_dict is not None:
text_config = text_config_dict
if vision_config_dict is not None:
vision_config = vision_config_dict

if text_config_dict is None:
text_config_dict = {}
logger.info("text_config_dict is None. Initializing the XCLIPTextConfig with default values.")
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the XCLIPTextConfig with default values.")

if vision_config_dict is None:
vision_config_dict = {}
logger.info("vision_config_dict is None. initializing the XCLIPVisionConfig with default values.")
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. initializing the XCLIPVisionConfig with default values.")

self.text_config = XCLIPTextConfig(**text_config_dict)
self.vision_config = XCLIPVisionConfig(**vision_config_dict)
self.text_config = XCLIPTextConfig(**text_config)
self.vision_config = XCLIPVisionConfig(**vision_config)

self.projection_dim = projection_dim
self.prompt_layers = prompt_layers
Expand All @@ -354,7 +362,7 @@ def from_text_vision_configs(cls, text_config: XCLIPTextConfig, vision_config: X
[`XCLIPConfig`]: An instance of a configuration object
"""

return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs)
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

def to_dict(self):
"""
Expand Down