Skip to content
Merged
4 changes: 0 additions & 4 deletions src/transformers/models/cohere2/configuration_cohere2.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,6 @@ def __init__(
layer_type_validation(self.layer_types, self.num_hidden_layers)

self.rope_parameters = rope_parameters
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
4 changes: 0 additions & 4 deletions src/transformers/models/cohere2/modular_cohere2.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,10 +209,6 @@ def __init__(
layer_type_validation(self.layer_types, self.num_hidden_layers)

self.rope_parameters = rope_parameters
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class Cohere2VisionConfig(PreTrainedConfig):
The token ID to use as placeholder for the image input.
alignment_intermediate_size (`int`, *optional*, defaults to 36864):
The size of the intermediate layer for alignment.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.
"""

model_type = "cohere2_vision"
Expand All @@ -49,6 +51,7 @@ def __init__(
downsample_factor=2,
image_token_id=255036,
alignment_intermediate_size=36864,
tie_word_embeddings=True,
**kwargs,
):
self.downsample_factor = downsample_factor
Expand All @@ -73,9 +76,10 @@ def __init__(
text_config["model_type"] = text_config.get("model_type", "cohere2")
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True)
text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=tie_word_embeddings)

self.text_config = text_config
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class DeepseekVLConfig(PreTrainedConfig):
The config object or dictionary of the vision backbone.
image_token_id (`int`, *optional*, defaults to 100015):
The index representing image tokens in the model's token vocabulary.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.

Example:

Expand All @@ -68,6 +70,7 @@ def __init__(
text_config: AutoConfig | None = None,
vision_config: AutoConfig | None = None,
image_token_id: int = 100015,
tie_word_embeddings: bool | None = True,
**kwargs,
):
if text_config is None:
Expand All @@ -89,6 +92,7 @@ def __init__(
self.text_config = text_config
self.vision_config = vision_config
self.image_token_id = image_token_id
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
4 changes: 4 additions & 0 deletions src/transformers/models/deepseek_vl/modular_deepseek_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class DeepseekVLConfig(PreTrainedConfig):
The config object or dictionary of the vision backbone.
image_token_id (`int`, *optional*, defaults to 100015):
The index representing image tokens in the model's token vocabulary.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.

Example:

Expand All @@ -79,6 +81,7 @@ def __init__(
text_config: AutoConfig | None = None,
vision_config: AutoConfig | None = None,
image_token_id: int = 100015,
tie_word_embeddings: bool | None = True,
**kwargs,
):
if text_config is None:
Expand All @@ -100,6 +103,7 @@ def __init__(
self.text_config = text_config
self.vision_config = vision_config
self.image_token_id = image_token_id
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class DeepseekVLHybridConfig(PreTrainedConfig):
The config object or dictionary of the high resolution vision backbone.
image_token_id (`int`, *optional*, defaults to 100015):
The index representing image tokens in the model's token vocabulary.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.

Example:

Expand All @@ -70,6 +72,7 @@ def __init__(
vision_config: AutoConfig | None = None,
high_res_vision_config: AutoConfig | None = None,
image_token_id: int = 100015,
tie_word_embeddings: bool | None = True,
**kwargs,
):
if high_res_vision_config is None:
Expand Down Expand Up @@ -100,6 +103,7 @@ def __init__(
self.text_config = text_config
self.vision_config = vision_config
self.image_token_id = image_token_id
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ class DeepseekVLHybridConfig(DeepseekVLConfig):
The config object or dictionary of the high resolution vision backbone.
image_token_id (`int`, *optional*, defaults to 100015):
The index representing image tokens in the model's token vocabulary.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.

Example:

Expand All @@ -126,6 +128,7 @@ def __init__(
vision_config: AutoConfig | None = None,
high_res_vision_config: AutoConfig | None = None,
image_token_id: int = 100015,
tie_word_embeddings: bool | None = True,
**kwargs,
):
if high_res_vision_config is None:
Expand All @@ -142,6 +145,7 @@ def __init__(
text_config=text_config,
vision_config=vision_config,
image_token_id=image_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ class DeformableDetrConfig(PreTrainedConfig):
disable_custom_kernels (`bool`, *optional*, defaults to `False`):
Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
kernels are not supported by PyTorch ONNX export.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.

Examples:

Expand Down Expand Up @@ -194,6 +196,7 @@ def __init__(
eos_coefficient=0.1,
focal_alpha=0.25,
disable_custom_kernels=False,
tie_word_embeddings=True,
**kwargs,
):
# We default to values which were previously hard-coded in the model. This enables configurability of the config
Expand Down Expand Up @@ -268,6 +271,7 @@ def __init__(
self.eos_coefficient = eos_coefficient
self.focal_alpha = focal_alpha
self.disable_custom_kernels = disable_custom_kernels
self.tie_word_embeddings = tie_word_embeddings

super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

Expand Down
4 changes: 4 additions & 0 deletions src/transformers/models/emu3/configuration_emu3.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ class Emu3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.


```python
Expand Down Expand Up @@ -206,6 +208,7 @@ def __init__(
attention_bias=False,
attention_dropout: float = 0.1,
initializer_range: float = 0.02,
tie_word_embeddings: bool | None = False,
**kwargs,
):
self.vocab_size = vocab_size
Expand All @@ -227,6 +230,7 @@ def __init__(
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
4 changes: 4 additions & 0 deletions src/transformers/models/exaone4/configuration_exaone4.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pad_token_id (`int`, *optional*):
The id of the padding token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.
rope_parameters (`RopeParameters`, *optional*):
Expand Down Expand Up @@ -139,6 +141,7 @@ def __init__(
use_cache: bool | None = True,
bos_token_id: int | None = 0,
eos_token_id: int | None = 2,
pad_token_id: int | None = None,
tie_word_embeddings: bool | None = False,
rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
attention_dropout: float | None = 0.0,
Expand All @@ -163,6 +166,7 @@ def __init__(
self.sliding_window_pattern = sliding_window_pattern
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.tie_word_embeddings = tie_word_embeddings

self.layer_types = layer_types
Expand Down
4 changes: 4 additions & 0 deletions src/transformers/models/exaone4/modular_exaone4.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pad_token_id (`int`, *optional*):
The id of the padding token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.
rope_parameters (`RopeParameters`, *optional*):
Expand Down Expand Up @@ -173,6 +175,7 @@ def __init__(
use_cache: bool | None = True,
bos_token_id: int | None = 0,
eos_token_id: int | None = 2,
pad_token_id: int | None = None,
tie_word_embeddings: bool | None = False,
rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
attention_dropout: float | None = 0.0,
Expand All @@ -197,6 +200,7 @@ def __init__(
self.sliding_window_pattern = sliding_window_pattern
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.tie_word_embeddings = tie_word_embeddings

self.layer_types = layer_types
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ class FalconMambaConfig(PreTrainedConfig):
Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.


Example:
Expand Down Expand Up @@ -130,6 +132,7 @@ def __init__(
use_cache=True,
use_falcon_mambapy=False,
mixer_rms_eps=1e-6,
tie_word_embeddings=True,
**kwargs,
):
self.vocab_size = vocab_size
Expand Down Expand Up @@ -162,6 +165,7 @@ def __init__(
self.residual_in_fp32 = residual_in_fp32
self.use_cache = use_cache
self.use_falcon_mambapy = use_falcon_mambapy
self.tie_word_embeddings = tie_word_embeddings

super().__init__(**kwargs)
self.mixer_rms_eps = mixer_rms_eps
Expand Down
4 changes: 4 additions & 0 deletions src/transformers/models/falcon_mamba/modular_falcon_mamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ class FalconMambaConfig(MambaConfig):
Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.


Example:
Expand Down Expand Up @@ -154,6 +156,7 @@ def __init__(
use_cache=True,
use_falcon_mambapy=False,
mixer_rms_eps=1e-6,
tie_word_embeddings=True,
**kwargs,
):
super().__init__(
Expand Down Expand Up @@ -181,6 +184,7 @@ def __init__(
rescale_prenorm_residual=rescale_prenorm_residual,
use_cache=use_cache,
use_falcon_mambapy=use_falcon_mambapy,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.mixer_rms_eps = mixer_rms_eps
Expand Down
10 changes: 10 additions & 0 deletions src/transformers/models/fast_vlm/configuration_fast_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ class FastVlmConfig(PreTrainedConfig):
vision features. Only -1 supported.
multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
Whether to use bias in the multimodal projector.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.

Example:

Expand Down Expand Up @@ -82,6 +84,7 @@ def __init__(
vision_feature_select_strategy="full",
vision_feature_layer=-1,
multimodal_projector_bias=True,
tie_word_embeddings=False,
**kwargs,
):
self.image_token_id = image_token_id
Expand Down Expand Up @@ -130,6 +133,13 @@ def __init__(

self.text_config = text_config
self.multimodal_projector_bias = multimodal_projector_bias
self.tie_word_embeddings = tie_word_embeddings

# The default value is `False`, but this config is used with many model types.
# For those models `tie_word_embeddings` was saved in the text config, so as a
# workaround we propagate the attribute from the text config when it is set there.
if not tie_word_embeddings and self.text_config.tie_word_embeddings:
self.tie_word_embeddings = self.text_config.tie_word_embeddings

super().__init__(**kwargs)

Expand Down
10 changes: 10 additions & 0 deletions src/transformers/models/fast_vlm/modular_fast_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ class FastVlmConfig(LlavaConfig):
vision features. Only -1 supported.
multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
Whether to use bias in the multimodal projector.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.

Example:

Expand Down Expand Up @@ -90,6 +92,7 @@ def __init__(
vision_feature_select_strategy="full",
vision_feature_layer=-1,
multimodal_projector_bias=True,
tie_word_embeddings=False,
**kwargs,
):
self.image_token_id = image_token_id
Expand Down Expand Up @@ -138,6 +141,13 @@ def __init__(

self.text_config = text_config
self.multimodal_projector_bias = multimodal_projector_bias
self.tie_word_embeddings = tie_word_embeddings

# The default value is `False`, but this config is used with many model types.
# For those models `tie_word_embeddings` was saved in the text config, so as a
# workaround we propagate the attribute from the text config when it is set there.
if not tie_word_embeddings and self.text_config.tie_word_embeddings:
self.tie_word_embeddings = self.text_config.tie_word_embeddings

PreTrainedConfig.__init__(**kwargs)

Expand Down
7 changes: 4 additions & 3 deletions src/transformers/models/flava/configuration_flava.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,9 +431,8 @@ class FlavaConfig(PreTrainedConfig):
Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses.
return_loss (`bool`, *optional*, defaults to `True`):
Whether to return loss or not

kwargs (*optional*):
Dictionary of keyword arguments.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie the input and output word embeddings.

Example:

Expand Down Expand Up @@ -483,6 +482,7 @@ def __init__(
global_backprop_contrastive: bool = True,
skip_unmasked_multimodal_encoder: bool = True,
return_loss: bool = True,
tie_word_embeddings: bool | None = True,
**kwargs,
):
# If `_config_dict` exist, we use them for the backward compatibility.
Expand Down Expand Up @@ -663,6 +663,7 @@ def __init__(
self.global_backprop_contrastive = global_backprop_contrastive
self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder
self.return_loss = return_loss
self.tie_word_embeddings = tie_word_embeddings
super().__init__(**kwargs)


Expand Down
Loading