huggingface · zucchini-nlp · Jan 30, 2026 · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026
diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py
@@ -185,10 +185,6 @@ def __init__(
         layer_type_validation(self.layer_types, self.num_hidden_layers)
 
         self.rope_parameters = rope_parameters
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py
@@ -209,10 +209,6 @@ def __init__(
         layer_type_validation(self.layer_types, self.num_hidden_layers)
 
         self.rope_parameters = rope_parameters
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

diff --git a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py
@@ -37,6 +37,8 @@ class Cohere2VisionConfig(PreTrainedConfig):
             The token ID to use as placeholder for the image input.
         alignment_intermediate_size (`int`, *optional*, defaults to 36864):
             The size of the intermediate layer for alignment.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
     """
 
     model_type = "cohere2_vision"
@@ -49,6 +51,7 @@ def __init__(
         downsample_factor=2,
         image_token_id=255036,
         alignment_intermediate_size=36864,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.downsample_factor = downsample_factor
@@ -73,9 +76,10 @@ def __init__(
             text_config["model_type"] = text_config.get("model_type", "cohere2")
             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
-            text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True)
+            text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=tie_word_embeddings)
 
         self.text_config = text_config
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

diff --git a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
@@ -44,6 +44,8 @@ class DeepseekVLConfig(PreTrainedConfig):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -68,6 +70,7 @@ def __init__(
         text_config: AutoConfig | None = None,
         vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if text_config is None:
@@ -89,6 +92,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
@@ -55,6 +55,8 @@ class DeepseekVLConfig(PreTrainedConfig):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -79,6 +81,7 @@ def __init__(
         text_config: AutoConfig | None = None,
         vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if text_config is None:
@@ -100,6 +103,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

diff --git a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py
@@ -45,6 +45,8 @@ class DeepseekVLHybridConfig(PreTrainedConfig):
             The config object or dictionary of the high resolution vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -70,6 +72,7 @@ def __init__(
         vision_config: AutoConfig | None = None,
         high_res_vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if high_res_vision_config is None:
@@ -100,6 +103,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
@@ -101,6 +101,8 @@ class DeepseekVLHybridConfig(DeepseekVLConfig):
             The config object or dictionary of the high resolution vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -126,6 +128,7 @@ def __init__(
         vision_config: AutoConfig | None = None,
         high_res_vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if high_res_vision_config is None:
@@ -142,6 +145,7 @@ def __init__(
             text_config=text_config,
             vision_config=vision_config,
             image_token_id=image_token_id,
+            tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
 

diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py
@@ -126,6 +126,8 @@ class DeformableDetrConfig(PreTrainedConfig):
         disable_custom_kernels (`bool`, *optional*, defaults to `False`):
             Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
             kernels are not supported by PyTorch ONNX export.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Examples:
 
@@ -194,6 +196,7 @@ def __init__(
         eos_coefficient=0.1,
         focal_alpha=0.25,
         disable_custom_kernels=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         # We default to values which were previously hard-coded in the model. This enables configurability of the config
@@ -268,6 +271,7 @@ def __init__(
         self.eos_coefficient = eos_coefficient
         self.focal_alpha = focal_alpha
         self.disable_custom_kernels = disable_custom_kernels
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 

diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py
@@ -166,6 +166,8 @@ class Emu3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
 
 
     ```python
@@ -206,6 +208,7 @@ def __init__(
         attention_bias=False,
         attention_dropout: float = 0.1,
         initializer_range: float = 0.02,
+        tie_word_embeddings: bool | None = False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -227,6 +230,7 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py
@@ -67,6 +67,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_parameters (`RopeParameters`, *optional*):
@@ -139,6 +141,7 @@ def __init__(
         use_cache: bool | None = True,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_dropout: float | None = 0.0,
@@ -163,6 +166,7 @@ def __init__(
         self.sliding_window_pattern = sliding_window_pattern
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.tie_word_embeddings = tie_word_embeddings
 
         self.layer_types = layer_types

diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py
@@ -101,6 +101,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_parameters (`RopeParameters`, *optional*):
@@ -173,6 +175,7 @@ def __init__(
         use_cache: bool | None = True,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_dropout: float | None = 0.0,
@@ -197,6 +200,7 @@ def __init__(
         self.sliding_window_pattern = sliding_window_pattern
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.tie_word_embeddings = tie_word_embeddings
 
         self.layer_types = layer_types

diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
@@ -84,6 +84,8 @@ class FalconMambaConfig(PreTrainedConfig):
             Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
         mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
             The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
 
     Example:
@@ -130,6 +132,7 @@ def __init__(
         use_cache=True,
         use_falcon_mambapy=False,
         mixer_rms_eps=1e-6,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -162,6 +165,7 @@ def __init__(
         self.residual_in_fp32 = residual_in_fp32
         self.use_cache = use_cache
         self.use_falcon_mambapy = use_falcon_mambapy
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
         self.mixer_rms_eps = mixer_rms_eps

diff --git a/src/transformers/models/falcon_mamba/modular_falcon_mamba.py b/src/transformers/models/falcon_mamba/modular_falcon_mamba.py
@@ -110,6 +110,8 @@ class FalconMambaConfig(MambaConfig):
             Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
         mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
             The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
 
     Example:
@@ -154,6 +156,7 @@ def __init__(
         use_cache=True,
         use_falcon_mambapy=False,
         mixer_rms_eps=1e-6,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         super().__init__(
@@ -181,6 +184,7 @@ def __init__(
             rescale_prenorm_residual=rescale_prenorm_residual,
             use_cache=use_cache,
             use_falcon_mambapy=use_falcon_mambapy,
+            tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
         self.mixer_rms_eps = mixer_rms_eps

diff --git a/src/transformers/models/fast_vlm/configuration_fast_vlm.py b/src/transformers/models/fast_vlm/configuration_fast_vlm.py
@@ -51,6 +51,8 @@ class FastVlmConfig(PreTrainedConfig):
             vision features. Only -1 supported.
         multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
             Whether to use bias in the multimodal projector.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -82,6 +84,7 @@ def __init__(
         vision_feature_select_strategy="full",
         vision_feature_layer=-1,
         multimodal_projector_bias=True,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         self.image_token_id = image_token_id
@@ -130,6 +133,13 @@ def __init__(
 
         self.text_config = text_config
         self.multimodal_projector_bias = multimodal_projector_bias
+        self.tie_word_embeddings = tie_word_embeddings
+
+        # The default value is `False` but this config is used with many model types
+        # Attr `tie_word_embeddings` was saved in text config for those models, so we
+        # need an ugly workaround and forward-pass the attr from text config
+        if not tie_word_embeddings and self.text_config.tie_word_embeddings:
+            self.tie_word_embeddings = self.text_config.tie_word_embeddings
 
         super().__init__(**kwargs)
 

diff --git a/src/transformers/models/fast_vlm/modular_fast_vlm.py b/src/transformers/models/fast_vlm/modular_fast_vlm.py
@@ -63,6 +63,8 @@ class FastVlmConfig(LlavaConfig):
             vision features. Only -1 supported.
         multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
             Whether to use bias in the multimodal projector.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -90,6 +92,7 @@ def __init__(
         vision_feature_select_strategy="full",
         vision_feature_layer=-1,
         multimodal_projector_bias=True,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         self.image_token_id = image_token_id
@@ -138,6 +141,13 @@ def __init__(
 
         self.text_config = text_config
         self.multimodal_projector_bias = multimodal_projector_bias
+        self.tie_word_embeddings = tie_word_embeddings
+
+        # The default value is `False` but this config is used with many model types
+        # Attr `tie_word_embeddings` was saved in text config for those models, so we
+        # need an ugly workaround and forward-pass the attr from text config
+        if not tie_word_embeddings and self.text_config.tie_word_embeddings:
+            self.tie_word_embeddings = self.text_config.tie_word_embeddings
 
         PreTrainedConfig.__init__(**kwargs)
 

diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py
@@ -431,9 +431,8 @@ class FlavaConfig(PreTrainedConfig):
             Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses.
         return_loss (`bool`, *optional*, defaults to `True`):
             Whether to return loss or not
-
-        kwargs (*optional*):
-            Dictionary of keyword arguments.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -483,6 +482,7 @@ def __init__(
         global_backprop_contrastive: bool = True,
         skip_unmasked_multimodal_encoder: bool = True,
         return_loss: bool = True,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         # If `_config_dict` exist, we use them for the backward compatibility.
@@ -663,6 +663,7 @@ def __init__(
         self.global_backprop_contrastive = global_backprop_contrastive
         self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder
         self.return_loss = return_loss
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)