diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py
index fbda54d2f11c..206857d20a38 100644
--- a/src/transformers/models/cohere2/configuration_cohere2.py
+++ b/src/transformers/models/cohere2/configuration_cohere2.py
@@ -185,10 +185,6 @@ def __init__(
         layer_type_validation(self.layer_types, self.num_hidden_layers)
 
         self.rope_parameters = rope_parameters
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py
index 0989822b4fbe..ca3a8e14af89 100644
--- a/src/transformers/models/cohere2/modular_cohere2.py
+++ b/src/transformers/models/cohere2/modular_cohere2.py
@@ -209,10 +209,6 @@ def __init__(
         layer_type_validation(self.layer_types, self.num_hidden_layers)
 
         self.rope_parameters = rope_parameters
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py
index 911a25266105..6bfa27011cce 100644
--- a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py
@@ -37,6 +37,8 @@ class Cohere2VisionConfig(PreTrainedConfig):
             The token ID to use as placeholder for the image input.
         alignment_intermediate_size (`int`, *optional*, defaults to 36864):
             The size of the intermediate layer for alignment.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie the model's input and output word embeddings.
     """
 
     model_type = "cohere2_vision"
@@ -49,6 +51,7 @@ def __init__(
         downsample_factor=2,
         image_token_id=255036,
         alignment_intermediate_size=36864,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.downsample_factor = downsample_factor
@@ -73,9 +76,10 @@ def __init__(
             text_config["model_type"] = text_config.get("model_type", "cohere2")
             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
-            text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True)
+            text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=tie_word_embeddings)
 
         self.text_config = text_config
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
index 24f7690fd764..2372bf321c7d 100644
--- a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
@@ -44,6 +44,8 @@ class DeepseekVLConfig(PreTrainedConfig):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -68,6 +70,7 @@ def __init__(
         text_config: AutoConfig | None = None,
         vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if text_config is None:
@@ -89,6 +92,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
index 23c3ed23c0d3..90f447090a23 100644
--- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
@@ -55,6 +55,8 @@ class DeepseekVLConfig(PreTrainedConfig):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -79,6 +81,7 @@ def __init__(
         text_config: AutoConfig | None = None,
         vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if text_config is None:
@@ -100,6 +103,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py
index 35b3dc75166e..cee5d6fe3280 100644
--- a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py
@@ -45,6 +45,8 @@ class DeepseekVLHybridConfig(PreTrainedConfig):
             The config object or dictionary of the high resolution vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -70,6 +72,7 @@ def __init__(
         vision_config: AutoConfig | None = None,
         high_res_vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if high_res_vision_config is None:
@@ -100,6 +103,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
index 8488833e6aaf..4b027a7cc421 100644
--- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
@@ -101,6 +101,8 @@ class DeepseekVLHybridConfig(DeepseekVLConfig):
             The config object or dictionary of the high resolution vision backbone.
         image_token_id (`int`, *optional*, defaults to 100015):
             The index representing image tokens in the model's token vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -126,6 +128,7 @@ def __init__(
         vision_config: AutoConfig | None = None,
         high_res_vision_config: AutoConfig | None = None,
         image_token_id: int = 100015,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         if high_res_vision_config is None:
@@ -142,6 +145,7 @@ def __init__(
             text_config=text_config,
             vision_config=vision_config,
             image_token_id=image_token_id,
+            tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
 
diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py
index 46c9ac9a2a49..045068ec702d 100644
--- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py
@@ -126,6 +126,8 @@ class DeformableDetrConfig(PreTrainedConfig):
         disable_custom_kernels (`bool`, *optional*, defaults to `False`):
             Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
             kernels are not supported by PyTorch ONNX export.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Examples:
 
@@ -194,6 +196,7 @@ def __init__(
         eos_coefficient=0.1,
         focal_alpha=0.25,
         disable_custom_kernels=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         # We default to values which were previously hard-coded in the model. This enables configurability of the config
@@ -268,6 +271,7 @@ def __init__(
         self.eos_coefficient = eos_coefficient
         self.focal_alpha = focal_alpha
         self.disable_custom_kernels = disable_custom_kernels
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py
index 25e6d14be0d9..9fd071e82781 100644
--- a/src/transformers/models/emu3/configuration_emu3.py
+++ b/src/transformers/models/emu3/configuration_emu3.py
@@ -166,6 +166,8 @@ class Emu3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the model's input and output word embeddings.
 
 
     ```python
@@ -206,6 +208,7 @@ def __init__(
         attention_bias=False,
         attention_dropout: float = 0.1,
         initializer_range: float = 0.02,
+        tie_word_embeddings: bool | None = False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -227,6 +230,7 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py
index f6bf32726736..0d464c8c8f04 100644
--- a/src/transformers/models/exaone4/configuration_exaone4.py
+++ b/src/transformers/models/exaone4/configuration_exaone4.py
@@ -67,6 +67,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_parameters (`RopeParameters`, *optional*):
@@ -139,6 +141,7 @@ def __init__(
         use_cache: bool | None = True,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_dropout: float | None = 0.0,
@@ -163,6 +166,7 @@ def __init__(
         self.sliding_window_pattern = sliding_window_pattern
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.tie_word_embeddings = tie_word_embeddings
 
         self.layer_types = layer_types
diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py
index a1746024b5f3..95f6e2128c94 100644
--- a/src/transformers/models/exaone4/modular_exaone4.py
+++ b/src/transformers/models/exaone4/modular_exaone4.py
@@ -101,6 +101,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_parameters (`RopeParameters`, *optional*):
@@ -173,6 +175,7 @@ def __init__(
         use_cache: bool | None = True,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_dropout: float | None = 0.0,
@@ -197,6 +200,7 @@ def __init__(
         self.sliding_window_pattern = sliding_window_pattern
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.tie_word_embeddings = tie_word_embeddings
 
         self.layer_types = layer_types
diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
index ce0eac49c72e..efa14928b164 100644
--- a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
+++ b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
@@ -84,6 +84,8 @@ class FalconMambaConfig(PreTrainedConfig):
             Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
         mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
             The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
 
     Example:
@@ -130,6 +132,7 @@ def __init__(
         use_cache=True,
         use_falcon_mambapy=False,
         mixer_rms_eps=1e-6,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -162,6 +165,7 @@ def __init__(
         self.residual_in_fp32 = residual_in_fp32
         self.use_cache = use_cache
         self.use_falcon_mambapy = use_falcon_mambapy
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
         self.mixer_rms_eps = mixer_rms_eps
diff --git a/src/transformers/models/falcon_mamba/modular_falcon_mamba.py b/src/transformers/models/falcon_mamba/modular_falcon_mamba.py
index a9026c81c767..67fa0c9ab509 100644
--- a/src/transformers/models/falcon_mamba/modular_falcon_mamba.py
+++ b/src/transformers/models/falcon_mamba/modular_falcon_mamba.py
@@ -110,6 +110,8 @@ class FalconMambaConfig(MambaConfig):
             Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
         mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
             The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
 
     Example:
@@ -154,6 +156,7 @@ def __init__(
         use_cache=True,
         use_falcon_mambapy=False,
         mixer_rms_eps=1e-6,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         super().__init__(
@@ -181,6 +184,7 @@ def __init__(
             rescale_prenorm_residual=rescale_prenorm_residual,
             use_cache=use_cache,
             use_falcon_mambapy=use_falcon_mambapy,
+            tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
         self.mixer_rms_eps = mixer_rms_eps
diff --git a/src/transformers/models/fast_vlm/configuration_fast_vlm.py b/src/transformers/models/fast_vlm/configuration_fast_vlm.py
index 46e5a6ccbf76..925b4eaa5c71 100644
--- a/src/transformers/models/fast_vlm/configuration_fast_vlm.py
+++ b/src/transformers/models/fast_vlm/configuration_fast_vlm.py
@@ -51,6 +51,8 @@ class FastVlmConfig(PreTrainedConfig):
             vision features. Only -1 supported.
         multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
             Whether to use bias in the multimodal projector.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -82,6 +84,7 @@ def __init__(
         vision_feature_select_strategy="full",
         vision_feature_layer=-1,
         multimodal_projector_bias=True,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         self.image_token_id = image_token_id
@@ -130,6 +133,13 @@ def __init__(
 
         self.text_config = text_config
         self.multimodal_projector_bias = multimodal_projector_bias
+        self.tie_word_embeddings = tie_word_embeddings
+
+        # The default value is `False` but this config is used with many model types
+        # Attr `tie_word_embeddings` was saved in text config for those models, so we
+        # need an ugly workaround and forward-pass the attr from text config
+        if not tie_word_embeddings and self.text_config.tie_word_embeddings:
+            self.tie_word_embeddings = self.text_config.tie_word_embeddings
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/fast_vlm/modular_fast_vlm.py b/src/transformers/models/fast_vlm/modular_fast_vlm.py
index 7f35ca4c1ee9..7346be927616 100644
--- a/src/transformers/models/fast_vlm/modular_fast_vlm.py
+++ b/src/transformers/models/fast_vlm/modular_fast_vlm.py
@@ -63,6 +63,8 @@ class FastVlmConfig(LlavaConfig):
             vision features. Only -1 supported.
         multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
             Whether to use bias in the multimodal projector.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -90,6 +92,7 @@ def __init__(
         vision_feature_select_strategy="full",
         vision_feature_layer=-1,
         multimodal_projector_bias=True,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         self.image_token_id = image_token_id
@@ -138,6 +141,13 @@ def __init__(
 
         self.text_config = text_config
         self.multimodal_projector_bias = multimodal_projector_bias
+        self.tie_word_embeddings = tie_word_embeddings
+
+        # The default value is `False` but this config is used with many model types
+        # Attr `tie_word_embeddings` was saved in text config for those models, so we
+        # need an ugly workaround and forward-pass the attr from text config
+        if not tie_word_embeddings and self.text_config.tie_word_embeddings:
+            self.tie_word_embeddings = self.text_config.tie_word_embeddings
 
         PreTrainedConfig.__init__(**kwargs)
 
diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py
index 853abcfec886..408184111afb 100644
--- a/src/transformers/models/flava/configuration_flava.py
+++ b/src/transformers/models/flava/configuration_flava.py
@@ -431,9 +431,8 @@ class FlavaConfig(PreTrainedConfig):
             Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses.
         return_loss (`bool`, *optional*, defaults to `True`):
             Whether to return loss or not
-
-        kwargs (*optional*):
-            Dictionary of keyword arguments.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie the model's input and output word embeddings.
 
     Example:
 
@@ -483,6 +482,7 @@ def __init__(
         global_backprop_contrastive: bool = True,
         skip_unmasked_multimodal_encoder: bool = True,
         return_loss: bool = True,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         # If `_config_dict` exist, we use them for the backward compatibility.
@@ -663,6 +663,7 @@ def __init__(
         self.global_backprop_contrastive = global_backprop_contrastive
         self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder
         self.return_loss = return_loss
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/florence2/configuration_florence2.py b/src/transformers/models/florence2/configuration_florence2.py
index 3ece88742a1a..226d17a476f5 100644
--- a/src/transformers/models/florence2/configuration_florence2.py
+++ b/src/transformers/models/florence2/configuration_florence2.py
@@ -153,6 +153,8 @@ class Florence2Config(PreTrainedConfig):
             The image token index to encode the image prompt.
         is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
             Whether the model is used as an encoder/decoder or not.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -187,6 +189,7 @@ def __init__(
         vision_config=None,
         image_token_id=51289,
         is_encoder_decoder=True,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         if isinstance(text_config, dict):
@@ -204,6 +207,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(
             is_encoder_decoder=is_encoder_decoder,
diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py
index 18170ede0e5c..055f7685803c 100644
--- a/src/transformers/models/florence2/modular_florence2.py
+++ b/src/transformers/models/florence2/modular_florence2.py
@@ -175,6 +175,8 @@ class Florence2Config(PreTrainedConfig):
             The image token index to encode the image prompt.
         is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
             Whether the model is used as an encoder/decoder or not.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -209,6 +211,7 @@ def __init__(
         vision_config=None,
         image_token_id=51289,
         is_encoder_decoder=True,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         if isinstance(text_config, dict):
@@ -226,6 +229,7 @@ def __init__(
         self.text_config = text_config
         self.vision_config = vision_config
         self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(
             is_encoder_decoder=is_encoder_decoder,
diff --git a/src/transformers/models/glm46v/configuration_glm46v.py b/src/transformers/models/glm46v/configuration_glm46v.py
index 7db51870c1c3..d7bd651e2779 100644
--- a/src/transformers/models/glm46v/configuration_glm46v.py
+++ b/src/transformers/models/glm46v/configuration_glm46v.py
@@ -50,6 +50,8 @@ class Glm46VConfig(PreTrainedConfig):
             The video start token index to encode the start of video.
         video_end_token_id (`int`, *optional*, defaults to 151362):
             The video end token index to encode the end of video.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
 
     ```python
     >>> from transformers import Glm46VForConditionalGeneration, Glm46VConfig
@@ -78,6 +80,7 @@ def __init__(
         image_end_token_id=151340,
         video_start_token_id=151361,
         video_end_token_id=151362,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -98,6 +101,7 @@ def __init__(
         self.video_end_token_id = video_end_token_id
         self.image_start_token_id = image_start_token_id
         self.image_end_token_id = image_end_token_id
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/glm46v/modular_glm46v.py b/src/transformers/models/glm46v/modular_glm46v.py
index d6a72bff671b..a52bd3411840 100644
--- a/src/transformers/models/glm46v/modular_glm46v.py
+++ b/src/transformers/models/glm46v/modular_glm46v.py
@@ -52,6 +52,8 @@ class Glm46VConfig(PreTrainedConfig):
             The video start token index to encode the start of video.
         video_end_token_id (`int`, *optional*, defaults to 151362):
             The video end token index to encode the end of video.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
 
     ```python
     >>> from transformers import Glm46VForConditionalGeneration, Glm46VConfig
@@ -80,6 +82,7 @@ def __init__(
         image_end_token_id=151340,
         video_start_token_id=151361,
         video_end_token_id=151362,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -100,6 +103,7 @@ def __init__(
         self.video_end_token_id = video_end_token_id
         self.image_start_token_id = image_start_token_id
         self.image_end_token_id = image_end_token_id
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/got_ocr2/configuration_got_ocr2.py b/src/transformers/models/got_ocr2/configuration_got_ocr2.py
index 4048ddaa4328..49384f5f45b4 100644
--- a/src/transformers/models/got_ocr2/configuration_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/configuration_got_ocr2.py
@@ -134,6 +134,9 @@ class GotOcr2Config(PreTrainedConfig):
             The image token index to encode the image prompt.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+
 
     ```python
     >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config
@@ -160,6 +163,7 @@ def __init__(
         text_config: dict | None = None,
         image_token_index: int | None = 151859,
         image_seq_length: int | None = 576,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         self.image_token_index = image_token_index
@@ -188,7 +192,7 @@ def __init__(
                 initializer_range=0.02,
                 rms_norm_eps=1e-6,
                 use_cache=True,
-                tie_word_embeddings=True,
+                tie_word_embeddings=tie_word_embeddings,
                 rope_theta=1000000.0,
                 rope_parameters=None,
                 use_sliding_window=False,
@@ -198,6 +202,7 @@ def __init__(
             )
 
         self.text_config = text_config
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py
index f50cd2bb03c4..2cd299fa4bc7 100644
--- a/src/transformers/models/got_ocr2/modular_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py
@@ -155,6 +155,9 @@ class GotOcr2Config(PreTrainedConfig):
             The image token index to encode the image prompt.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+
 
     ```python
     >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config
@@ -181,6 +184,7 @@ def __init__(
         text_config: dict | None = None,
         image_token_index: int | None = 151859,
         image_seq_length: int | None = 576,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         self.image_token_index = image_token_index
@@ -209,7 +213,7 @@ def __init__(
                 initializer_range=0.02,
                 rms_norm_eps=1e-6,
                 use_cache=True,
-                tie_word_embeddings=True,
+                tie_word_embeddings=tie_word_embeddings,
                 rope_theta=1000000.0,
                 rope_parameters=None,
                 use_sliding_window=False,
@@ -219,6 +223,7 @@ def __init__(
             )
 
         self.text_config = text_config
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py
index 557452843660..f043ea5d1a60 100644
--- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py
+++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py
@@ -111,6 +111,7 @@ def __init__(
         use_cache=True,
         bos_token_id=50256,
         eos_token_id=50256,
+        pad_token_id=None,
         attention_softmax_in_fp32=True,
         scale_attention_softmax_in_fp32=True,
         multi_query=True,
@@ -141,6 +142,7 @@ def __init__(
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
index c51fb8928bef..f42e8f06b126 100644
--- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
@@ -122,6 +122,7 @@ def __init__(
         use_cache: bool | None = True,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = False,
         use_parallel_residual: bool | None = True,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
@@ -132,6 +133,7 @@ def __init__(
         self.is_decoder = is_decoder
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py
index 3aac2b14f308..8a2a32aced52 100644
--- a/src/transformers/models/gptj/configuration_gptj.py
+++ b/src/transformers/models/gptj/configuration_gptj.py
@@ -102,6 +102,7 @@ def __init__(
         use_cache=True,
         bos_token_id=50256,
         eos_token_id=50256,
+        pad_token_id=None,
         tie_word_embeddings=False,
         **kwargs,
     ):
@@ -122,9 +123,7 @@ def __init__(
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/internvl/configuration_internvl.py b/src/transformers/models/internvl/configuration_internvl.py
index c0503bbbd715..40dcfe2d8035 100644
--- a/src/transformers/models/internvl/configuration_internvl.py
+++ b/src/transformers/models/internvl/configuration_internvl.py
@@ -167,6 +167,9 @@ class InternVLConfig(PreTrainedConfig):
         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Can be one of `"default"` or `"full"`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
+
 
     ```python
     >>> from transformers import InternVLForConditionalGeneration, InternVLConfig
@@ -194,6 +197,7 @@ def __init__(
         projector_hidden_act="gelu",
         vision_feature_layer=-1,
         vision_feature_select_strategy="default",
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.image_token_id = image_token_id
@@ -217,6 +221,7 @@ def __init__(
             text_config = CONFIG_MAPPING["qwen2"]()
 
         self.text_config = text_config
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py
index 2ea2c6c20136..b945ce846656 100644
--- a/src/transformers/models/jetmoe/configuration_jetmoe.py
+++ b/src/transformers/models/jetmoe/configuration_jetmoe.py
@@ -68,6 +68,8 @@ class JetMoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             The id of the "beginning-of-sequence" token.
         eos_token_id (`int`, *optional*, defaults to 2):
             The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether the model's input and output word embeddings should be tied.
         rope_parameters (`RopeParameters`, *optional*):
@@ -115,6 +117,7 @@ def __init__(
         use_cache: bool | None = True,
         bos_token_id: int | None = 1,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = True,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         rms_norm_eps: int | None = 1e-6,
@@ -143,11 +146,9 @@ def __init__(
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.rms_norm_eps = rms_norm_eps
         self.rope_parameters = rope_parameters
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
         self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py
index d141bbb0784b..b62047377d3f 100644
--- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py
@@ -160,6 +160,12 @@ def __init__(
 
         self.text_config = text_config
 
+        # The default value is `False` but this config is used with many model types
+        # Attr `tie_word_embeddings` was saved in text config for those models, so we
+        # need an ugly workaround and forward-pass the attr from text config
+        if not tie_word_embeddings and self.text_config.tie_word_embeddings:
+            self.tie_word_embeddings = self.text_config.tie_word_embeddings
+
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py
index d3670efec2d3..487834caae0c 100644
--- a/src/transformers/models/llava_next_video/modular_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py
@@ -179,6 +179,12 @@ def __init__(
 
         self.text_config = text_config
 
+        # The default value is `False` but this config is used with many model types
+        # Attr `tie_word_embeddings` was saved in text config for those models, so we
+        # need an ugly workaround and forward-pass the attr from text config
+        if not tie_word_embeddings and self.text_config.tie_word_embeddings:
+            self.tie_word_embeddings = self.text_config.tie_word_embeddings
+
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py
index b5c1a4c89593..4369b22933bd 100644
--- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py
@@ -188,6 +188,12 @@ def __init__(
 
         self.text_config = text_config
 
+        # The default value is `False` but this config is used with many model types
+        # Attr `tie_word_embeddings` was saved in text config for those models, so we
+        # need an ugly workaround and forward-pass the attr from text config
+        if not tie_word_embeddings and self.text_config.tie_word_embeddings:
+            self.tie_word_embeddings = self.text_config.tie_word_embeddings
+
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py
index 10ea8f69dde1..b4326ee1f904 100644
--- a/src/transformers/models/mamba/configuration_mamba.py
+++ b/src/transformers/models/mamba/configuration_mamba.py
@@ -80,7 +80,8 @@ class MambaConfig(PreTrainedConfig):
             Whether or not the cache should be used.
         use_mambapy (`bool`, *optional*, defaults to `False`):
             Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
-
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
 
     Example:
 
@@ -125,6 +126,7 @@ def __init__(
         rescale_prenorm_residual=False,
         use_cache=True,
         use_mambapy=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -152,6 +154,7 @@ def __init__(
         self.residual_in_fp32 = residual_in_fp32
         self.use_cache = use_cache
         self.use_mambapy = use_mambapy
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py
index 3a7c62c39e05..ad9cca14dba2 100644
--- a/src/transformers/models/mpt/configuration_mpt.py
+++ b/src/transformers/models/mpt/configuration_mpt.py
@@ -147,6 +147,12 @@ class MptConfig(PreTrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether to tie weight embeddings
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*):
+            The id of the beginning of sequence token.
+        eos_token_id (`int`, *optional*):
+            The id of the end of sequence token.
 
     Example:
 
@@ -193,6 +199,9 @@ def __init__(
         use_cache: bool = False,
         initializer_range=0.02,
         tie_word_embeddings=True,
+        pad_token_id=None,
+        bos_token_id=None,
+        eos_token_id=None,
         **kwargs,
     ):
         if attn_config is None:
@@ -219,6 +228,9 @@ def __init__(
         self.use_cache = use_cache
         self.initializer_range = initializer_range
         self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/ovis2/configuration_ovis2.py b/src/transformers/models/ovis2/configuration_ovis2.py
index 26b00abae66a..45306a433f03 100644
--- a/src/transformers/models/ovis2/configuration_ovis2.py
+++ b/src/transformers/models/ovis2/configuration_ovis2.py
@@ -129,6 +129,8 @@ class Ovis2Config(PreTrainedConfig):
             Vocabulary size of the text model.
         hidden_size (`int`, *optional*, defaults to 1536):
             Dimensionality of the encoder layers and the pooler layer.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
 
     ```python
     >>> from transformers import Ovis2ForConditionalGeneration, Ovis2Config
@@ -155,6 +157,7 @@ def __init__(
         visual_indicator_token_ids=[151666, 151667, 151668, 151669, 151670],
         vocab_size=151643,
         hidden_size=1536,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -175,6 +178,7 @@ def __init__(
         self.hidden_size = hidden_size
         self.image_token_id = image_token_id
         self.visual_indicator_token_ids = visual_indicator_token_ids
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 
diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py
index a8bf97887e45..f28c0ecfc076 100644
--- a/src/transformers/models/stablelm/configuration_stablelm.py
+++ b/src/transformers/models/stablelm/configuration_stablelm.py
@@ -87,6 +87,8 @@ class StableLmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
             The id of the `BOS` token in the vocabulary.
         eos_token_id (int, *optional*, defaults to 0):
             The id of the `EOS` token in the vocabulary.
+        pad_token_id (int, *optional*):
+            The id of the `PAD` token in the vocabulary.
 
     Example:
 
@@ -122,6 +124,7 @@ def __init__(
         attention_dropout: float | None = 0.0,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 0,
+        pad_token_id: int | None = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -147,6 +150,7 @@ def __init__(
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py
index 0beec764fda6..66c01d9efdc3 100644
--- a/src/transformers/models/tvp/configuration_tvp.py
+++ b/src/transformers/models/tvp/configuration_tvp.py
@@ -95,6 +95,8 @@ class TvpConfig(PreTrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability of attention layers.
+        pad_token_id (`int`, *optional*):
+            The id of a PAD token in the vocabulary.
     """
 
     model_type = "tvp"
@@ -128,6 +130,7 @@ def __init__(
         layer_norm_eps=1e-12,
         initializer_range=0.02,
         attention_probs_dropout_prob=0.1,
+        pad_token_id=None,
         **kwargs,
     ):
         if backbone_config is None and backbone is None:
@@ -172,6 +175,7 @@ def __init__(
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.pad_token_id = pad_token_id
 
         super().__init__(**kwargs)
 
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py
index f1cad9d6fc50..004cbc01e3e1 100644
--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -68,6 +68,7 @@ def __init__(
         num_feature_levels=4,
         encoder_n_points=2,
         decoder_n_points=6,
+        tie_word_embeddings=False,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -88,6 +89,7 @@ def __init__(
         self.num_feature_levels = num_feature_levels
         self.encoder_n_points = encoder_n_points
         self.decoder_n_points = decoder_n_points
+        self.tie_word_embeddings = tie_word_embeddings
 
         # we also set the expected seq length for both encoder and decoder
         self.encoder_seq_length = (
@@ -149,6 +151,9 @@ def get_config(self):
             backbone=None,
             backbone_config=resnet_config,
             use_pretrained_backbone=False,
+            # FIXME: cls attr `_tied_weights_keys` must not be modified in __init__
+            # Several models are affected, so for now leave it as is and fix it in a separate PR
+            tie_word_embeddings=self.tie_word_embeddings,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index fda76cf864ef..6711681ac98a 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -304,6 +304,10 @@ def test_model(self):
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
+    @unittest.skip("LayoutLM needs specific combination of config values and cannot run with defaults")
+    def test_model_forward_default_config_values(self):
+        pass
+
     def test_for_sequence_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 0b1177fe7f92..5cf50361495d 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -1501,6 +1501,62 @@ def recursive_check(batched_object, single_row_object, model_name, key):
                     model_row_output[key] = model_row_output[key][1:]
                 recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
 
+    def test_model_forward_default_config_values(
+        self,
+    ):
+        """
+        Tests that the model can run a forward pass when the config is initialized without common attributes.
+        We expect that these attributes have a default value and will not cause errors. See #41541
+        where the attributes were removed from `PreTrainedConfig` and moved to each model's config
+        class.
+        """
+        common_config_properties = [
+            "pad_token_id",
+            "eos_token_id",
+            "bos_token_id",
+            "sep_token_id",
+            "tie_word_embeddings",
+        ]
+        config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
+        batch_size = self.model_tester.batch_size
+
+        config_dict = config.to_diff_dict()
+        for common_config_property in common_config_properties:
+            config_dict.pop(common_config_property, None)
+            for subconfig_key in config.sub_configs:
+                subconfig = config_dict.get(subconfig_key, {})
+                if subconfig:
+                    subconfig.pop(common_config_property, None)
+        config = config.__class__(**config_dict)
+
+        # Set special tokens to `0` so they are guaranteed to be in vocab range
+        for special_token in ["pad_token_id", "eos_token_id", "bos_token_id", "sep_token_id"]:
+            if hasattr(config, special_token):
+                setattr(config, special_token, 0)
+            for subconfig_key in config.sub_configs:
+                subconfig = getattr(config, subconfig_key, None)
+                if subconfig and hasattr(subconfig, special_token):
+                    setattr(subconfig, special_token, 0)
+
+        for model_class in self.all_model_classes:
+            if model_class.__name__ not in [
+                *get_values(MODEL_MAPPING_NAMES),
+            ]:
+                continue
+
+            model = model_class(copy.deepcopy(config)).to(torch_device).eval()
+            single_batch_input = {}
+            for key, value in batched_input.items():
+                if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
+                    # e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size
+                    single_batch_shape = value.shape[0] // batch_size
+                    single_batch_input[key] = value[:single_batch_shape]
+                else:
+                    single_batch_input[key] = value
+
+            with torch.no_grad():
+                model(**single_batch_input)
+
     def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
         if not self.model_tester.is_training:
             self.skipTest(reason="ModelTester is not configured to run training tests")