diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index fbda54d2f11c..206857d20a38 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -185,10 +185,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 0989822b4fbe..ca3a8e14af89 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -209,10 +209,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py index 911a25266105..6bfa27011cce 100644 --- a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py @@ -37,6 +37,8 @@ class Cohere2VisionConfig(PreTrainedConfig): The token ID to use as placeholder for the image input. alignment_intermediate_size (`int`, *optional*, defaults to 36864): The size of the intermediate layer for alignment. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings """ model_type = "cohere2_vision" @@ -49,6 +51,7 @@ def __init__( downsample_factor=2, image_token_id=255036, alignment_intermediate_size=36864, + tie_word_embeddings=True, **kwargs, ): self.downsample_factor = downsample_factor @@ -73,9 +76,10 @@ def __init__( text_config["model_type"] = text_config.get("model_type", "cohere2") text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) elif text_config is None: - text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True) + text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=tie_word_embeddings) self.text_config = text_config + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py index 24f7690fd764..2372bf321c7d 100644 --- a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py @@ -44,6 +44,8 @@ class DeepseekVLConfig(PreTrainedConfig): The config object or dictionary of the vision backbone. image_token_id (`int`, *optional*, defaults to 100015): The index representing image tokens in the model's token vocabulary. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -68,6 +70,7 @@ def __init__( text_config: AutoConfig | None = None, vision_config: AutoConfig | None = None, image_token_id: int = 100015, + tie_word_embeddings: bool | None = True, **kwargs, ): if text_config is None: @@ -89,6 +92,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py index 23c3ed23c0d3..90f447090a23 100644 --- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -55,6 +55,8 @@ class DeepseekVLConfig(PreTrainedConfig): The config object or dictionary of the vision backbone. image_token_id (`int`, *optional*, defaults to 100015): The index representing image tokens in the model's token vocabulary. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -79,6 +81,7 @@ def __init__( text_config: AutoConfig | None = None, vision_config: AutoConfig | None = None, image_token_id: int = 100015, + tie_word_embeddings: bool | None = True, **kwargs, ): if text_config is None: @@ -100,6 +103,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py index 35b3dc75166e..cee5d6fe3280 100644 --- a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py @@ -45,6 +45,8 @@ class DeepseekVLHybridConfig(PreTrainedConfig): The config object or dictionary of the high resolution vision backbone. image_token_id (`int`, *optional*, defaults to 100015): The index representing image tokens in the model's token vocabulary. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -70,6 +72,7 @@ def __init__( vision_config: AutoConfig | None = None, high_res_vision_config: AutoConfig | None = None, image_token_id: int = 100015, + tie_word_embeddings: bool | None = True, **kwargs, ): if high_res_vision_config is None: @@ -100,6 +103,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 8488833e6aaf..4b027a7cc421 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -101,6 +101,8 @@ class DeepseekVLHybridConfig(DeepseekVLConfig): The config object or dictionary of the high resolution vision backbone. image_token_id (`int`, *optional*, defaults to 100015): The index representing image tokens in the model's token vocabulary. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -126,6 +128,7 @@ def __init__( vision_config: AutoConfig | None = None, high_res_vision_config: AutoConfig | None = None, image_token_id: int = 100015, + tie_word_embeddings: bool | None = True, **kwargs, ): if high_res_vision_config is None: @@ -142,6 +145,7 @@ def __init__( text_config=text_config, vision_config=vision_config, image_token_id=image_token_id, + tie_word_embeddings=tie_word_embeddings, **kwargs, ) diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 46c9ac9a2a49..045068ec702d 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -126,6 +126,8 @@ class DeformableDetrConfig(PreTrainedConfig): disable_custom_kernels (`bool`, *optional*, defaults to `False`): Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom kernels are not supported by PyTorch ONNX export. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Examples: @@ -194,6 +196,7 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, + tie_word_embeddings=True, **kwargs, ): # We default to values which were previously hard-coded in the model. 
This enables configurability of the config @@ -268,6 +271,7 @@ def __init__( self.eos_coefficient = eos_coefficient self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels + self.tie_word_embeddings = tie_word_embeddings super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 25e6d14be0d9..9fd071e82781 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -166,6 +166,8 @@ class Emu3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings ```python @@ -206,6 +208,7 @@ def __init__( attention_bias=False, attention_dropout: float = 0.1, initializer_range: float = 0.02, + tie_word_embeddings: bool | None = False, **kwargs, ): self.vocab_size = vocab_size @@ -227,6 +230,7 @@ def __init__( self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index f6bf32726736..0d464c8c8f04 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -67,6 +67,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): Beginning of stream token id. eos_token_id (`int`, *optional*, defaults to 2): End of stream token id. + pad_token_id (`int`, *optional*): + The id of the padding token. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_parameters (`RopeParameters`, *optional*): @@ -139,6 +141,7 @@ def __init__( use_cache: bool | None = True, bos_token_id: int | None = 0, eos_token_id: int | None = 2, + pad_token_id: int | None = None, tie_word_embeddings: bool | None = False, rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, attention_dropout: float | None = 0.0, @@ -163,6 +166,7 @@ def __init__( self.sliding_window_pattern = sliding_window_pattern self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id self.tie_word_embeddings = tie_word_embeddings self.layer_types = layer_types diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index a1746024b5f3..95f6e2128c94 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -101,6 +101,8 @@ class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): Beginning of stream token id. eos_token_id (`int`, *optional*, defaults to 2): End of stream token id. + pad_token_id (`int`, *optional*): + The id of the padding token. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_parameters (`RopeParameters`, *optional*): @@ -173,6 +175,7 @@ def __init__( use_cache: bool | None = True, bos_token_id: int | None = 0, eos_token_id: int | None = 2, + pad_token_id: int | None = None, tie_word_embeddings: bool | None = False, rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, attention_dropout: float | None = 0.0, @@ -197,6 +200,7 @@ def __init__( self.sliding_window_pattern = sliding_window_pattern self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id self.tie_word_embeddings = tie_word_embeddings self.layer_types = layer_types diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py index ce0eac49c72e..efa14928b164 100644 --- a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py @@ -84,6 +84,8 @@ class FalconMambaConfig(PreTrainedConfig): Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. mixer_rms_eps (`float`, *optional*, defaults to 1e-06): The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -130,6 +132,7 @@ def __init__( use_cache=True, use_falcon_mambapy=False, mixer_rms_eps=1e-6, + tie_word_embeddings=True, **kwargs, ): self.vocab_size = vocab_size @@ -162,6 +165,7 @@ def __init__( self.residual_in_fp32 = residual_in_fp32 self.use_cache = use_cache self.use_falcon_mambapy = use_falcon_mambapy + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) self.mixer_rms_eps = mixer_rms_eps diff --git a/src/transformers/models/falcon_mamba/modular_falcon_mamba.py b/src/transformers/models/falcon_mamba/modular_falcon_mamba.py index a9026c81c767..67fa0c9ab509 100644 --- a/src/transformers/models/falcon_mamba/modular_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modular_falcon_mamba.py @@ -110,6 +110,8 @@ class FalconMambaConfig(MambaConfig): Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. mixer_rms_eps (`float`, *optional*, defaults to 1e-06): The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -154,6 +156,7 @@ def __init__( use_cache=True, use_falcon_mambapy=False, mixer_rms_eps=1e-6, + tie_word_embeddings=True, **kwargs, ): super().__init__( @@ -181,6 +184,7 @@ def __init__( rescale_prenorm_residual=rescale_prenorm_residual, use_cache=use_cache, use_falcon_mambapy=use_falcon_mambapy, + tie_word_embeddings=tie_word_embeddings, **kwargs, ) self.mixer_rms_eps = mixer_rms_eps diff --git a/src/transformers/models/fast_vlm/configuration_fast_vlm.py b/src/transformers/models/fast_vlm/configuration_fast_vlm.py index 46e5a6ccbf76..925b4eaa5c71 100644 --- a/src/transformers/models/fast_vlm/configuration_fast_vlm.py +++ b/src/transformers/models/fast_vlm/configuration_fast_vlm.py @@ -51,6 +51,8 @@ class FastVlmConfig(PreTrainedConfig): vision features. Only -1 supported. multimodal_projector_bias (`bool`, *optional*, defaults to `True`): Whether to use bias in the multimodal projector. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings Example: @@ -82,6 +84,7 @@ def __init__( vision_feature_select_strategy="full", vision_feature_layer=-1, multimodal_projector_bias=True, + tie_word_embeddings=False, **kwargs, ): self.image_token_id = image_token_id @@ -130,6 +133,13 @@ def __init__( self.text_config = text_config self.multimodal_projector_bias = multimodal_projector_bias + self.tie_word_embeddings = tie_word_embeddings + + # The default value is `False` but this config is used with many model types + # Attr `tie_word_embeddings` was saved in text config for those models, so we + # need an ugly workaround and forward-pass the attr from text config + if not tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = self.text_config.tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/fast_vlm/modular_fast_vlm.py b/src/transformers/models/fast_vlm/modular_fast_vlm.py index 7f35ca4c1ee9..7346be927616 100644 --- a/src/transformers/models/fast_vlm/modular_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modular_fast_vlm.py @@ -63,6 +63,8 @@ class FastVlmConfig(LlavaConfig): vision features. Only -1 supported. multimodal_projector_bias (`bool`, *optional*, defaults to `True`): Whether to use bias in the multimodal projector. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings Example: @@ -90,6 +92,7 @@ def __init__( vision_feature_select_strategy="full", vision_feature_layer=-1, multimodal_projector_bias=True, + tie_word_embeddings=False, **kwargs, ): self.image_token_id = image_token_id @@ -138,6 +141,13 @@ def __init__( self.text_config = text_config self.multimodal_projector_bias = multimodal_projector_bias + self.tie_word_embeddings = tie_word_embeddings + + # The default value is `False` but this config is used with many model types + # Attr `tie_word_embeddings` was saved in text config for those models, so we + # need an ugly workaround and forward-pass the attr from text config + if not tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = self.text_config.tie_word_embeddings PreTrainedConfig.__init__(**kwargs) diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index 853abcfec886..408184111afb 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -431,9 +431,8 @@ class FlavaConfig(PreTrainedConfig): Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses. return_loss (`bool`, *optional*, defaults to `True`): Whether to return loss or not - - kwargs (*optional*): - Dictionary of keyword arguments. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -483,6 +482,7 @@ def __init__( global_backprop_contrastive: bool = True, skip_unmasked_multimodal_encoder: bool = True, return_loss: bool = True, + tie_word_embeddings: bool | None = True, **kwargs, ): # If `_config_dict` exist, we use them for the backward compatibility. 
@@ -663,6 +663,7 @@ def __init__( self.global_backprop_contrastive = global_backprop_contrastive self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder self.return_loss = return_loss + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/florence2/configuration_florence2.py b/src/transformers/models/florence2/configuration_florence2.py index 3ece88742a1a..226d17a476f5 100644 --- a/src/transformers/models/florence2/configuration_florence2.py +++ b/src/transformers/models/florence2/configuration_florence2.py @@ -153,6 +153,8 @@ class Florence2Config(PreTrainedConfig): The image token index to encode the image prompt. is_encoder_decoder (bool, optional, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -187,6 +189,7 @@ def __init__( vision_config=None, image_token_id=51289, is_encoder_decoder=True, + tie_word_embeddings=True, **kwargs, ): if isinstance(text_config, dict): @@ -204,6 +207,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__( is_encoder_decoder=is_encoder_decoder, diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index 18170ede0e5c..055f7685803c 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -175,6 +175,8 @@ class Florence2Config(PreTrainedConfig): The image token index to encode the image prompt. is_encoder_decoder (bool, optional, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -209,6 +211,7 @@ def __init__( vision_config=None, image_token_id=51289, is_encoder_decoder=True, + tie_word_embeddings=True, **kwargs, ): if isinstance(text_config, dict): @@ -226,6 +229,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__( is_encoder_decoder=is_encoder_decoder, diff --git a/src/transformers/models/glm46v/configuration_glm46v.py b/src/transformers/models/glm46v/configuration_glm46v.py index 7db51870c1c3..d7bd651e2779 100644 --- a/src/transformers/models/glm46v/configuration_glm46v.py +++ b/src/transformers/models/glm46v/configuration_glm46v.py @@ -50,6 +50,8 @@ class Glm46VConfig(PreTrainedConfig): The video start token index to encode the start of video. video_end_token_id (`int`, *optional*, defaults to 151362): The video end token index to encode the end of video. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings ```python >>> from transformers import Glm46VForConditionalGeneration, Glm46VConfig @@ -78,6 +80,7 @@ def __init__( image_end_token_id=151340, video_start_token_id=151361, video_end_token_id=151362, + tie_word_embeddings=False, **kwargs, ): if isinstance(vision_config, dict): @@ -98,6 +101,7 @@ def __init__( self.video_end_token_id = video_end_token_id self.image_start_token_id = image_start_token_id self.image_end_token_id = image_end_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/glm46v/modular_glm46v.py b/src/transformers/models/glm46v/modular_glm46v.py index d6a72bff671b..a52bd3411840 100644 --- a/src/transformers/models/glm46v/modular_glm46v.py +++ b/src/transformers/models/glm46v/modular_glm46v.py @@ -52,6 +52,8 @@ class Glm46VConfig(PreTrainedConfig): The video start token index to encode the start of video. video_end_token_id (`int`, *optional*, defaults to 151362): The video end token index to encode the end of video. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings ```python >>> from transformers import Glm46VForConditionalGeneration, Glm46VConfig @@ -80,6 +82,7 @@ def __init__( image_end_token_id=151340, video_start_token_id=151361, video_end_token_id=151362, + tie_word_embeddings=False, **kwargs, ): if isinstance(vision_config, dict): @@ -100,6 +103,7 @@ def __init__( self.video_end_token_id = video_end_token_id self.image_start_token_id = image_start_token_id self.image_end_token_id = image_end_token_id + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/got_ocr2/configuration_got_ocr2.py b/src/transformers/models/got_ocr2/configuration_got_ocr2.py index 4048ddaa4328..49384f5f45b4 100644 --- a/src/transformers/models/got_ocr2/configuration_got_ocr2.py +++ b/src/transformers/models/got_ocr2/configuration_got_ocr2.py @@ -134,6 +134,9 @@ class GotOcr2Config(PreTrainedConfig): The image token index to encode the image prompt. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + ```python >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config @@ -160,6 +163,7 @@ def __init__( text_config: dict | None = None, image_token_index: int | None = 151859, image_seq_length: int | None = 576, + tie_word_embeddings: bool | None = True, **kwargs, ): self.image_token_index = image_token_index @@ -188,7 +192,7 @@ def __init__( initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, - tie_word_embeddings=True, + tie_word_embeddings=tie_word_embeddings, rope_theta=1000000.0, rope_parameters=None, use_sliding_window=False, @@ -198,6 +202,7 @@ def __init__( ) self.text_config = text_config + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index f50cd2bb03c4..2cd299fa4bc7 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -155,6 +155,9 @@ class GotOcr2Config(PreTrainedConfig): The image token index to encode the image prompt. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + ```python >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config @@ -181,6 +184,7 @@ def __init__( text_config: dict | None = None, image_token_index: int | None = 151859, image_seq_length: int | None = 576, + tie_word_embeddings: bool | None = True, **kwargs, ): self.image_token_index = image_token_index @@ -209,7 +213,7 @@ def __init__( initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, - tie_word_embeddings=True, + tie_word_embeddings=tie_word_embeddings, rope_theta=1000000.0, rope_parameters=None, use_sliding_window=False, @@ -219,6 +223,7 @@ def __init__( ) self.text_config = text_config + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 557452843660..f043ea5d1a60 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -111,6 +111,7 @@ def __init__( use_cache=True, bos_token_id=50256, eos_token_id=50256, + pad_token_id=None, attention_softmax_in_fp32=True, scale_attention_softmax_in_fp32=True, multi_query=True, @@ -141,6 +142,7 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id super().__init__(**kwargs) diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index c51fb8928bef..f42e8f06b126 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -122,6 +122,7 @@ def __init__( use_cache: bool | None = True, bos_token_id: int | None = 0, eos_token_id: int | None = 2, + pad_token_id: int | None = None, tie_word_embeddings: bool | None = False, 
use_parallel_residual: bool | None = True, rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, @@ -132,6 +133,7 @@ def __init__( self.is_decoder = is_decoder self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py index 3aac2b14f308..8a2a32aced52 100644 --- a/src/transformers/models/gptj/configuration_gptj.py +++ b/src/transformers/models/gptj/configuration_gptj.py @@ -102,6 +102,7 @@ def __init__( use_cache=True, bos_token_id=50256, eos_token_id=50256, + pad_token_id=None, tie_word_embeddings=False, **kwargs, ): @@ -122,9 +123,7 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/internvl/configuration_internvl.py b/src/transformers/models/internvl/configuration_internvl.py index c0503bbbd715..40dcfe2d8035 100644 --- a/src/transformers/models/internvl/configuration_internvl.py +++ b/src/transformers/models/internvl/configuration_internvl.py @@ -167,6 +167,9 @@ class InternVLConfig(PreTrainedConfig): vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + ```python >>> from transformers import InternVLForConditionalGeneration, InternVLConfig @@ -194,6 +197,7 @@ def __init__( projector_hidden_act="gelu", vision_feature_layer=-1, vision_feature_select_strategy="default", + tie_word_embeddings=True, **kwargs, ): self.image_token_id = image_token_id @@ -217,6 +221,7 @@ def __init__( text_config = CONFIG_MAPPING["qwen2"]() self.text_config = text_config + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 2ea2c6c20136..b945ce846656 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -68,6 +68,8 @@ class JetMoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): The id of the "beginning-of-sequence" token. eos_token_id (`int`, *optional*, defaults to 2): The id of the "end-of-sequence" token. + pad_token_id (`int`, *optional*): + The id of the padding token. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether the model's input and output word embeddings should be tied. 
rope_parameters (`RopeParameters`, *optional*): @@ -115,6 +117,7 @@ def __init__( use_cache: bool | None = True, bos_token_id: int | None = 1, eos_token_id: int | None = 2, + pad_token_id: int | None = None, tie_word_embeddings: bool | None = True, rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, rms_norm_eps: int | None = 1e-6, @@ -143,11 +146,9 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id self.rms_norm_eps = rms_norm_eps self.rope_parameters = rope_parameters - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index d141bbb0784b..b62047377d3f 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -160,6 +160,12 @@ def __init__( self.text_config = text_config + # The default value is `False` but this config is used with many model types + # Attr `tie_word_embeddings` was saved in text config for those models, so we + # need an ugly workaround and forward-pass the attr from text config + if not tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = self.text_config.tie_word_embeddings + super().__init__(**kwargs) diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index d3670efec2d3..487834caae0c 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -179,6 +179,12 @@ def __init__( self.text_config = text_config + # The default value is `False` but this config 
is used with many model types + # Attr `tie_word_embeddings` was saved in text config for those models, so we + # need an ugly workaround and forward-pass the attr from text config + if not tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = self.text_config.tie_word_embeddings + super().__init__(**kwargs) diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index b5c1a4c89593..4369b22933bd 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -188,6 +188,12 @@ def __init__( self.text_config = text_config + # The default value is `False` but this config is used with many model types + # Attr `tie_word_embeddings` was saved in text config for those models, so we + # need an ugly workaround and forward-pass the attr from text config + if not tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = self.text_config.tie_word_embeddings + super().__init__(**kwargs) diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py index 10ea8f69dde1..b4326ee1f904 100644 --- a/src/transformers/models/mamba/configuration_mamba.py +++ b/src/transformers/models/mamba/configuration_mamba.py @@ -80,7 +80,8 @@ class MambaConfig(PreTrainedConfig): Whether or not the cache should be used. use_mambapy (`bool`, *optional*, defaults to `False`): Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. 
- + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -125,6 +126,7 @@ def __init__( rescale_prenorm_residual=False, use_cache=True, use_mambapy=False, + tie_word_embeddings=True, **kwargs, ): self.vocab_size = vocab_size @@ -152,6 +154,7 @@ def __init__( self.residual_in_fp32 = residual_in_fp32 self.use_cache = use_cache self.use_mambapy = use_mambapy + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index 3a7c62c39e05..ad9cca14dba2 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -147,6 +147,12 @@ class MptConfig(PreTrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*): + The id of the beginning of sequence token. + eos_token_id (`int`, *optional*): + The id of the end of sequence token. 
Example: @@ -193,6 +199,9 @@ def __init__( use_cache: bool = False, initializer_range=0.02, tie_word_embeddings=True, + pad_token_id=None, + bos_token_id=None, + eos_token_id=None, **kwargs, ): if attn_config is None: @@ -219,6 +228,9 @@ def __init__( self.use_cache = use_cache self.initializer_range = initializer_range self.tie_word_embeddings = tie_word_embeddings + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id super().__init__(**kwargs) diff --git a/src/transformers/models/ovis2/configuration_ovis2.py b/src/transformers/models/ovis2/configuration_ovis2.py index 26b00abae66a..45306a433f03 100644 --- a/src/transformers/models/ovis2/configuration_ovis2.py +++ b/src/transformers/models/ovis2/configuration_ovis2.py @@ -129,6 +129,8 @@ class Ovis2Config(PreTrainedConfig): Vocabulary size of the text model. hidden_size (`int`, *optional*, defaults to 1536): Dimensionality of the encoder layers and the pooler layer. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings ```python >>> from transformers import Ovis2ForConditionalGeneration, Ovis2Config @@ -155,6 +157,7 @@ def __init__( visual_indicator_token_ids=[151666, 151667, 151668, 151669, 151670], vocab_size=151643, hidden_size=1536, + tie_word_embeddings=True, **kwargs, ): if isinstance(vision_config, dict): @@ -175,6 +178,7 @@ def __init__( self.hidden_size = hidden_size self.image_token_id = image_token_id self.visual_indicator_token_ids = visual_indicator_token_ids + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index a8bf97887e45..f28c0ecfc076 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -87,6 +87,8 @@ class StableLmConfig(PreTrainedConfig, 
RotaryEmbeddingConfigMixin): The id of the `BOS` token in the vocabulary. eos_token_id (int, *optional*, defaults to 0): The id of the `EOS` token in the vocabulary. + pad_token_id (int, *optional*): + The id of the `PAD` token in the vocabulary. Example: @@ -122,6 +124,7 @@ def __init__( attention_dropout: float | None = 0.0, bos_token_id: int | None = 0, eos_token_id: int | None = 0, + pad_token_id: int | None = None, **kwargs, ): self.vocab_size = vocab_size @@ -147,6 +150,7 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 0beec764fda6..66c01d9efdc3 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -95,6 +95,8 @@ class TvpConfig(PreTrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probability of attention layers. + pad_token_id (`int`, *optional*): + The id of a PAD token in the vocabulary.
""" model_type = "tvp" @@ -128,6 +130,7 @@ def __init__( layer_norm_eps=1e-12, initializer_range=0.02, attention_probs_dropout_prob=0.1, + pad_token_id=None, **kwargs, ): if backbone_config is None and backbone is None: @@ -172,6 +175,7 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.pad_token_id = pad_token_id super().__init__(**kwargs) diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index f1cad9d6fc50..004cbc01e3e1 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -68,6 +68,7 @@ def __init__( num_feature_levels=4, encoder_n_points=2, decoder_n_points=6, + tie_word_embeddings=False, ): self.parent = parent self.batch_size = batch_size @@ -88,6 +89,7 @@ def __init__( self.num_feature_levels = num_feature_levels self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points + self.tie_word_embeddings = tie_word_embeddings # we also set the expected seq length for both encoder and decoder self.encoder_seq_length = ( @@ -149,6 +151,9 @@ def get_config(self): backbone=None, backbone_config=resnet_config, use_pretrained_backbone=False, + # FIXME: cls attr `_tied_weights_keys` must not be modified in __init__ + # Several models affected so for now just let it be and fix in separate PR + tie_word_embeddings=self.tie_word_embeddings, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index fda76cf864ef..6711681ac98a 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -304,6 +304,10 @@ def test_model(self): def test_multi_gpu_data_parallel_forward(self): pass +
@unittest.skip("LayoutLM needs specific combination of config values and cannot run with defaults") + def test_model_forward_default_config_values(self): + pass + def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 0b1177fe7f92..5cf50361495d 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1501,6 +1501,62 @@ def recursive_check(batched_object, single_row_object, model_name, key): model_row_output[key] = model_row_output[key][1:] recursive_check(model_batched_output[key], model_row_output[key], model_name, key) + def test_model_forward_default_config_values( + self, + ): + """ + Tests that the model can run a forward pass when config is initialized without common attributes. + We expect that these attributes have a default value and will not cause errors. See #41541 + where the attributes were removed from `PreTrainedConfig` and moved to each model's config + class.
+ """ + common_config_properties = [ + "pad_token_id", + "eos_token_id", + "bos_token_id", + "sep_token_id", + "tie_word_embeddings", + ] + config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() + batch_size = self.model_tester.batch_size + + config_dict = config.to_diff_dict() + for common_config_property in common_config_properties: + config_dict.pop(common_config_property, None) + for subconfig_key in config.sub_configs: + subconfig = config_dict.get(subconfig_key, {}) + if subconfig: + subconfig.pop(common_config_property, None) + config = config.__class__(**config_dict) + + # Set special tokens to `0` so it is guaranteed to be in vocab range + for special_token in ["pad_token_id", "eos_token_id", "bos_token_id", "sep_token_id"]: + if hasattr(config, special_token): + setattr(config, special_token, 0) + for subconfig_key in config.sub_configs: + subconfig = getattr(config, subconfig_key, None) + if subconfig and hasattr(subconfig, special_token): + setattr(subconfig, special_token, 0) + + for model_class in self.all_model_classes: + if model_class.__name__ not in [ + *get_values(MODEL_MAPPING_NAMES), + ]: + continue + + model = model_class(copy.deepcopy(config)).to(torch_device).eval() + single_batch_input = {} + for key, value in batched_input.items(): + if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0: + # e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size + single_batch_shape = value.shape[0] // batch_size + single_batch_input[key] = value[:single_batch_shape] + else: + single_batch_input[key] = value + + with torch.no_grad(): + model(**single_batch_input) + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: self.skipTest(reason="ModelTester is not configured to run training tests")