diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index c0170f45ac43..a399f6d8f00d 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -1069,7 +1069,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( @auto_docstring class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedModel, GenerationMixin): - _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} + _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.embed_tokens.weight"} _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} _keep_in_fp32_modules_strict = ["codec_model"] diff --git a/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py index 790482c33727..31abf2408d12 100644 --- a/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py @@ -251,6 +251,7 @@ def __init__(self, config): class KyutaiSpeechToTextForConditionalGeneration(LlamaForCausalLM, GenerationMixin): + _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.embed_tokens.weight"} _keep_in_fp32_modules_strict = ["codec_model"] output_modalities = ("audio", "text") diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 7b82b5ac5b89..32ea6e7a14af 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -54,8 +54,6 @@ class LlavaNextVideoConfig(PreTrainedConfig): image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. video_token_index (`int`, *optional*, defaults to 32000): The video token index to encode the image prompt. spatial_pool_mode (`str`, *optional*, defaults to `"average"`): @@ -103,7 +101,6 @@ def __init__( vision_feature_select_strategy="default", vision_feature_layer=-2, image_grid_pinpoints=None, - tie_word_embeddings=False, video_token_index=32000, spatial_pool_mode="average", spatial_pool_stride=2, @@ -160,7 +157,13 @@ def __init__( self.text_config = text_config - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) + + # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even + # though it concerns the main model, while it was set to False by default in the main model... So we hardcode a fix here + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = True + self.text_config.tie_word_embeddings = False __all__ = ["LlavaNextVideoConfig"] diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 4fec99df3b30..61b4b5fdf920 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -72,8 +72,6 @@ class LlavaNextVideoConfig(PreTrainedConfig): image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. video_token_index (`int`, *optional*, defaults to 32000): The video token index to encode the image prompt. spatial_pool_mode (`str`, *optional*, defaults to `"average"`): @@ -121,7 +119,6 @@ def __init__( vision_feature_select_strategy="default", vision_feature_layer=-2, image_grid_pinpoints=None, - tie_word_embeddings=False, video_token_index=32000, spatial_pool_mode="average", spatial_pool_stride=2, @@ -178,7 +175,13 @@ def __init__( self.text_config = text_config - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) + + # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even + # though it concerns the main model, while it was set to False by default in the main model... So we hardcode a fix here + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = True + self.text_config.tie_word_embeddings = False class LlavaNextVideoModelOutputWithPast(LlavaNextModelOutputWithPast): diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 9fd1e850f0e5..b72c2235ed7e 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -58,8 +58,6 @@ class LlavaOnevisionConfig(PreTrainedConfig): image_grid_pinpoints (`List`, *optional*): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. multimodal_projector_bias (`bool`, *optional*, defaults to `True`): Whether to use bias in the multimodal projector. @@ -102,7 +100,6 @@ def __init__( vision_feature_layer=-1, vision_aspect_ratio="anyres_max_9", image_grid_pinpoints=None, - tie_word_embeddings=False, multimodal_projector_bias=True, **kwargs, ): @@ -188,7 +185,13 @@ def __init__( self.text_config = text_config - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) + + # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even + # though it concerns the main model, while it was set to False by default in the main model... So we hardcode a fix here + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = True + self.text_config.tie_word_embeddings = False __all__ = ["LlavaOnevisionConfig"]