diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index b3cb6ebd1be4..81854dda1b74 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -594,7 +594,6 @@ def set_param_for_module( missing_keys.discard(target_name) if ref is not None and ref.shape != param_value.shape and hf_quantizer is None: mismatch_keys.add((target_name, param_value.shape, ref.shape)) - module_obj.param_name._is_hf_initialized = False # Needs to be initialized else: # super important otherwise _init_weight will re-init the param param_value._is_hf_initialized = True diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 8d3ae310687e..7974fa4dd173 100644 --- a/src/transformers/integrations/hub_kernels.py +++ b/src/transformers/integrations/hub_kernels.py @@ -341,6 +341,12 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _ mapping[kernel_name] = kernel except FileNotFoundError: mapping[kernel_name] = None + except AssertionError as error: + logger.warning_once( + f"Failed to load the '{kernel_name}' kernel from '{repo_id}' because the current environment does not " + f"support the required backend: {error}" + ) + mapping[kernel_name] = None else: # Try to import is_{kernel_name}_available from ..utils diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index a1075016c3f4..fc68464f252f 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -194,6 +194,7 @@ def __init__( bos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, num_hidden_layers=encoder_layers, + tie_word_embeddings=tie_word_embeddings, ) if "decoder" in common_kwargs: del common_kwargs["decoder"] diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py 
b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index c0170f45ac43..a399f6d8f00d 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -1069,7 +1069,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( @auto_docstring class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedModel, GenerationMixin): - _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} + _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.embed_tokens.weight"} _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} _keep_in_fp32_modules_strict = ["codec_model"] diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 7b82b5ac5b89..86dec22c8e5b 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -103,7 +103,6 @@ def __init__( vision_feature_select_strategy="default", vision_feature_layer=-2, image_grid_pinpoints=None, - tie_word_embeddings=False, video_token_index=32000, spatial_pool_mode="average", spatial_pool_stride=2, @@ -160,7 +159,13 @@ def __init__( self.text_config = text_config - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) + + # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even + # though it concerns the main model, while it was set to False by default in the main model... 
So we hardcode a fix here + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = True + self.text_config.tie_word_embeddings = False __all__ = ["LlavaNextVideoConfig"] diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 9fd1e850f0e5..cb957a992216 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -102,7 +102,6 @@ def __init__( vision_feature_layer=-1, vision_aspect_ratio="anyres_max_9", image_grid_pinpoints=None, - tie_word_embeddings=False, multimodal_projector_bias=True, **kwargs, ): @@ -188,7 +187,13 @@ def __init__( self.text_config = text_config - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) + + # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even + # though it concerns the main model, while it was set to False by default in the main model... 
So we hardcode a fix here + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = True + self.text_config.tie_word_embeddings = False __all__ = ["LlavaOnevisionConfig"] diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 76c951668f46..bff720e5cfe2 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -221,5 +221,11 @@ def __init__(self, text_encoder, audio_encoder, decoder, **kwargs): def sampling_rate(self): return self.audio_encoder.sampling_rate + # Override: calling the inherited `get_text_config` with neither flag set reportedly crashes for this config (not fully confirmed), so default to `decoder=True` in that case + def get_text_config(self, decoder=None, encoder=None): + if decoder is None and encoder is None: + decoder = True + return super().get_text_config(decoder=decoder, encoder=encoder) + __all__ = ["MusicgenConfig", "MusicgenDecoderConfig"] diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index a4ec8528590a..af184b241408 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -234,5 +234,11 @@ def __init__( def sampling_rate(self): return self.audio_encoder.sampling_rate + # Override: calling the inherited `get_text_config` with neither flag set reportedly crashes for this config (not fully confirmed), so default to `decoder=True` in that case + def get_text_config(self, decoder=None, encoder=None): + if decoder is None and encoder is None: + decoder = True + return super().get_text_config(decoder=decoder, encoder=encoder) + __all__ = ["MusicgenMelodyConfig", "MusicgenMelodyDecoderConfig"] diff --git a/tests/models/fsmt/test_modeling_fsmt.py b/tests/models/fsmt/test_modeling_fsmt.py index acc29cac7ec0..d14c6b3225a2 100644 --- a/tests/models/fsmt/test_modeling_fsmt.py +++ b/tests/models/fsmt/test_modeling_fsmt.py @@ -125,6
+125,7 @@ def get_config(self): eos_token_id=self.eos_token_id, bos_token_id=self.bos_token_id, pad_token_id=self.pad_token_id, + tie_word_embeddings=True, ) def prepare_config_and_inputs_for_common(self): @@ -254,6 +255,7 @@ def test_ensure_weights_are_shared(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() config.tie_word_embeddings = True + config.decoder.tie_word_embeddings = True model = FSMTForConditionalGeneration(config) # FSMT shares three weights. @@ -270,6 +272,7 @@ def test_ensure_weights_are_shared(self): ) config.tie_word_embeddings = False + config.decoder.tie_word_embeddings = False model = FSMTForConditionalGeneration(config) # FSMT shares three weights. diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index d2b5e0949cac..46f644e080bb 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2103,6 +2103,49 @@ def test_tied_weights_keys(self): f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", ) + def test_tie_word_embeddings_is_authoritative(self): + original_config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + tied_config = copy.deepcopy(original_config) + tied_config.get_text_config().tie_word_embeddings = True + + untied_config = copy.deepcopy(original_config) + untied_config.get_text_config().tie_word_embeddings = False + + model_tied = model_class(tied_config) + model_untied = model_class(untied_config) + + if not hasattr(model_tied, "_tied_weights_keys") or not model_tied._tied_weights_keys: + continue + + tied_keys = model_tied._tied_weights_keys + state_dict_tied = model_tied.state_dict() + state_dict_untied = model_untied.state_dict() + + for target_key, source_key in tied_keys.items(): + if target_key not in state_dict_tied or source_key not in state_dict_tied: + continue + if target_key not in state_dict_untied or source_key not in state_dict_untied: + 
continue + + target_tied_ptr = id_tensor_storage(state_dict_tied[target_key]) + source_tied_ptr = id_tensor_storage(state_dict_tied[source_key]) + target_untied_ptr = id_tensor_storage(state_dict_untied[target_key]) + source_untied_ptr = id_tensor_storage(state_dict_untied[source_key]) + + self.assertEqual( + target_tied_ptr, + source_tied_ptr, + f"{model_class}: With tie_word_embeddings=True, '{target_key}' should share storage with '{source_key}'", + ) + self.assertNotEqual( + target_untied_ptr, + source_untied_ptr, + f"{model_class}: With tie_word_embeddings=False, '{target_key}' should NOT share storage with '{source_key}'. " + f"Config tie_word_embeddings must be authoritative over class-level _tied_weights_keys.", + ) + def test_model_weights_reload_no_missing_tied_weights(self): for model_class in self.all_model_classes: config, _ = self.model_tester.prepare_config_and_inputs_for_common()