From 47b0e0ea72783c34d8422ebe776a70919ead0966 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 10:53:23 +0100 Subject: [PATCH 01/21] one fix --- src/transformers/models/mistral/modeling_mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 60c7e2d49eed..b0b1738018e3 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -412,7 +412,7 @@ def forward( @auto_docstring class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin): - _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} + _tied_weights_keys = {} _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} From 70fb4029d24522278ee938ca85fa70189a54e7b9 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 11:59:12 +0100 Subject: [PATCH 02/21] attempt mistral3 --- src/transformers/models/mistral/modular_mistral.py | 2 +- src/transformers/models/mistral3/configuration_mistral3.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/mistral/modular_mistral.py b/src/transformers/models/mistral/modular_mistral.py index 709ff855c399..e5f33487133b 100644 --- a/src/transformers/models/mistral/modular_mistral.py +++ b/src/transformers/models/mistral/modular_mistral.py @@ -174,7 +174,7 @@ def forward( class MistralForCausalLM(LlamaForCausalLM): - pass + _tied_weights_keys = {} class MistralForTokenClassification(LlamaForTokenClassification): diff --git a/src/transformers/models/mistral3/configuration_mistral3.py b/src/transformers/models/mistral3/configuration_mistral3.py index 59851c135987..2cbac8952fa0 100644 --- a/src/transformers/models/mistral3/configuration_mistral3.py +++ b/src/transformers/models/mistral3/configuration_mistral3.py @@ -110,7 +110,10 @@ def __init__( if isinstance(text_config, dict): text_config["model_type"] = text_config.get("model_type", "mistral") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + text_config = CONFIG_MAPPING[text_config["model_type"]]( + **text_config, + tie_word_embeddings=False + ) elif text_config is None: text_config = CONFIG_MAPPING["mistral"]( attention_dropout=0.0, From 6ce2bc66a356387081cadc8fe5cfbf414b9d2186 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 13:43:37 +0100 Subject: [PATCH 03/21] empty dict --- src/transformers/models/mistral3/modeling_mistral3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 0f6e2a1d3efc..8fc7735be32f 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -360,7 +360,7 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin) r"^multi_modal_projector": "model.multi_modal_projector", r"^language_model.lm_head": "lm_head", } - _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} + _tied_weights_keys = {} # "lm_head.weight": "model.language_model.embed_tokens.weight"} def __init__(self, config: Mistral3Config): super().__init__(config) From 60665c6981567762fe35895b5f36edd904a310c1 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 15:18:41 +0100 Subject: [PATCH 04/21] that was olmoe's problem --- src/transformers/conversion_mapping.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index b6aad7e94650..f27abae657e6 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -117,6 +117,21 @@ def _build_checkpoint_conversion_mapping(): mapping["qwen3_vl_moe"] = mapping["qwen2_moe"].copy() mapping["hunyuan_v1_moe"] = mapping["qwen2_moe"].copy() mapping["minimax"] = mapping["mixtral"].copy() + mapping["olmoe"] = [ + WeightConverter( + source_keys=[ + "mlp.experts.*.gate_proj.weight", + "mlp.experts.*.up_proj.weight", + ], + target_keys="mlp.experts.gate_up_proj", + operations=[MergeModulelist(dim=0), Concatenate(dim=1)], + ), + WeightConverter( + source_keys=["mlp.experts.*.down_proj.weight"], + target_keys="mlp.experts.down_proj", + operations=[MergeModulelist(dim=0)], + ), + ] return mapping From aa7ab8039c6baeeec6f32ff415dc591c090bb0de Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 16:54:03 +0100 Subject: [PATCH 05/21] current CI status? --- src/transformers/conversion_mapping.py | 28 +++++++++---------- .../models/mistral/modeling_mistral.py | 2 +- .../models/mistral/modular_mistral.py | 2 +- .../models/mistral3/configuration_mistral3.py | 5 +--- .../models/mistral3/modeling_mistral3.py | 2 +- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index f27abae657e6..735291c50d9f 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -118,20 +118,20 @@ def _build_checkpoint_conversion_mapping(): mapping["hunyuan_v1_moe"] = mapping["qwen2_moe"].copy() mapping["minimax"] = mapping["mixtral"].copy() mapping["olmoe"] = [ - WeightConverter( - source_keys=[ - "mlp.experts.*.gate_proj.weight", - "mlp.experts.*.up_proj.weight", - ], - target_keys="mlp.experts.gate_up_proj", - operations=[MergeModulelist(dim=0), Concatenate(dim=1)], - ), - WeightConverter( - source_keys=["mlp.experts.*.down_proj.weight"], - target_keys="mlp.experts.down_proj", - operations=[MergeModulelist(dim=0)], - ), - ] + WeightConverter( + source_keys=[ + "mlp.experts.*.gate_proj.weight", + "mlp.experts.*.up_proj.weight", + ], + target_keys="mlp.experts.gate_up_proj", + operations=[MergeModulelist(dim=0), Concatenate(dim=1)], + ), + WeightConverter( + source_keys=["mlp.experts.*.down_proj.weight"], + target_keys="mlp.experts.down_proj", + operations=[MergeModulelist(dim=0)], + ), + ] return mapping diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index b0b1738018e3..60c7e2d49eed 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -412,7 +412,7 @@ def forward( @auto_docstring class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin): - _tied_weights_keys = {} + _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} diff --git a/src/transformers/models/mistral/modular_mistral.py b/src/transformers/models/mistral/modular_mistral.py index e5f33487133b..709ff855c399 100644 --- a/src/transformers/models/mistral/modular_mistral.py +++ b/src/transformers/models/mistral/modular_mistral.py @@ -174,7 +174,7 @@ def forward( class MistralForCausalLM(LlamaForCausalLM): - _tied_weights_keys = {} + pass class MistralForTokenClassification(LlamaForTokenClassification): diff --git a/src/transformers/models/mistral3/configuration_mistral3.py b/src/transformers/models/mistral3/configuration_mistral3.py index 2cbac8952fa0..80e6a91bf22c 100644 --- a/src/transformers/models/mistral3/configuration_mistral3.py +++ b/src/transformers/models/mistral3/configuration_mistral3.py @@ -110,10 +110,7 @@ def __init__( if isinstance(text_config, dict): text_config["model_type"] = text_config.get("model_type", "mistral") - text_config = CONFIG_MAPPING[text_config["model_type"]]( - **text_config, - tie_word_embeddings=False - ) + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config, tie_word_embeddings=False) elif text_config is None: text_config = CONFIG_MAPPING["mistral"]( attention_dropout=0.0, diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 8fc7735be32f..0f6e2a1d3efc 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -360,7 +360,7 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin) r"^multi_modal_projector": "model.multi_modal_projector", r"^language_model.lm_head": "lm_head", } - _tied_weights_keys = {} # "lm_head.weight": "model.language_model.embed_tokens.weight"} + _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} def __init__(self, config: Mistral3Config): super().__init__(config) From afa33c5d0e279de74e8d8aedf4b0dac90da7b96a Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 16:55:12 +0100 Subject: [PATCH 06/21] actual CI status --- src/transformers/models/mistral3/configuration_mistral3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/mistral3/configuration_mistral3.py b/src/transformers/models/mistral3/configuration_mistral3.py index 80e6a91bf22c..59851c135987 100644 --- a/src/transformers/models/mistral3/configuration_mistral3.py +++ b/src/transformers/models/mistral3/configuration_mistral3.py @@ -110,7 +110,7 @@ def __init__( if isinstance(text_config, dict): text_config["model_type"] = text_config.get("model_type", "mistral") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config, tie_word_embeddings=False) + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) elif text_config is None: text_config = CONFIG_MAPPING["mistral"]( attention_dropout=0.0, From 22a258f6baa33f2588e7071fd9156dbb7093a669 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 17:17:25 +0100 Subject: [PATCH 07/21] simplify --- src/transformers/conversion_mapping.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 735291c50d9f..10375807422a 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -116,22 +116,8 @@ def _build_checkpoint_conversion_mapping(): mapping["qwen3_next"] = mapping["qwen2_moe"].copy() mapping["qwen3_vl_moe"] = mapping["qwen2_moe"].copy() mapping["hunyuan_v1_moe"] = mapping["qwen2_moe"].copy() + mapping["olmoe"] = mapping["qwen2_moe"].copy() mapping["minimax"] = mapping["mixtral"].copy() - mapping["olmoe"] = [ - WeightConverter( - source_keys=[ - "mlp.experts.*.gate_proj.weight", - "mlp.experts.*.up_proj.weight", - ], - target_keys="mlp.experts.gate_up_proj", - operations=[MergeModulelist(dim=0), Concatenate(dim=1)], - ), - WeightConverter( - source_keys=["mlp.experts.*.down_proj.weight"], - target_keys="mlp.experts.down_proj", - operations=[MergeModulelist(dim=0)], - ), - ] return mapping From 076f20ed6096c136b095aeae65a726dac0bed25c Mon Sep 17 00:00:00 2001 From: Pablo Date: Tue, 25 Nov 2025 17:56:52 +0100 Subject: [PATCH 08/21] hmm? --- src/transformers/modeling_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 748d7af639af..bac9fe1c5085 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2248,8 +2248,10 @@ def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict: return expanded_tied_weights tied_mapping = self._tied_weights_keys + text_config = self.config.get_text_config(decoder=True) + tie_word_embeddings = getattr(text_config, "tie_word_embeddings", self.config.tie_word_embeddings) # If the config does not specify any tying, return empty dict - if not self.config.tie_word_embeddings and not self.config.tie_encoder_decoder: + if not tie_word_embeddings and not self.config.tie_encoder_decoder: return {} # If None, return empty dict elif tied_mapping is None: @@ -3174,7 +3176,7 @@ def save_pretrained( shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} # Recursively descend to find tied weight keys - _tied_weights_keys = set(_get_tied_weight_keys(self)) + _tied_weights_keys = set(self.all_tied_weights_keys.keys()) error_names = [] to_delete_names = set() for names in shared_ptrs.values(): From 58ec6e632c153bc4acdfaa345e3dfdb58a0c1773 Mon Sep 17 00:00:00 2001 From: Pablo Date: Tue, 25 Nov 2025 18:14:14 +0100 Subject: [PATCH 09/21] bird --- src/transformers/modeling_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index bac9fe1c5085..51f9c801377f 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3176,7 +3176,11 @@ def save_pretrained( shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} # Recursively descend to find tied weight keys - _tied_weights_keys = set(self.all_tied_weights_keys.keys()) + tied_keys_attr = getattr(self, "all_tied_weights_keys", None) + if tied_keys_attr is not None: + _tied_weights_keys = set(tied_keys_attr.keys()) + else: + _tied_weights_keys = set(_get_tied_weight_keys(self)) error_names = [] to_delete_names = set() for names in shared_ptrs.values(): @@ -4410,7 +4414,9 @@ def _move_missing_keys_from_meta_to_cpu( # The tied weight keys are in the "missing" usually, but they should not be moved (they will be tied anyway) # This is especially important because if they are moved, they will lose the `_is_hf_initialized` flag, and they # will be re-initialized for nothing (which can be quite long) - for key in missing_keys - self.all_tied_weights_keys.keys(): + tied_keys_attr = getattr(self, "all_tied_weights_keys", {}) or {} + tied_keys = set(tied_keys_attr.keys()) + for key in missing_keys - tied_keys: param = model_state_dict[key] # Buffers are not initialized on the meta device, so we still need this check to avoid overwriting them if param.device == torch.device("meta"): From 5a05c977749d8bb3996eff5866468639694c5a09 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 22:36:15 +0100 Subject: [PATCH 10/21] force tie word embeddings to false --- src/transformers/configuration_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 5d6fbda009dc..81eec8c688a0 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -755,6 +755,10 @@ def from_dict( config = cls(**config_dict) + # default tie_word_embeddings to False if None, see https://github.com/huggingface/transformers/issues/42313 + if hasattr(config, 'tie_word_embeddings') and config.tie_word_embeddings is None: + config.tie_word_embeddings = False + # Update config with kwargs if needed if "num_labels" in kwargs and "id2label" in kwargs: num_labels = kwargs["num_labels"] From 9c81dfcfd951ecfb06bfb4a16ac94ae156dfa1a4 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 25 Nov 2025 22:58:16 +0100 Subject: [PATCH 11/21] specifics of FSMT --- src/transformers/configuration_utils.py | 2 +- src/transformers/models/fsmt/configuration_fsmt.py | 1 + tests/models/fsmt/test_modeling_fsmt.py | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 81eec8c688a0..212a4f4626cd 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -756,7 +756,7 @@ def from_dict( config = cls(**config_dict) # default tie_word_embeddings to False if None, see https://github.com/huggingface/transformers/issues/42313 - if hasattr(config, 'tie_word_embeddings') and config.tie_word_embeddings is None: + if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings is None: config.tie_word_embeddings = False # Update config with kwargs if needed diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index a1075016c3f4..fc68464f252f 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -194,6 +194,7 @@ def __init__( bos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, num_hidden_layers=encoder_layers, + tie_word_embeddings=tie_word_embeddings, ) if "decoder" in common_kwargs: del common_kwargs["decoder"] diff --git a/tests/models/fsmt/test_modeling_fsmt.py b/tests/models/fsmt/test_modeling_fsmt.py index acc29cac7ec0..d14c6b3225a2 100644 --- a/tests/models/fsmt/test_modeling_fsmt.py +++ b/tests/models/fsmt/test_modeling_fsmt.py @@ -125,6 +125,7 @@ def get_config(self): eos_token_id=self.eos_token_id, bos_token_id=self.bos_token_id, pad_token_id=self.pad_token_id, + tie_word_embeddings=True, ) def prepare_config_and_inputs_for_common(self): @@ -254,6 +255,7 @@ def test_ensure_weights_are_shared(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() config.tie_word_embeddings = True + config.decoder.tie_word_embeddings = True model = FSMTForConditionalGeneration(config) # FSMT shares three weights. @@ -270,6 +272,7 @@ def test_ensure_weights_are_shared(self): ) config.tie_word_embeddings = False + config.decoder.tie_word_embeddings = False model = FSMTForConditionalGeneration(config) # FSMT shares three weights. From e22acad60fcbc349ea6558ba5f681d5d3abcc63c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 26 Nov 2025 14:20:59 +0100 Subject: [PATCH 12/21] wrong reference? --- src/transformers/core_model_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 983ae97cbdc5..80c90e538b27 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -504,7 +504,7 @@ def set_param_for_module( missing_keys.discard(target_name) if ref is not None and ref.shape != param_value.shape and hf_quantizer is None: mismatch_keys.add((target_name, param_value.shape, ref.shape)) - module_obj.param_name._is_hf_initialized = False # Needs to be initialized + ref._is_hf_initialized = False # unsure - but seems unreachable else? else: # super important otherwise _init_weight will re-init the param param_value._is_hf_initialized = True From f939769ebe29c2863ca39be2b61357c94284bd3a Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 27 Nov 2025 16:44:13 +0100 Subject: [PATCH 13/21] finalize --- src/transformers/core_model_loading.py | 1 - src/transformers/modeling_utils.py | 7 +++- tests/test_modeling_common.py | 45 ++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 80c90e538b27..9411402a7def 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -504,7 +504,6 @@ def set_param_for_module( missing_keys.discard(target_name) if ref is not None and ref.shape != param_value.shape and hf_quantizer is None: mismatch_keys.add((target_name, param_value.shape, ref.shape)) - ref._is_hf_initialized = False # unsure - but seems unreachable else? else: # super important otherwise _init_weight will re-init the param param_value._is_hf_initialized = True diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 51f9c801377f..9ead247e6431 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2249,7 +2249,12 @@ def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict: tied_mapping = self._tied_weights_keys text_config = self.config.get_text_config(decoder=True) - tie_word_embeddings = getattr(text_config, "tie_word_embeddings", self.config.tie_word_embeddings) + if not hasattr(text_config, "tie_word_embeddings"): + logger.warning( + f"Text config {text_config.__class__.__name__} does not have 'tie_word_embeddings' attribute. " + "This may cause issues with weight tying." + ) + tie_word_embeddings = text_config.tie_word_embeddings # If the config does not specify any tying, return empty dict if not tie_word_embeddings and not self.config.tie_encoder_decoder: return {} diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 66a6dfd12653..76f3d6398d82 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2100,6 +2100,51 @@ def test_tied_weights_keys(self): f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", ) + def test_tie_word_embeddings_is_authoritative(self): + original_config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + tied_config = copy.deepcopy(original_config) + tied_config.get_text_config().tie_word_embeddings = True + tied_config.tie_word_embeddings = True + + untied_config = copy.deepcopy(original_config) + untied_config.get_text_config().tie_word_embeddings = False + untied_config.tie_word_embeddings = False + + model_tied = model_class(tied_config) + model_untied = model_class(untied_config) + + if not hasattr(model_tied, "_tied_weights_keys") or not model_tied._tied_weights_keys: + continue + + tied_keys = model_tied._tied_weights_keys + state_dict_tied = model_tied.state_dict() + state_dict_untied = model_untied.state_dict() + + for target_key, source_key in tied_keys.items(): + if target_key not in state_dict_tied or source_key not in state_dict_tied: + continue + if target_key not in state_dict_untied or source_key not in state_dict_untied: + continue + + target_tied_ptr = id_tensor_storage(state_dict_tied[target_key]) + source_tied_ptr = id_tensor_storage(state_dict_tied[source_key]) + target_untied_ptr = id_tensor_storage(state_dict_untied[target_key]) + source_untied_ptr = id_tensor_storage(state_dict_untied[source_key]) + + self.assertEqual( + target_tied_ptr, + source_tied_ptr, + f"{model_class}: With tie_word_embeddings=True, '{target_key}' should share storage with '{source_key}'", + ) + self.assertNotEqual( + target_untied_ptr, + source_untied_ptr, + f"{model_class}: With tie_word_embeddings=False, '{target_key}' should NOT share storage with '{source_key}'. " + f"Config tie_word_embeddings must be authoritative over class-level _tied_weights_keys.", + ) + def test_model_weights_reload_no_missing_tied_weights(self): for model_class in self.all_model_classes: config, _ = self.model_tester.prepare_config_and_inputs_for_common() From 9960aa2e3256a75081b336bc1165b757a8f3234a Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 27 Nov 2025 16:51:45 +0100 Subject: [PATCH 14/21] fixup --- tests/test_modeling_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 76f3d6398d82..838e1addcbcb 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2106,11 +2106,9 @@ def test_tie_word_embeddings_is_authoritative(self): for model_class in self.all_model_classes: tied_config = copy.deepcopy(original_config) tied_config.get_text_config().tie_word_embeddings = True - tied_config.tie_word_embeddings = True untied_config = copy.deepcopy(original_config) untied_config.get_text_config().tie_word_embeddings = False - untied_config.tie_word_embeddings = False model_tied = model_class(tied_config) model_untied = model_class(untied_config) From 24f0b0992adb1c8aa21ed14b831513596997554a Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 27 Nov 2025 17:36:41 +0100 Subject: [PATCH 15/21] weird mamba error --- src/transformers/integrations/hub_kernels.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 0ab866ecbd6d..76f2333c5955 100644 --- a/src/transformers/integrations/hub_kernels.py +++ b/src/transformers/integrations/hub_kernels.py @@ -288,6 +288,12 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _ mapping[kernel_name] = kernel except FileNotFoundError: mapping[kernel_name] = None + except AssertionError as error: + logger.warning_once( + f"Failed to load the '{kernel_name}' kernel from '{repo_id}' because the current environment does not " + f"support the required backend: {error}" + ) + mapping[kernel_name] = None else: # Try to import is_{kernel_name}_available from ..utils From d2a03d2a3d2857883e1fcc34476d9b03c6c18e84 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 27 Nov 2025 17:37:28 +0100 Subject: [PATCH 16/21] fix tied weights --- .../kyutai_speech_to_text/modeling_kyutai_speech_to_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 1e95b92d528d..5b0c3497ad76 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -1069,7 +1069,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( @auto_docstring class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedModel, GenerationMixin): - _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} + _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.embed_tokens.weight"} _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} _keep_in_fp32_modules_strict = ["codec_model"] From 8c4fb33714916d4a6c3bb64070dca35fbd063193 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 27 Nov 2025 17:37:42 +0100 Subject: [PATCH 17/21] hack musicgen --- src/transformers/models/musicgen/configuration_musicgen.py | 6 ++++++ .../models/musicgen_melody/configuration_musicgen_melody.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 76c951668f46..bff720e5cfe2 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -221,5 +221,11 @@ def __init__(self, text_encoder, audio_encoder, decoder, **kwargs): def sampling_rate(self): return self.audio_encoder.sampling_rate + # overriding these because they crash - not 100% sure of that one + def get_text_config(self, decoder=None, encoder=None): + if decoder is None and encoder is None: + decoder = True + return super().get_text_config(decoder=decoder, encoder=encoder) + __all__ = ["MusicgenConfig", "MusicgenDecoderConfig"] diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index a4ec8528590a..af184b241408 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -234,5 +234,11 @@ def __init__( def sampling_rate(self): return self.audio_encoder.sampling_rate + # overriding these because they crash - not 100% sure of that one + def get_text_config(self, decoder=None, encoder=None): + if decoder is None and encoder is None: + decoder = True + return super().get_text_config(decoder=decoder, encoder=encoder) + __all__ = ["MusicgenMelodyConfig", "MusicgenMelodyDecoderConfig"] From c8d8ad06b57ebf1bd57f11c339c63a4f4aadb11e Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 27 Nov 2025 17:41:11 +0100 Subject: [PATCH 18/21] tie_encoder_decoder workaround --- src/transformers/modeling_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9ead247e6431..26db03c27fae 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2254,9 +2254,11 @@ def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict: f"Text config {text_config.__class__.__name__} does not have 'tie_word_embeddings' attribute. " "This may cause issues with weight tying." ) - tie_word_embeddings = text_config.tie_word_embeddings + tie_word_embeddings = getattr(text_config, "tie_word_embeddings", None) + tie_encoder_decoder = getattr(self.config, "tie_encoder_decoder", False) + should_tie = tie_encoder_decoder if tie_word_embeddings is None else tie_word_embeddings # If the config does not specify any tying, return empty dict - if not tie_word_embeddings and not self.config.tie_encoder_decoder: + if not should_tie: return {} # If None, return empty dict elif tied_mapping is None: From 65dca34d03ba82d3f861204f3ad09dde01d2e019 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 1 Dec 2025 16:38:17 +0100 Subject: [PATCH 19/21] revert unwanted changes --- src/transformers/configuration_utils.py | 4 ---- src/transformers/conversion_mapping.py | 1 - src/transformers/modeling_utils.py | 21 +++------------------ 3 files changed, 3 insertions(+), 23 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index c08df46a665d..53d6a8a900a8 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -751,10 +751,6 @@ def from_dict( config = cls(**config_dict) - # default tie_word_embeddings to False if None, see https://github.com/huggingface/transformers/issues/42313 - if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings is None: - config.tie_word_embeddings = False - # Update config with kwargs if needed if "num_labels" in kwargs and "id2label" in kwargs: num_labels = kwargs["num_labels"] diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 10375807422a..b6aad7e94650 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -116,7 +116,6 @@ def _build_checkpoint_conversion_mapping(): mapping["qwen3_next"] = mapping["qwen2_moe"].copy() mapping["qwen3_vl_moe"] = mapping["qwen2_moe"].copy() mapping["hunyuan_v1_moe"] = mapping["qwen2_moe"].copy() - mapping["olmoe"] = mapping["qwen2_moe"].copy() mapping["minimax"] = mapping["mixtral"].copy() return mapping diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 75d4c42ccf14..403f5f6eb295 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2254,17 +2254,8 @@ def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict: return expanded_tied_weights tied_mapping = self._tied_weights_keys - text_config = self.config.get_text_config(decoder=True) - if not hasattr(text_config, "tie_word_embeddings"): - logger.warning( - f"Text config {text_config.__class__.__name__} does not have 'tie_word_embeddings' attribute. " - "This may cause issues with weight tying." - ) - tie_word_embeddings = getattr(text_config, "tie_word_embeddings", None) - tie_encoder_decoder = getattr(self.config, "tie_encoder_decoder", False) - should_tie = tie_encoder_decoder if tie_word_embeddings is None else tie_word_embeddings # If the config does not specify any tying, return empty dict - if not should_tie: + if not self.config.tie_word_embeddings and not self.config.tie_encoder_decoder: return {} # If None, return empty dict elif tied_mapping is None: @@ -3187,11 +3178,7 @@ def save_pretrained( shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} # Recursively descend to find tied weight keys - tied_keys_attr = getattr(self, "all_tied_weights_keys", None) - if tied_keys_attr is not None: - _tied_weights_keys = set(tied_keys_attr.keys()) - else: - _tied_weights_keys = set(_get_tied_weight_keys(self)) + _tied_weights_keys = set(_get_tied_weight_keys(self)) error_names = [] to_delete_names = set() for names in shared_ptrs.values(): @@ -4423,9 +4410,7 @@ def _move_missing_keys_from_meta_to_cpu( # The tied weight keys are in the "missing" usually, but they should not be moved (they will be tied anyway) # This is especially important because if they are moved, they will lose the `_is_hf_initialized` flag, and they # will be re-initialized for nothing (which can be quite long) - tied_keys_attr = getattr(self, "all_tied_weights_keys", {}) or {} - tied_keys = set(tied_keys_attr.keys()) - for key in missing_keys - tied_keys: + for key in missing_keys - self.all_tied_weights_keys.keys(): param = model_state_dict[key] # Buffers are not initialized on the meta device, so we still need this check to avoid overwriting them if param.device == torch.device("meta"): From f85d0f4f44c954de5c5a1ced268c67cb9ee7507c Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 1 Dec 2025 16:47:47 +0100 Subject: [PATCH 20/21] hardcode llava onevision --- .../llava_onevision/configuration_llava_onevision.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 9fd1e850f0e5..d9f35dc7dc28 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -102,7 +102,6 @@ def __init__( vision_feature_layer=-1, vision_aspect_ratio="anyres_max_9", image_grid_pinpoints=None, - tie_word_embeddings=False, multimodal_projector_bias=True, **kwargs, ): @@ -188,7 +187,13 @@ def __init__( self.text_config = text_config - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) + + # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even + # though it concerns the main model... So we hardcode a fix here + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = True + self.text_config.tie_word_embeddings = False __all__ = ["LlavaOnevisionConfig"] From b5537a3930c32c1806e759712027fceb392d1ab3 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 1 Dec 2025 16:58:39 +0100 Subject: [PATCH 21/21] more --- .../llava_next_video/configuration_llava_next_video.py | 9 +++++++-- .../llava_onevision/configuration_llava_onevision.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 7b82b5ac5b89..86dec22c8e5b 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -103,7 +103,6 @@ def __init__( vision_feature_select_strategy="default", vision_feature_layer=-2, image_grid_pinpoints=None, - tie_word_embeddings=False, video_token_index=32000, spatial_pool_mode="average", spatial_pool_stride=2, @@ -160,7 +159,13 @@ def __init__( self.text_config = text_config - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) + + # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even + # though it concerns the main model, while it was set to False by default in the main model... So we hardcode a fix here + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: + self.tie_word_embeddings = True + self.text_config.tie_word_embeddings = False __all__ = ["LlavaNextVideoConfig"] diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index d9f35dc7dc28..cb957a992216 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -190,7 +190,7 @@ def __init__( super().__init__(**kwargs) # Due to a mismatch at model addition-time, the `tie_word_embeddings` was saved in the text config, even - # though it concerns the main model... So we hardcode a fix here + # though it concerns the main model, while it was set to False by default in the main model... So we hardcode a fix here if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: self.tie_word_embeddings = True self.text_config.tie_word_embeddings = False