From 873cf5b0e407887b534b5488ec5d5040b6fb5ae0 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 15 Jan 2026 16:49:42 +0100 Subject: [PATCH 1/7] fix --- src/transformers/conversion_mapping.py | 17 ++++- src/transformers/core_model_loading.py | 93 +++++++++++++++----------- 2 files changed, 68 insertions(+), 42 deletions(-) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 6f4781524c3b..8aa68c021aae 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -82,6 +82,21 @@ def _build_checkpoint_conversion_mapping(): operations=[MergeModulelist(dim=0)], ), ], + "qwen3_vl_moe": [ + WeightConverter( + source_patterns=[ + "mlp.experts.*.gate_proj.weight", + "mlp.experts.*.up_proj.weight", + ], + target_patterns="mlp.experts.gate_up_proj", + operations=[MergeModulelist(dim=0), Concatenate(dim=1), Transpose(1, 2)], + ), + WeightConverter( + source_patterns="mlp.experts.*.down_proj.weight", + target_patterns="mlp.experts.down_proj", + operations=[MergeModulelist(dim=0), Transpose(1, 2)], + ), + ], "phimoe": [ WeightConverter( source_patterns=[ @@ -228,13 +243,11 @@ def _build_checkpoint_conversion_mapping(): WeightRenaming("mlp.moe_statics.e_score_correction_bias", "mlp.gate.moe_statics.e_score_correction_bias") ] mapping["glm4_moe"] = mapping["qwen2_moe"].copy() - mapping["glm4_moe_lite"] = mapping["qwen2_moe"].copy() mapping["glm4v_moe"] = mapping["qwen2_moe"].copy() mapping["longcat_flash"] = mapping["qwen2_moe"].copy() mapping["qwen3_moe"] = mapping["qwen2_moe"].copy() mapping["qwen3_omni_moe"] = mapping["qwen2_moe"].copy() mapping["qwen3_next"] = mapping["qwen2_moe"].copy() - mapping["qwen3_vl_moe"] = mapping["qwen2_moe"].copy() mapping["hunyuan_v1_moe"] = mapping["qwen2_moe"].copy() mapping["minimax"] = mapping["mixtral"].copy() mapping["minimax_m2"] = mapping["mixtral"].copy() diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 9e43baf498b1..d3f5f2c5b124 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -113,12 +113,12 @@ def convert( ) -> dict[str, torch.Tensor]: tensors = next(iter(input_dict.values())) tensor = tensors[0] if isinstance(tensors, list) else tensors - targets = self.get_target_pattern(input_dict, target_patterns) + targets = self.get_target_patterns(input_dict, target_patterns) sizes = len(targets) chunks = torch.chunk(tensor, sizes, dim=self.dim) return dict(zip(targets, chunks)) - def get_target_pattern(self, input_dict: dict, target_patterns: list[str]) -> list[str]: + def get_target_patterns(self, input_dict: dict, target_patterns: list[str]) -> list[str]: # Here we always return the target patterns if len(input_dict) > 1 or len(target_patterns) == 1: raise ValueError("Undefined Operation encountered!") @@ -245,6 +245,53 @@ def reverse_op(self) -> ConversionOps: return MergeModulelist(self.dim) +class Transpose(ConversionOps): + """ + Transposes the given tensor along dim0 and dim1. + """ + + def __init__(self, dim0: int = 0, dim1: int = 1): + self.dim0 = dim0 + self.dim1 = dim1 + + @torch.no_grad + def convert( + self, + input_dict: dict[str, torch.Tensor], + source_patterns: list[str], + target_patterns: list[str], + config, + **kwargs, + ) -> dict[str, torch.Tensor]: + target_pattern = self.get_target_pattern(input_dict, source_patterns, target_patterns) + tensors = next(iter(input_dict.values())) + tensor = tensors[0] if isinstance(tensors, list) else tensors + return {target_pattern: torch.transpose(tensor, dim0=self.dim0, dim1=self.dim1).contiguous()} + + def get_target_pattern( + self, + input_dict: dict[str, torch.Tensor | list[torch.Tensor]], + source_patterns: list[str], + target_patterns: list[str], + ) -> str: + if len(input_dict) != 1: + raise ValueError("Undefined Operation encountered!") + # Here it's the first operation of a chain, so return the source + if len(target_patterns) > 1: + # Here it's the first operation of a chain, so return the source + if len(source_patterns) == 1: + return source_patterns[0] + else: + raise ValueError("Undefined Operation encountered!") + # Here it's the only operation, or the last operation in a chain, so we return the target + else: + return target_patterns[0] + + @property + def reverse_op(self) -> ConversionOps: + return Transpose(dim0=self.dim1, dim1=self.dim0) + + class PermuteForRope(ConversionOps): """ Applies the permutation required to convert complex RoPE weights to the split sin/cos format. @@ -402,43 +449,6 @@ def reverse_op(self) -> ConversionOps: return ErnieFuseAndSplitTextVisionExperts(stack_dim=self.stack_dim, concat_dim=self.concat_dim) -class Transpose(ConversionOps): - """ - Transposes the given tensor along dim0 and dim1. - """ - - def __init__(self, dim0: int = 0, dim1: int = 1): - self.dim0 = dim0 - self.dim1 = dim1 - - @torch.no_grad() - def convert( - self, - input_dict: dict[str, list[torch.Tensor]], - source_patterns: list[str], - target_patterns: list[str], - config, - **kwargs, - ) -> dict[str, list[torch.Tensor]]: - if len(input_dict) != len(target_patterns): - raise ValueError( - f"Transpose conversion can only happen on each key ({len(input_dict)}) " - f"and should match exact one target ({len(target_patterns)})." - ) - - output: dict[str, list[torch.Tensor]] = {} - for key, target_pattern in zip(input_dict.keys(), target_patterns): - tensor = input_dict.get(key, []) - if len(tensor) != 1: - raise ValueError(f"Transpose conversion requires exactly one tensor, found {len(tensor)}.") - output[target_pattern] = torch.transpose(tensor[0], dim0=self.dim0, dim1=self.dim1).contiguous() - return output - - @property - def reverse_op(self) -> ConversionOps: - return Transpose(dim0=self.dim1, dim1=self.dim0) - - @dataclass(slots=True) class WeightTransform: source_patterns: str | list[str] = field(init=True) @@ -739,7 +749,7 @@ def dot_natural_key(s: str): @contextmanager def log_conversion_errors( first_target_key: str, - conversion_errors: MutableMapping[str, str], + conversion_errors: MutableMapping[str, str] | None, extras: Any = None, op: list[ConversionOps] | ConversionOps | None = None, ): @@ -748,6 +758,9 @@ def log_conversion_errors( try: yield except Exception as e: + # During reverse mapping, we do not log and skip errors + if conversion_errors is None: + raise e def _format_op_name(curr_op: list[ConversionOps] | ConversionOps | None) -> str | None: if curr_op is None: From 001c6041cf35e06a8a43aa171c2dd3545ad9f11c Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 15 Jan 2026 16:51:46 +0100 Subject: [PATCH 2/7] oups --- src/transformers/conversion_mapping.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 8aa68c021aae..d9db8aa2ef36 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -243,6 +243,7 @@ def _build_checkpoint_conversion_mapping(): WeightRenaming("mlp.moe_statics.e_score_correction_bias", "mlp.gate.moe_statics.e_score_correction_bias") ] mapping["glm4_moe"] = mapping["qwen2_moe"].copy() + mapping["glm4_moe_lite"] = mapping["qwen2_moe"].copy() mapping["glm4v_moe"] = mapping["qwen2_moe"].copy() mapping["longcat_flash"] = mapping["qwen2_moe"].copy() mapping["qwen3_moe"] = mapping["qwen2_moe"].copy() From 093713e11f4770cdfaf45f96b3f2008323a2e0fa Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 15 Jan 2026 16:53:07 +0100 Subject: [PATCH 3/7] style --- src/transformers/core_model_loading.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index d3f5f2c5b124..5890a9038bb5 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -256,12 +256,7 @@ def __init__(self, dim0: int = 0, dim1: int = 1): @torch.no_grad def convert( - self, - input_dict: dict[str, torch.Tensor], - source_patterns: list[str], - target_patterns: list[str], - config, - **kwargs, + self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str], **kwargs ) -> dict[str, torch.Tensor]: target_pattern = self.get_target_pattern(input_dict, source_patterns, target_patterns) tensors = next(iter(input_dict.values())) @@ -269,10 +264,7 @@ def convert( return {target_pattern: torch.transpose(tensor, dim0=self.dim0, dim1=self.dim1).contiguous()} def get_target_pattern( - self, - input_dict: dict[str, torch.Tensor | list[torch.Tensor]], - source_patterns: list[str], - target_patterns: list[str], + self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str] ) -> str: if len(input_dict) != 1: raise ValueError("Undefined Operation encountered!") From 8f51f290e2df9139c284ed42a23c304d7d272e47 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 15 Jan 2026 17:18:31 +0100 Subject: [PATCH 4/7] remove duplicated comment --- src/transformers/core_model_loading.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 5890a9038bb5..92068f127368 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -270,7 +270,6 @@ def get_target_pattern( raise ValueError("Undefined Operation encountered!") # Here it's the first operation of a chain, so return the source if len(target_patterns) > 1: - # Here it's the first operation of a chain, so return the source if len(source_patterns) == 1: return source_patterns[0] else: From 73f13e21b400bff7422268eec23f5e054f18235d Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 15 Jan 2026 19:15:27 +0100 Subject: [PATCH 5/7] add TODO --- src/transformers/core_model_loading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 92068f127368..3f5b948697e7 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -249,6 +249,8 @@ class Transpose(ConversionOps): """ Transposes the given tensor along dim0 and dim1. """ + # BIG TODO: This Ops may clash with TP if the required transpose dim are the same as the TP shard dim + # Would require to check if this Ops is used before sharding with TP def __init__(self, dim0: int = 0, dim1: int = 1): self.dim0 = dim0 From 306333b6348f323b92d72540b6d46ac7ef23c731 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 15 Jan 2026 19:16:25 +0100 Subject: [PATCH 6/7] oupsi style --- src/transformers/core_model_loading.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 3f5b948697e7..0b61472c3f76 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -249,6 +249,7 @@ class Transpose(ConversionOps): """ Transposes the given tensor along dim0 and dim1. """ + # BIG TODO: This Ops may clash with TP if the required transpose dim are the same as the TP shard dim # Would require to check if this Ops is used before sharding with TP From 9a937ac7a7a044bec72999588d63178c9cd6af00 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 16 Jan 2026 12:16:41 +0100 Subject: [PATCH 7/7] remove TODO as basically every op will need to be checked very carefully - tp fully broken rn --- src/transformers/core_model_loading.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 0b61472c3f76..92068f127368 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -250,9 +250,6 @@ class Transpose(ConversionOps): Transposes the given tensor along dim0 and dim1. """ - # BIG TODO: This Ops may clash with TP if the required transpose dim are the same as the TP shard dim - # Would require to check if this Ops is used before sharding with TP - def __init__(self, dim0: int = 0, dim1: int = 1): self.dim0 = dim0 self.dim1 = dim1