From 68c95c20d6ae5bef433896a3e93a9234d3e6f704 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 19 Jan 2026 21:21:54 +0800 Subject: [PATCH 1/6] convert_hf_to_gguf.py: refactor modify_tensors to call super --- convert_hf_to_gguf.py | 707 +++++++++++++++++------------------------- 1 file changed, 276 insertions(+), 431 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ab015dd2c3a..6d87dd270d6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1981,13 +1981,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - tensors: list[tuple[str, Tensor]] = [] - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): # Map bloom-style qkv_linear to gpt-style qkv_linear # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa @@ -2014,9 +2010,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) logger.info("re-format attention.linear_qkv.bias") - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("BloomForCausalLM", "BloomModel") @@ -2036,15 +2030,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) name = re.sub(r'transformer\.', '', name) - tensors: list[tuple[str, Tensor]] = [] - if 
re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): # Map bloom-style qkv_linear to gpt-style qkv_linear # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa @@ -2071,9 +2061,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) logger.info("re-format attention.linear_qkv.bias") - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("MPTForCausalLM") @@ -2266,8 +2254,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) @@ -2277,7 +2263,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("k_proj.weight"): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2314,8 +2300,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # QKV tensor transform # The original query_key_value tensor contains n_head_kv "kv groups", # each consisting of n_head/n_head_kv query weights followed by one key @@ -2337,7 +2321,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter v = qkv[:, [-1]].reshape(n_head_kv * head_dim, 
head_dim * n_head) data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("GPTBigCodeForCausalLM") @@ -2476,7 +2460,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): datas: list[Tensor] = [] @@ -2488,9 +2472,8 @@ def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_ data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - new_name = self.map_tensor_name(merged_name) - return [(new_name, data_torch)] + return super().modify_tensors(data_torch, merged_name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -2616,7 +2599,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) if is_multimodal_tensor: - return [] # skip vision tensors + return # skip vision tensors elif self.hf_arch == "LlamaModel": name = "model." 
+ name elif name.startswith("model.text_model"): @@ -2642,8 +2625,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -2657,14 +2638,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): @@ -2755,8 +2734,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["gate_proj", "up_proj", "down_proj"]: datas: list[Tensor] = [] @@ -2768,17 +2745,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + yield from super().modify_tensors(data_torch, merged_name, bid) - return tensors + return else: - return [] + return if name.endswith(".expert_bias"): name = name.replace(".expert_bias", ".expert_bias.bias") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) 
@ModelBase.register( @@ -2835,7 +2811,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused n_head = ( self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) ) @@ -2856,7 +2831,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" if self.img_break_tok_id > 0 and embed_key in name: @@ -2864,9 +2839,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # for pixtral model, we need to extract the [IMG_BREAK] token embedding img_break_embd = data_torch[self.img_break_tok_id] name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] - return [(self.map_tensor_name(name), img_break_embd)] + return super().modify_tensors(img_break_embd, name, bid) - return [] # skip other tensors + return [] # skip other tensors @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") @@ -2897,13 +2872,12 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name if is_vision_tensor: - return [(self.map_tensor_name(name), data_torch)] + return 
super().modify_tensors(data_torch, name, bid) - return [] # skip other tensors + return [] # skip other tensors @ModelBase.register( @@ -2942,18 +2916,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name_gate = name.replace("gate_up_proj", "gate_proj.weight") dim_half = data_torch.shape[-1] // 2 gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) - return [ - (self.map_tensor_name(name_gate), gate_proj_weight), - (self.map_tensor_name(name_up), up_proj_weight) - ] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + return if name.endswith("down_proj"): name += ".weight" data_torch = data_torch.transpose(-1, -2) if "multi_modal_projector" in name or "vision_model" in name: - return [] - return super().modify_tensors(data_torch, name, bid) + return + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Llama4ForConditionalGeneration") @@ -2967,16 +2940,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_use_gelu(True) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused if "multi_modal_projector" in name or "vision_model" in name: # process vision tensors if "positional_embedding_vlm" in name and ".weight" not in name: name += ".weight" if "multi_modal_projector.linear_1" in name: # despite the name with number postfix, this is a single fully connected layer - return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)] - return [(self.map_tensor_name(name), data_torch)] - return [] + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Mistral3ForConditionalGeneration") @@ -3005,9 +2977,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, 
name: str, bid: int | None): name = name.replace("language_model.", "") if "multi_modal_projector" in name or "vision_tower" in name: - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("DeciLMForCausalLM") @@ -3146,7 +3118,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = DeciModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): @@ -3220,7 +3192,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # transform weight into 1/0/-1 (in fp32) data_torch = self.weight_quant(data_torch) - yield (new_name, data_torch) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") @@ -3276,11 +3248,11 @@ def set_gguf_parameters(self): _cur_expert = "" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - tensors: list[tuple[str, Tensor]] = [] + deferred: list[tuple[Tensor, str, int | None]] = [] is_expert = ".moe." in name or ".block_sparse_moe.experts." 
in name if not is_expert: - tensors.append((self.map_tensor_name(name), data_torch)) + deferred.append((data_torch, name, bid)) # process the experts separately if is_expert or self._cur_expert: @@ -3321,11 +3293,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" - new_name = self.map_tensor_name(merged_name) + yield from super().modify_tensors(data_torch, merged_name, bid) - yield (new_name, data_torch) - - yield from tensors + for t in deferred: + yield from super().modify_tensors(*t) @ModelBase.register("DbrxForCausalLM") @@ -3434,8 +3405,6 @@ def set_vocab(self): self._set_vocab_sentencepiece() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -3445,7 +3414,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(("k_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("MiniCPM3ForCausalLM") @@ -3555,7 +3524,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter or name.startswith("vision_model") or name.startswith("audio_tower") \ or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"): # skip vision and audio tensors - return [] + return yield from super().modify_tensors(data_torch, name, bid) @@ -3752,23 +3721,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter total_k_dim = num_kv_heads * head_dim total_v_dim = num_kv_heads * head_dim q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0) - return [ - (self.map_tensor_name(name_q), 
q_proj_weight), - (self.map_tensor_name(name_k), k_proj_weight), - (self.map_tensor_name(name_v), v_proj_weight) - ] + yield from super().modify_tensors(q_proj_weight, name_q, bid) + yield from super().modify_tensors(k_proj_weight, name_k, bid) + yield from super().modify_tensors(v_proj_weight, name_v, bid) # split the up_gate_proj into gate and up # up_gate_proj shape: [2 * intermediate_size, hidden_size] - if "up_gate_proj" in name: + elif "up_gate_proj" in name: name_up = name.replace("up_gate_proj.weight", "up_proj.weight") name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight") dim_half = data_torch.shape[0] // 2 gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0) - return [ - (self.map_tensor_name(name_gate), gate_proj_weight), - (self.map_tensor_name(name_up), up_proj_weight) - ] - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Ernie4_5_MoeForCausalLM") @@ -3801,20 +3767,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2) match = re.match(r"model.mtp_block.(\d+)", name) if match: - return [] + return # skip all other MTP tensors for now match = re.match(r"model.mtp_emb_norm.(\d+)", name) if match: - return [] + return match = re.match(r"model.mtp_hidden_norm.(\d+)", name) if match: - return [] + return match = re.match(r"model.mtp_linear_proj.(\d+)", name) if match: - return [] + return # process the experts separately if name.find("mlp.experts") != -1: @@ -3827,8 +3793,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into 
a single 3d tensor for w_name in ["gate_proj", "up_proj", "down_proj"]: datas: list[Tensor] = [] @@ -3840,13 +3804,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - - return tensors - else: - return [] - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, merged_name, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -3877,14 +3837,13 @@ def set_vocab(self): self._set_vocab_gpt2() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused if name.startswith("thinker."): name = name.replace("thinker.", "") if name.startswith("visual") or name.startswith("audio") or \ name.startswith("talker") or name.startswith("token2wav"): # skip multimodal tensors return [] - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") @@ -3933,7 +3892,6 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused if name.startswith("visual."): # process visual tensors # split QKV tensors if needed @@ -3947,23 +3905,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter wq = data_torch[:c] wk = data_torch[c: c * 2] wv = data_torch[c * 2:] - return [ - (self.map_tensor_name(name.replace("qkv", "q")), wq), - (self.map_tensor_name(name.replace("qkv", "k")), wk), - (self.map_tensor_name(name.replace("qkv", 
"v")), wv), - ] + yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid) + yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid) + yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid) elif 'patch_embed.proj.weight' in name: # split Conv3D into Conv2Ds c1, c2, kt, kh, kw = data_torch.shape del c1, c2, kh, kw # unused assert kt == 2, "Current implmentation only support temporal_patch_size of 2" - return [ - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]), - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]), - ] + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) else: - return [(self.map_tensor_name(name), data_torch)] - return [] # skip other tensors + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen2_5OmniModel") @@ -4019,10 +3972,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "audio_bos_eos_token" in name: # this tensor is left unused in transformers code # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 - return [] - return [(self.map_tensor_name(name), data_torch)] - - return super().modify_tensors(data_torch, name, bid) + return + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("InternVisionModel") @@ -4069,7 +4020,6 @@ def _mapping_interns1_name(self, name): return name def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector'] # deal with intern-s1 special case name = self._mapping_interns1_name(name) @@ 
-4091,13 +4041,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter wq = data_torch[:c] wk = data_torch[c: c * 2] wv = data_torch[c * 2:] - return [ - (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq), - (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk), - (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv), - ] - return [(self.map_tensor_name(name), data_torch)] - return [] # skip other tensors + yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid) + yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid) + yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("WavTokenizerDec") @@ -4105,8 +4053,6 @@ class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if \ name.endswith("codebook.cluster_size") or \ name.endswith("codebook.embed_avg") or \ @@ -4116,7 +4062,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def set_vocab(self): self._set_vocab_none() @@ -4171,7 +4117,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Need PyTorch: (128, 2048, 768) [reversed of GGML] # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768) permuted = data_torch.permute(0, 2, 1).contiguous() - return [(self.map_tensor_name(mapped), permuted)] + yield from super().modify_tensors(permuted, mapped, bid) + return if name.endswith("mlp.experts.gate_up_proj") or 
name.endswith("mlp.experts.gate_up_proj.weight"): if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: @@ -4189,14 +4136,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter mapped_up = f"{base}.up_proj.weight" perm_gate = gate.permute(0, 2, 1).contiguous() perm_up = up.permute(0, 2, 1).contiguous() - return [ - (self.map_tensor_name(mapped_gate), perm_gate), - (self.map_tensor_name(mapped_up), perm_up), - ] + yield from super().modify_tensors(perm_gate, mapped_gate, bid) + yield from super().modify_tensors(perm_up, mapped_up, bid) + return if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"): # skip visual tensors - return [] + return if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -4207,8 +4153,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4222,14 +4166,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -4309,7 +4251,7 @@ def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if 
"model.vision_" in name: # skip multimodal tensors - return [] + return if self.is_rerank: is_tied_head = self.is_tied_embeddings and "embed_tokens" in name @@ -4319,13 +4261,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", self._get_cls_out_tensor(data_torch), ) + yield cls_out_head if is_tied_head: - embed = (self.map_tensor_name(name), data_torch) - return [cls_out_head, embed] - if is_real_head: - return [cls_out_head] + yield from super().modify_tensors(data_torch, name, bid) + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen3MoeForCausalLM") @@ -4363,7 +4304,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("mtp"): - return [] # ignore MTP layers for now + return # ignore MTP layers for now if name.endswith(".A_log"): data_torch = -torch.exp(data_torch) elif name.endswith(".dt_bias"): @@ -4510,7 +4451,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}") else: raise ValueError(f"Unexpected merger tensor: {name}") - return [(new_name, data_torch)] + yield (new_name, data_torch) + return if name == "visual.patch_embed.proj.weight": # split Conv3D into Conv2Ds along temporal dimension @@ -4518,20 +4460,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter del c1, c2 if kt != 2: raise ValueError("Current implementation only supports temporal_patch_size of 2") - return [ - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]), - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]), - ] + yield 
(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) + return if name == "visual.patch_embed.proj.bias": # Include the bias - it's used by the C++ code - return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)] + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch) + return if name.startswith("visual."): - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return # Fall back to parent class for other tensors - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration") @@ -4554,8 +4497,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("model.visual."): name = name.replace("model.visual.", "visual.") if name.startswith("visual.merger."): - return [(self.map_tensor_name(name), data_torch)] - return super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen3VLForConditionalGeneration") @@ -4573,9 +4517,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Skip vision tensors - they go in the mmproj file if name.startswith("model.visual."): - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen3VLMoeForConditionalGeneration") @@ -4591,9 +4535,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, 
Tensor]]: # Skip vision tensors - they go in the mmproj file if name.startswith("model.visual."): - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GPT2LMHeadModel") @@ -4849,8 +4793,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -4864,14 +4806,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -4917,8 +4857,6 @@ def shuffle_attn_output_weight(self, data_torch): return data_torch def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - new_name = self.map_tensor_name(name) # shuffle for broadcasting of gqa in ggml_mul_mat @@ -4927,7 +4865,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif new_name.endswith("attn_output.weight"): data_torch = self.shuffle_attn_output_weight(data_torch) - return [(new_name, data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM") @@ -4988,8 +4926,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> 
Iterable[tuple[str, Tensor]]: - del bid # unused - if name.endswith(".A_log"): data_torch = -torch.exp(data_torch) elif name.endswith(".dt_bias"): @@ -5018,9 +4954,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.endswith(".norm.weight"): data_torch += 1.0 - new_name = self.map_tensor_name(name) - - return [(new_name, data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") @@ -5069,7 +5003,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.endswith(".norm.weight"): data_torch = data_torch + 1.0 - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("CodeShellForCausalLM") @@ -5307,7 +5241,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") @@ -5362,8 +5296,6 @@ def phantom(tok, toktype): special_vocab.add_to_gguf(self.gguf_writer) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.startswith("bert."): name = name[5:] @@ -5375,7 +5307,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # we are only using BERT for embeddings so we don't need the pooling layer if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return [] # we don't need these + return [] # we don't need these if name.startswith("cls.predictions"): return [] @@ -5391,7 +5323,7 @@ def modify_tensors(self, 
data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "classifier.bias": name = "classifier.out_proj.bias" - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def _xlmroberta_tokenizer_init(self) -> None: # we need the pad_token_id to know how to chop down position_embd matrix @@ -5644,7 +5576,7 @@ def set_vocab(self) -> None: def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: # If the tensor is an experts bias tensor, skip it by returning an empty list. if "mlp.experts.bias" in name: - return [] # Explicitly return an empty list. + return [] # Explicitly return. if "mlp.experts.mlp.w1" in name: data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) @@ -5655,7 +5587,7 @@ def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) - data_torch = data_torch.transpose(1, 2) name += ".weight" - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -5834,8 +5766,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. 
if name == "lm_head.weight": @@ -5846,7 +5776,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma2ForCausalLM") @@ -5880,8 +5810,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": @@ -5892,7 +5820,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") @@ -5927,14 +5855,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "language_model." 
in name: name = name.replace("language_model.", "") elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ or name.startswith("multimodal_projector.") or name.startswith("vision_model."): - return [] # skip vision tensors + return [] # skip vision tensors # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -5950,7 +5876,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + self.norm_shift - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3TextModel") @@ -6056,10 +5982,8 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "vision_model.head." in name: - return [] # skip redundant tensors for tinygemma3 + return [] # skip redundant tensors for tinygemma3 if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ or name.startswith("multimodal_projector.") or name.startswith("vision_model."): @@ -6073,9 +5997,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.info(f"Correcting norm value for '{name}'") data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) - return [] # skip other tensors + return [] # skip other tensors class ConformerAudioModel(MmprojModel): @@ -7022,8 +6946,6 @@ def set_gguf_parameters(self): # Same as super class, but permuting q_proj, k_proj # Copied from: LlamaModel def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams["num_attention_heads"] n_kv_head = 
self.hparams.get("num_key_value_heads") @@ -7032,7 +6954,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("k_proj.weight"): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("SeedOssForCausalLM") @@ -7088,8 +7010,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -7103,14 +7023,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) # Copied from: Qwen2MoeModel def prepare_tensors(self): @@ -7333,8 +7251,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -7348,14 +7264,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - 
return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7422,8 +7336,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -7437,14 +7349,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7580,9 +7490,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # skip vision tensors and remove "language_model." 
for Kimi-VL if "vision_tower" in name or "multi_modal_projector" in name: - return [] + return if name.startswith("siglip2.") or name.startswith("merger."): - return [] + return if name.startswith("language_model."): name = name.replace("language_model.", "") @@ -7590,7 +7500,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self.hparams.get("tie_word_embeddings", False): if name == "lm_head.weight" or name == "model.lm_head.weight": logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)") - return [] + return # rename e_score_correction_bias tensors if name.endswith("e_score_correction_bias"): @@ -7600,7 +7510,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter block_count = self.hparams["num_hidden_layers"] match = re.match(r"model.layers.(\d+)", name) if match and int(match.group(1)) >= block_count: - return [] + return # process the experts separately if name.find("mlp.experts") != -1: @@ -7613,8 +7523,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -7628,12 +7536,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed if name.endswith("kv_b_proj.weight"): @@ -7650,12 +7556,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter k_b, v_b = torch.split(kv_b, 
[qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) - return [ - (self.map_tensor_name(name_kb), k_b), - (self.map_tensor_name(name_vb), v_b) - ] + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7758,7 +7663,7 @@ def modify_tensors(self, data_torch, name, bid): # TODO: mimo v2 does not indicate the number of next-token-prediction layers, therefore we cannot do the same way as GLM4_MOE if "model.mtp." in name: - return [] + return # process the experts separately if name.find("mlp.experts") != -1: @@ -7771,8 +7676,6 @@ def modify_tensors(self, data_torch, name, bid): self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["gate_proj", "up_proj", "down_proj"]: datas: list[Tensor] = [] @@ -7784,13 +7687,12 @@ def modify_tensors(self, data_torch, name, bid): data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] - return [(self.map_tensor_name(name), data_torch)] + return + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7834,8 +7736,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") - return [] - return [(self.map_tensor_name(name), data_torch)] + return + yield from super().modify_tensors(data_torch, name, bid) 
@ModelBase.register("Dots1ForCausalLM") @@ -7857,8 +7759,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") if "shared_experts" in name: - return [(self.map_tensor_name(name), data_torch)] - return super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("PLMForCausalLM") @@ -7878,7 +7781,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8010,8 +7913,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored # in the safetensors files.
We use the first tensor from these three as the token embeddings for both encoder @@ -8022,9 +7923,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self.shared_token_embeddings_found = True else: logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("T5EncoderModel") @@ -8146,8 +8047,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder @@ -8158,9 +8057,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self.shared_token_embeddings_found = True else: logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("JAISLMHeadModel") @@ -8393,13 +8292,14 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part - return [] + return elif name.startswith("model.language_model."): name = name.replace("language_model.", "") # for multimodal variants # Handle main token embedding (but not layer-specific NextN embeddings) if name == "model.embed_tokens.weight" and ".layers." 
not in name: - return [(self.map_tensor_name("token_embd.weight"), data_torch)] + yield from super().modify_tensors(data_torch, "token_embd.weight", bid) + return # Handle routed experts if name.find("mlp.experts") != -1: @@ -8412,8 +8312,6 @@ def modify_tensors( self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -8427,18 +8325,15 @@ def modify_tensors( merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") - new_name = self.map_tensor_name(name) - - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8621,13 +8516,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(rope_freq) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."): - return [] + return name = name.removeprefix("transformer.") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("NemotronForCausalLM") @@ -8668,7 +8561,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) @ModelBase.register("ExaoneForCausalLM") @@ -9055,7 +8948,7 @@ def 
modify_tensors( return Mamba2Model.modify_tensors(self, data_torch, name, bid) elif bid in self._attn_layers: return GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): """This method merges params from both parents and some that are @@ -9187,34 +9080,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self.is_moe and bid is not None: if name.endswith("mixer.gate.e_score_correction_bias"): new_name = name.replace("e_score_correction_bias", "e_score_correction.bias") - mapped_name = self.map_tensor_name(new_name) - return [(mapped_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) + return if name.endswith("mixer.dt_bias"): new_name = name.replace("dt_bias", "dt.bias") - mapped_name = self.map_tensor_name(new_name) - return [(mapped_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) + return if name.endswith("mixer.conv1d.weight"): squeezed_data = data_torch.squeeze() - mapped_name = self.map_tensor_name(name) - return [(mapped_name, squeezed_data)] + yield from super().modify_tensors(squeezed_data, name, bid) + return if name.endswith("mixer.A_log"): transformed_data = -torch.exp(data_torch) reshaped_data = transformed_data.squeeze().reshape(-1, 1) - mapped_name = self.map_tensor_name(name) - return [(mapped_name, reshaped_data)] + yield from super().modify_tensors(reshaped_data, name, bid) + return if name.endswith("mixer.D"): reshaped_data = data_torch.squeeze().reshape(-1, 1) - mapped_name = self.map_tensor_name(name) - return [(mapped_name, reshaped_data)] + yield from super().modify_tensors(reshaped_data, name, bid) + return if name.endswith("mixer.norm.weight"): reshaped_data = data_torch.reshape(self.n_group, -1) - mapped_name = self.map_tensor_name(name) - return [(mapped_name, reshaped_data)] + yield from 
super().modify_tensors(reshaped_data, name, bid) + return if name.find("mixer.experts") != -1: n_experts = self.hparams["n_routed_experts"] @@ -9227,7 +9120,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._experts[bid]) >= n_experts * 2: # merge the experts into a single tensor - tensors: list[tuple[str, Tensor]] = [] for w_name in ["down_proj", "up_proj"]: datas: list[Tensor] = [] @@ -9238,14 +9130,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -9401,8 +9292,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_experts = self.hparams["num_experts"] assert bid is not None - tensors: list[tuple[str, Tensor]] = [] - if self._experts is None: self._experts = [{} for _ in range(self.block_count)] @@ -9422,16 +9311,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return if name.endswith(".expert_bias"): name = name.replace(".expert_bias", ".expert_bias.bias") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -9467,7 +9353,7 @@ def set_gguf_parameters(self): def modify_tensors(self, 
data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.endswith(".expert_bias"): # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 - return [] + return # process the experts separately if name.find("chunk_experts") != -1: @@ -9480,8 +9366,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._chunk_experts[bid][name] = data_torch if len(self._chunk_experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -9495,12 +9379,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return elif name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -9511,8 +9393,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -9526,14 +9406,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from 
super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -9567,7 +9445,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # ignore image tokenizer for now # TODO: remove this once image support is implemented for Chameleon if name.startswith("model.vqmodel"): - return [] + return n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -9582,7 +9460,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(("k_norm.weight", "k_norm.bias")): data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 @staticmethod @@ -9627,11 +9505,9 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.startswith("model.") or name.startswith("lm_head."): # skip language model tensors - return [] + return if name.startswith("audio_encoder.whisper."): name = name.replace("audio_encoder.whisper.","audio_tower.") @@ -9639,7 +9515,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("audio_encoder.", "audio_encoder.adapting.") if name.startswith("audio_encoder.audio_bos_eos_token."): - return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])] + yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid) + yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid) + 
return if name.startswith("audio_encoder.adapting."): name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.") @@ -9650,13 +9528,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if ".2." in name: name = name.replace(".2.", ".linear_2.") if ".proj." in name: - return [] + return if "conv1.bias" in name or "conv2.bias" in name: # transpose conv1 and conv2 bias data_torch = data_torch.unsqueeze(-1) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen2AudioForConditionalGeneration") @@ -9683,11 +9561,9 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.startswith("language_model."): # skip language model tensors - return [] + return # prevent clash naming with vision tensors if name.startswith("multi_modal_projector"): @@ -9697,7 +9573,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # transpose conv1 and conv2 bias data_torch = data_torch.unsqueeze(-1) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("UltravoxModel") @@ -9941,7 +9817,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") - return [] + return if name.find("mlp.experts") != -1: n_experts = self.hparams["num_experts"] @@ -9954,7 +9830,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._experts[bid]) >= n_experts * 3: # merge the experts into a single 3d tensor - tensors: list[tuple[str, Tensor]] = [] for w_name in ["down_proj", 
"gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -9965,14 +9840,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -10017,8 +9891,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -10032,14 +9904,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) # Copied from: Qwen2MoeModel def prepare_tensors(self): @@ -10138,9 +10008,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("SmolLM3ForCausalLM") @@ -10220,8 +10090,6 @@ def 
generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: return [] def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "sinks" in name: name += ".weight" @@ -10235,7 +10103,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.transpose(-1, -2) else: # otherwise, it should already be repacked to ggml MXFP4 format - return [] + return # split the gate_up into gate and up if "gate_up_proj" in name: @@ -10243,25 +10111,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name_up = name.replace("gate_up_proj_bias", "up_proj.bias") name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2] - return [ - (self.map_tensor_name(name_gate), gate_proj_bias), - (self.map_tensor_name(name_up), up_proj_bias) - ] + yield from super().modify_tensors(gate_proj_bias, name_gate, bid) + yield from super().modify_tensors(up_proj_bias, name_up, bid) elif "_blocks" not in name and "_scales" not in name: logger.warning(f"{name} is not in MXFP4, performance may be degraded") name_up = name.replace("gate_up_proj", "up_proj.weight") name_gate = name.replace("gate_up_proj", "gate_proj.weight") data_torch = data_torch.transpose(-1, -2) gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] - return [ - (self.map_tensor_name(name_gate), gate_proj_weight), - (self.map_tensor_name(name_up), up_proj_weight) - ] - else: - # otherwise, it should already be repacked to ggml MXFP4 format - return [] - - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) def set_vocab(self): self._set_vocab_gpt2() @@ -10309,7 +10170,7 @@ def 
set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name): # skip multimodal tensors - return [] + return name = name.replace("language_model.", "") # vision name = name.replace("lfm.", "model.") # audio @@ -10318,7 +10179,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if 'conv.conv' in name: data_torch = data_torch.squeeze(1) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def _is_vision_tensor(self, name: str) -> bool: return "vision_tower" in name or "multi_modal_projector" in name @@ -10388,9 +10249,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # not enough expert weights to merge if len(expert_cache) < n_experts * len(expert_weights): - return [] + return - tensors: list[tuple[str, Tensor]] = [] for w_name in expert_weights: datas: list[Tensor] = [] @@ -10401,13 +10261,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + + yield from super().modify_tensors(data_torch, merged_name, bid) del self._experts_cache[bid] - return tensors + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -10433,7 +10293,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused is_vision_tensor = "vision_tower" in name or 
"multi_modal_projector" in name if is_vision_tensor: @@ -10444,9 +10303,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "patch_embedding.weight" in name: data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return - return [] # skip other tensors + return # skip other tensors @ModelBase.register("Lfm2AudioForConditionalGeneration") @@ -10471,17 +10331,17 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch, name, bid): # skip language model tensors if name.startswith("lfm."): - return [] + return # for training only if any(p in name for p in ["audio_loss_weight"]): - return [] + return # for audio output if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("SmallThinkerForCausalLM") @@ -10526,8 +10386,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down", "gate", "up"]: datas: list[Tensor] = [] @@ -10541,14 +10399,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ 
-10581,12 +10437,12 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # these layers act as MLM head, so we don't need them if name.startswith("decoder."): - return [] + return if name.startswith("model."): name = name[6:] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("ApertusForCausalLM") @@ -10606,24 +10462,24 @@ def modify_tensors(self, data_torch, name, bid): self._alpha_n[bid] = data_torch.to("cpu").float().item() if (len(self._alpha_n) == n_layers): self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)]) - return [] + return if name.endswith(".act_fn.alpha_p"): self._alpha_p[bid] = data_torch.to("cpu").float().item() if (len(self._alpha_p) == n_layers): self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)]) - return [] + return if name.endswith(".act_fn.beta"): self._beta[bid] = data_torch.to("cpu").float().item() if (len(self._beta) == n_layers): self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)]) - return [] + return if name.endswith(".act_fn.eps"): self._eps[bid] = data_torch.to("cpu").float().item() if (len(self._eps) == n_layers): self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)]) - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) class MistralModel(LlamaModel): @@ -10786,7 +10642,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name: - return [] + return # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic if name.endswith(".qscale_act"): @@ -10802,7 +10658,7 @@ def modify_tensors(self, data_torch: Tensor, name: 
str, bid: int | None): name = name.replace(".w3.", ".up_proj.") name = "model." + name - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) class PixtralModel(LlavaVisionModel): @@ -10867,24 +10723,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name if is_vision_tensor: if "pos_emb.weight" in name: data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) - elif "wqkv" in name: + + if "wqkv" in name: split_dim = 0 if "weight" in name else -1 wq, wk, wv = data_torch.chunk(3, dim=split_dim) - return [ - (self.map_tensor_name(name.replace("wqkv", "wq")), wq), - (self.map_tensor_name(name.replace("wqkv", "wk")), wk), - (self.map_tensor_name(name.replace("wqkv", "wv")), wv) - ] - - return [(self.map_tensor_name(name), data_torch)] - - return [] # skip other tensors + yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid) + yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid) + yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("CogVLMForCausalLM") @@ -10896,12 +10748,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if not name.startswith("model.vision."): - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("CogVLMForCausalLM") @@ -10909,13 +10759,11 @@ class 
CogVLMModel(LlamaModel): model_arch = gguf.MODEL_ARCH.COGVLM def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # block vision tensors if name.startswith("model.vision."): - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("JanusForConditionalGeneration") @@ -10933,14 +10781,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter 'model.generation_head.', ) if name.startswith(skip_prefixes): - return [] + return if name.startswith('model.language_model.'): name = name.replace('model.language_model.', 'model.') elif name.startswith('language_model.'): name = name.replace('language_model.', '') - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("JanusForConditionalGeneration") @@ -10993,11 +10841,9 @@ def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[s return [(tensor_name, data_torch)] def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # Skip language model tensors as they will be handled by `JanusProModel` if name.startswith(('model.language_model.', 'language_model.')): - return [] + return # Skip generation-related components skip_generation_prefixes = ( @@ -11011,17 +10857,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter 'generation_head.', ) if name.startswith(skip_generation_prefixes): - return [] + return # Handle aligner tensors if name.startswith(('model.aligner.', 'aligner.')): - return list(self._map_aligner_tensor(data_torch, name)) + yield from self._map_aligner_tensor(data_torch, name) + return # Handle vision tensors if name.startswith(('model.vision_model.', 'vision_model.')): - return [(self.map_tensor_name(name), 
data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return - return [] + return @ModelBase.register("YoutuVLForConditionalGeneration") @@ -11060,21 +10908,18 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # Skip language model tensors skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.') if name.startswith(skip_prefixes): - return [] + return # Try to map the tensor using TensorNameMap (handles vision encoder and projector) try: - new_name = self.map_tensor_name(name) - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) except ValueError: # If mapping fails, log warning and skip logger.warning(f"Cannot map tensor: {name}") - return [] + return @ModelBase.register("SolarOpenForCausalLM") From 492fe83bfa8f0b42576ea2674cd5adea9b275bf9 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 20 Jan 2026 12:33:08 +0800 Subject: [PATCH 2/6] s/return/yield/g when returning modify_tensors --- convert_hf_to_gguf.py | 76 +++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6d87dd270d6..c224775c5cf 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2010,7 +2010,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) logger.info("re-format attention.linear_qkv.bias") - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("BloomForCausalLM", "BloomModel") @@ -2061,7 +2061,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) logger.info("re-format attention.linear_qkv.bias") - return super().modify_tensors(data_torch, name, bid) + yield from 
super().modify_tensors(data_torch, name, bid) @ModelBase.register("MPTForCausalLM") @@ -2263,7 +2263,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("k_proj.weight"): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2321,7 +2321,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GPTBigCodeForCausalLM") @@ -2460,7 +2460,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): datas: list[Tensor] = [] @@ -2473,7 +2473,7 @@ def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_ merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - return super().modify_tensors(data_torch, merged_name, bid) + yield from super().modify_tensors(data_torch, merged_name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -2831,7 +2831,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return 
super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" if self.img_break_tok_id > 0 and embed_key in name: @@ -2839,7 +2839,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # for pixtral model, we need to extract the [IMG_BREAK] token embedding img_break_embd = data_torch[self.img_break_tok_id] name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] - return super().modify_tensors(img_break_embd, name, bid) + yield from super().modify_tensors(img_break_embd, name, bid) return [] # skip other tensors @@ -2875,7 +2875,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name if is_vision_tensor: - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) return [] # skip other tensors @@ -3118,7 +3118,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = DeciModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): @@ -3414,7 +3414,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(("k_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("MiniCPM3ForCausalLM") @@ -3843,7 +3843,7 @@ def 
modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name.startswith("talker") or name.startswith("token2wav"): # skip multimodal tensors return [] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") @@ -4062,7 +4062,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def set_vocab(self): self._set_vocab_none() @@ -4865,7 +4865,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif new_name.endswith("attn_output.weight"): data_torch = self.shuffle_attn_output_weight(data_torch) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM") @@ -4954,7 +4954,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.endswith(".norm.weight"): data_torch += 1.0 - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") @@ -5003,7 +5003,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.endswith(".norm.weight"): data_torch = data_torch + 1.0 - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("CodeShellForCausalLM") @@ -5241,7 +5241,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = 
LlamaModel.permute(data_torch, n_head, n_kv_head) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") @@ -5323,7 +5323,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "classifier.bias": name = "classifier.out_proj.bias" - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def _xlmroberta_tokenizer_init(self) -> None: # we need the pad_token_id to know how to chop down position_embd matrix @@ -5480,7 +5480,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("vocab_"): return [] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("RobertaModel", "RobertaForSequenceClassification") @@ -5523,7 +5523,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self._position_offset is not None: data_torch = data_torch[self._position_offset:,:] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("NomicBertModel") @@ -5587,7 +5587,7 @@ def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) - data_torch = data_torch.transpose(1, 2) name += ".weight" - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -5632,7 +5632,7 @@ def modify_tensors(self, data_torch, name, bid): if name.startswith("model."): name = name[6:] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") @@ -5707,7 
+5707,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -5776,7 +5776,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + 1 - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma2ForCausalLM") @@ -5820,7 +5820,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + 1 - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") @@ -5876,7 +5876,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + self.norm_shift - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3TextModel") @@ -5997,7 +5997,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.info(f"Correcting norm value for '{name}'") data_torch = data_torch + 1 - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) return [] # skip other tensors @@ -6148,7 +6148,7 @@ def custom_map(self, name: str) -> str: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if (ConformerAudioModel.is_audio_tensor(name)): name = name.replace("model.audio_tower.conformer.", "conformer.layers.") - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, 
name, bid) # Gemma3n uses # - model.embed_vision.* for projection layers @@ -6303,7 +6303,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Starcoder2ForCausalLM") @@ -6954,7 +6954,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("k_proj.weight"): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("SeedOssForCausalLM") @@ -7621,7 +7621,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): del self._experts_cache[bid] return tensors - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("MiMoV2FlashForCausalLM") @@ -7781,7 +7781,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8217,7 +8217,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") @@ -8561,7 +8561,7 @@ def 
modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + 1 - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("ExaoneForCausalLM") @@ -8854,7 +8854,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch) ] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM") @@ -8948,7 +8948,7 @@ def modify_tensors( return Mamba2Model.modify_tensors(self, data_torch, name, bid) elif bid in self._attn_layers: return GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): """This method merges params from both parents and some that are @@ -10194,7 +10194,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if not name.startswith(self.dense_tensor_name): name = "model." 
+ name - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # dense tensor is stored in a separate safetensors file @@ -10703,7 +10703,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("model.vision_encoder.", "vision_tower.") name = name.replace("model.vision_projection.", "multi_modal_projector.") - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("KimiVLForConditionalGeneration") From 9a93ca537ec74a7b30ac97cff033dc82a69aea45 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 20 Jan 2026 22:03:44 +0800 Subject: [PATCH 3/6] return instead of return [] --- convert_hf_to_gguf.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c224775c5cf..97942e148db 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2841,7 +2841,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] yield from super().modify_tensors(img_break_embd, name, bid) - return [] # skip other tensors + return # skip other tensors @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") @@ -2877,7 +2877,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if is_vision_tensor: yield from super().modify_tensors(data_torch, name, bid) - return [] # skip other tensors + return # skip other tensors @ModelBase.register( @@ -5307,13 +5307,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # we are only using BERT for embeddings so we don't need the pooling layer if name in ("embeddings.position_ids", 
"pooler.dense.weight", "pooler.dense.bias"): - return [] # we don't need these + return # we don't need these if name.startswith("cls.predictions"): - return [] + return if name.startswith("cls.seq_relationship"): - return [] + return if self.cls_out_labels: # For BertForSequenceClassification (direct projection layer) @@ -5478,7 +5478,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # These layers act as MLM head, so we don't need them if name.startswith("vocab_"): - return [] + return yield from super().modify_tensors(data_torch, name, bid) @@ -5576,7 +5576,7 @@ def set_vocab(self) -> None: def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: # If the tensor is an experts bias tensor, skip it by returning an empty list. if "mlp.experts.bias" in name: - return [] # Explicitly return. + return # Explicitly return. if "mlp.experts.mlp.w1" in name: data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) @@ -5689,7 +5689,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): if name.startswith("pooler.dense"): - return [] + return num_loras = data_torch.size(0) assert num_loras == len(self._lora_names) @@ -5705,7 +5705,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32) - return [] + return yield from super().modify_tensors(data_torch, name, bid) @@ -5860,7 +5860,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ or name.startswith("multimodal_projector.") or name.startswith("vision_model."): - return [] # skip vision 
tensors + return # skip vision tensors # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -5983,7 +5983,7 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if "vision_model.head." in name: - return [] # skip redundant tensors for tinygemma3 + return # skip redundant tensors for tinygemma3 if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ or name.startswith("multimodal_projector.") or name.startswith("vision_model."): @@ -5999,7 +5999,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) - return [] # skip other tensors + return # skip other tensors class ConformerAudioModel(MmprojModel): @@ -6245,7 +6245,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # TODO: implement self.prediction_coefs.weight.clamp_(...) if "language_model." 
not in name: - return [] # skip non-language model tensors + return # skip non-language model tensors # Pad token embeddings for vision/audio special tokens (262144-262399) if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: @@ -6285,7 +6285,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if out is not None: return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)] else: - return [] + return if "altup_projections" in name: data_torch = data_torch.to(device="cpu") @@ -6301,7 +6301,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if out is not None: return [(self.map_tensor_name("model.altup_projections.weight"), out)] else: - return [] + return yield from super().modify_tensors(data_torch, name, bid) From 62eb6234dde7c1ff811b083b0d6472b719e6f34c Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 20 Jan 2026 23:46:38 +0800 Subject: [PATCH 4/6] more missing changes --- convert_hf_to_gguf.py | 105 +++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 58 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 97942e148db..6fd2ce0a0b5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -514,8 +514,7 @@ def set_gguf_parameters(self): raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - + del bid # unused return [(self.map_tensor_name(name), data_torch)] def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: @@ -2096,15 +2095,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_max_alibi_bias(0.0) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "scales" in name: new_name = 
self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) new_name = new_name.replace("scales", "act.scales") else: new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("OrionForCausalLM") @@ -2445,7 +2442,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._q_norms[bid]) >= n_head: return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") else: - return [] + return if name.find("k_layernorm.norms") != -1: assert bid is not None @@ -2458,7 +2455,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._k_norms[bid]) >= n_kv_head: return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") else: - return [] + return yield from super().modify_tensors(data_torch, name, bid) @@ -3267,11 +3264,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name in self._experts[bid]: self._cur_expert = name self._experts[bid][name].append(data_torch) - return [] + return elif is_expert: self._cur_expert = name self._experts[bid][name] = [data_torch] - return [] + return else: self._cur_expert = "" @@ -3328,8 +3325,6 @@ def set_gguf_parameters(self): logger.info(f"gguf: file type = {self.ftype}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_expert = self.hparams["ffn_config"]["moe_num_experts"] n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] @@ -3360,7 +3355,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) - return [(new_name, 
data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid # unused @@ -3842,7 +3837,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("visual") or name.startswith("audio") or \ name.startswith("talker") or name.startswith("token2wav"): # skip multimodal tensors - return [] + return yield from super().modify_tensors(data_torch, name, bid) @@ -4058,7 +4053,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name.endswith("codebook.embed_avg") or \ name.endswith("codebook.inited"): logger.debug(f"Skipping {name!r}") - return [] + return logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") @@ -4406,7 +4401,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert self.hparams_vision is not None # Skip text model tensors - they go in the text model file if name.startswith("model.language_model.") or name.startswith("lm_head."): - return [] + return if name.startswith("model.visual."): name = name.replace("model.visual.", "visual.", 1) @@ -4431,7 +4426,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unexpected deepstack tensor: {name}") new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}") - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) + return if name.startswith("visual.merger."): suffix = name.split(".", 2)[2] @@ -5165,7 +5161,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("language_model.", "") # InternVL if name.startswith("mlp") or name.startswith("vision_model"): # skip visual tensors - return [] + return if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: qkv = 
data_torch @@ -5178,13 +5174,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) v = v.reshape((-1, v.shape[-1])) - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), - ] + yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) else: - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("InternLM3ForCausalLM") @@ -5236,7 +5230,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("language_model.", "") # InternVL if name.startswith("mlp") or name.startswith("vision_model"): # skip visual tensors - return [] + return if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): @@ -5627,7 +5621,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch, name, bid): if name.startswith("decoder."): - return [] + return if name.startswith("model."): name = name[6:] @@ -5770,7 +5764,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # To prevent errors, skip loading lm_head.weight. 
if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] + return # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): @@ -5814,7 +5808,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] + return # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): @@ -6034,10 +6028,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter a = weight / torch.sqrt(running_var + eps) b = bias - running_mean * a - return [ - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), - ] + yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid) + yield from super().modify_tensors(b,f"conformer.layers.{bid}.conv.batch_norm.bias", bid) + return # reshape conv weights if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): @@ -6049,7 +6042,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert data_torch.shape[2] == 1 data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3nForConditionalGeneration") @@ -6155,7 +6148,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # - model.vision_tower.* for vision encoder # Skip non-vision tensors if 
not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): - return [] + return if name.startswith("model.vision_tower.timm_model.blocks."): # Double-indexed block tensors through custom logic @@ -6267,7 +6260,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Continue with normal processing name = name.replace("language_model.", "") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) if "altup_unembed_projections" in name: data_torch = data_torch.to(device="cpu") @@ -6283,7 +6276,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_unembd) if out is not None: - return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)] + yield from super().modify_tensors(data_torch, "model.altup_unembed_projections.weight", bid) else: return @@ -6299,7 +6292,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_proj) if out is not None: - return [(self.map_tensor_name("model.altup_projections.weight"), out)] + yield from super().modify_tensors(data_torch, "model.altup_projections.weight", bid) else: return @@ -6686,11 +6679,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") - return [] + return elif new_name == tok_embd_name: self._tok_embd = data_torch - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("Mamba2ForCausalLM") @@ -7602,7 +7595,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): # not enough expert 
weights to merge if len(expert_cache) < n_experts * len(expert_weights): - return [] + return tensors: list[tuple[str, Tensor]] = [] for w_name in expert_weights: @@ -8204,7 +8197,7 @@ def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v - return [] + return elif name.startswith("model.language_model."): name = name.replace("language_model.", "") # for Glm4v if self.use_mrope: @@ -8756,9 +8749,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors.append((new_name, data_torch)) return tensors else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8840,19 +8833,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size" gate, up = data_torch.split(ffn_dim, dim=-2) if has_experts: - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up), - ] - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up), - ] + yield from super().modify_tensors(gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) + yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) + return + yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return if not has_experts and 
name.endswith("shared_mlp.output_linear.weight"): - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch) - ] + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) + return yield from super().modify_tensors(data_torch, name, bid) @@ -9199,11 +9189,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.endswith("query_key_value.weight"): q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v) - ] + yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return elif name.find("mlp.experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None From 7235fc8be5e9cec7efb5256f156cd2c5e8b0d133 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 21 Jan 2026 20:43:04 +0800 Subject: [PATCH 5/6] getting close --- convert_hf_to_gguf.py | 74 +++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6fd2ce0a0b5..4e9b68b007e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2155,11 +2155,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", 
head_count) - tensors: list[tuple[str, Tensor]] = [] - if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": logger.info(f"Unpacking and permuting layer {bid}") - tensors = [ + yield from [ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), @@ -2168,9 +2166,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._reverse_hf_part(data_torch, 2)), ] else: - tensors = [(self.map_tensor_name(name), data_torch)] - - return tensors + yield from self.modify_tensors(data_torch, self.map_tensor_name(name), bid) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -4550,22 +4546,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - # we don't need these if name.endswith((".attn.bias", ".attn.masked_bias")): - return tensors + yield from super().modify_tensors(data_torch, name, bid) + return if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): data_torch = data_torch.transpose(1, 0) new_name = self.map_tensor_name(name) - tensors.append((new_name, data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("PhiForCausalLM") @@ -6018,7 +6009,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._batch_norm_tensors[bid][name] = data_torch if len(self._batch_norm_tensors[bid]) < 5: - return [] + return weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] bias = 
self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] @@ -6029,7 +6020,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter a = weight / torch.sqrt(running_var + eps) b = bias - running_mean * a yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid) - yield from super().modify_tensors(b,f"conformer.layers.{bid}.conv.batch_norm.bias", bid) + yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid) return # reshape conv weights @@ -6160,7 +6151,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") @@ -6276,7 +6267,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_unembd) if out is not None: - yield from super().modify_tensors(data_torch, "model.altup_unembed_projections.weight", bid) + yield from super().modify_tensors(out, "model.altup_unembed_projections.weight", bid) else: return @@ -6292,7 +6283,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_proj) if out is not None: - yield from super().modify_tensors(data_torch, "model.altup_projections.weight", bid) + yield from super().modify_tensors(out, "model.altup_projections.weight", bid) else: return @@ -7597,7 +7588,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if len(expert_cache) < n_experts * len(expert_weights): return - tensors: list[tuple[str, Tensor]] = [] 
for w_name in expert_weights: datas: list[Tensor] = [] @@ -7609,10 +7599,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + yield from super().modify_tensors(data_torch, new_name, bid) del self._experts_cache[bid] - return tensors + return yield from super().modify_tensors(data_torch, name, bid) @@ -7754,7 +7744,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if "shared_experts" in name: yield from ModelBase.modify_tensors(self, data_torch, name, bid) else: - super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("PLMForCausalLM") @@ -8100,13 +8090,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - # we don't need these if name.endswith((".attn.bias")): - return tensors + return if name.endswith(("relative_pe.slopes")): # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) @@ -8117,7 +8103,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter first_val = float(data_torch[0].item()) self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) - return tensors + return if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): data_torch = data_torch.transpose(1, 0) @@ -8125,13 +8111,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.map_tensor_name(name) if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((new_name, data_torch * self.embeddings_scale)) + yield from 
super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid) elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - tensors.append((new_name, data_torch * self.width_scale)) + yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid) else: - tensors.append((new_name, data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, new_name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8729,8 +8713,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -8746,8 +8728,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, new_name, bid) + return else: return @@ -8821,10 +8803,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ffn_dim = self.hparams["intermediate_size"] assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" gate, up = data_torch.split(ffn_dim, dim=-2) - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), - ] + yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) + yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) has_experts = bool(self.hparams.get('num_local_experts')) @@ -9185,7 +9165,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) if 
name.endswith("attention.dense.weight"): - return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)] + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid) elif name.endswith("query_key_value.weight"): q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) @@ -9197,8 +9177,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_experts = self.hparams["num_experts"] assert bid is not None - tensors: list[tuple[str, Tensor]] = [] - if self._experts is None: self._experts = [{} for _ in range(self.block_count)] @@ -9220,9 +9198,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + yield from super().modify_tensors(data_torch, new_name, bid) - return tensors + return new_name = self.map_tensor_name(name) @@ -9230,7 +9208,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.float() data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) def prepare_tensors(self): super().prepare_tensors() From 1a9b98db7a1b1b7799d03a74a465a257c35bd900 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 22 Jan 2026 22:25:09 +0800 Subject: [PATCH 6/6] some more remaining ones --- convert_hf_to_gguf.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4e9b68b007e..34022e8b669 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2376,22 +2376,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head_kv = 1 head_dim = self.hparams["n_embd"] // n_head - tensors: list[tuple[str, Tensor]] = [] - if bid 
is not None: if name == f"transformer.h.{bid}.attn.kv.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) - elif name == f"transformer.h.{bid}.attn.q.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) - elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) - - if len(tensors) == 0: - tensors.append((self.map_tensor_name(name), data_torch)) + yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return + if name == f"transformer.h.{bid}.attn.q.weight": + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + return + if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return - return tensors + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") @@ -2825,6 +2823,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) yield from super().modify_tensors(data_torch, name, 
bid) + return embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" if self.img_break_tok_id > 0 and embed_key in name: @@ -6252,6 +6251,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Continue with normal processing name = name.replace("language_model.", "") yield from super().modify_tensors(data_torch, name, bid) + return if "altup_unembed_projections" in name: data_torch = data_torch.to(device="cpu") @@ -6268,6 +6268,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter out = self._stack_matrices(self._altup_unembd) if out is not None: yield from super().modify_tensors(out, "model.altup_unembed_projections.weight", bid) + return else: return @@ -6284,6 +6285,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter out = self._stack_matrices(self._altup_proj) if out is not None: yield from super().modify_tensors(out, "model.altup_projections.weight", bid) + return else: return @@ -7763,9 +7765,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_value_length(hparams["v_head_dim"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - yield from super().modify_tensors(data_torch, name, bid) - def prepare_tensors(self): super().prepare_tensors() @@ -8694,11 +8693,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = remapper[_n.stem] + _n.suffix # set shared weights for all NextN/MTP layers - tensors = [] for bid in range(self.hparams['num_hidden_layers'], self.block_count): - new_name = new_name.format(bid=bid) - tensors.append((self.map_tensor_name(new_name), data_torch)) - return tensors + yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid) + return if name.endswith("e_score_correction_bias"): name = 
name.replace("e_score_correction_bias", "e_score_correction.bias") @@ -9166,6 +9163,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("attention.dense.weight"): yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid) + return elif name.endswith("query_key_value.weight"): q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)