From ad966b8a8044b218fc358c28995aecbd526752ac Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 10 Jan 2026 13:58:56 +0100 Subject: [PATCH] POC: group gate_exps and up_exps for PP boost --- convert_hf_to_gguf.py | 45 ++++++++++++------------ gguf-py/gguf/constants.py | 3 ++ gguf-py/gguf/tensor_mapping.py | 4 +++ src/llama-arch.cpp | 3 ++ src/llama-arch.h | 1 + src/llama-graph.cpp | 62 +++++++++++++++++++++++----------- src/llama-graph.h | 4 ++- src/llama-model.cpp | 18 +++++++--- src/llama-model.h | 18 +++++----- src/models/openai-moe-iswa.cpp | 12 ++++--- 10 files changed, 112 insertions(+), 58 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 386e2a7e52e..bc3bf71bd61 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9839,13 +9839,17 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name.replace("_scales", ".weight")) self.repack_mxfp4(new_name, blocks0, data_torch) elif "mlp.experts.gate_up_proj_blocks" in name: - blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :] + # de-interleave and concatenate blocks: HF has interleaved layout + gate_blocks = data_torch[:, ::2, :, :] # gate at even indices + up_blocks = data_torch[:, 1::2, :, :] # up at odd indices + blocks0 = torch.cat([gate_blocks, up_blocks], dim=1) elif "mlp.experts.gate_up_proj_scales" in name: - scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :] - new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight")) - new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight")) - self.repack_mxfp4(new_name_gate, blocks0, scales0) - self.repack_mxfp4(new_name_up, blocks1, scales1) + # de-interleave and concatenate scales: HF has interleaved layout + gate_scales = data_torch[:, ::2, :] # gate at even indices + up_scales = data_torch[:, 1::2, :] # up at odd indices + scales0 = torch.cat([gate_scales, up_scales], dim=1) + new_name = self.map_tensor_name(name.replace("_scales", ".weight")) + self.repack_mxfp4(new_name, blocks0, scales0) return [] def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -9866,26 +9870,25 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # otherwise, it should already be repacked to ggml MXFP4 format return [] - # split the gate_up into gate and up + # keep gate_up merged (don't split into gate and up) + # HF has interleaved layout, we need concatenated layout for inference if "gate_up_proj" in name: if name.endswith("_bias"): - name_up = name.replace("gate_up_proj_bias", "up_proj.bias") - name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") - gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2] - return [ - (self.map_tensor_name(name_gate), gate_proj_bias), - (self.map_tensor_name(name_up), up_proj_bias) - ] + name = name.replace("gate_up_proj_bias", "gate_up_proj.bias") + # de-interleave and concatenate: [n_expert, 2*n_ff_interleaved] -> [n_expert, 2*n_ff_concatenated] + gate_bias = data_torch[..., ::2] # gate at even indices + up_bias = data_torch[..., 1::2] # up at odd indices + data_torch = torch.cat([gate_bias, up_bias], dim=-1) + return [(self.map_tensor_name(name), data_torch)] elif "_blocks" not in name and "_scales" not in name: logger.warning(f"{name} is not in MXFP4, performance may be degraded") - name_up = name.replace("gate_up_proj", "up_proj.weight") - name_gate = name.replace("gate_up_proj", "gate_proj.weight") + name = name.replace("gate_up_proj", "gate_up_proj.weight") data_torch = data_torch.transpose(-1, -2) - gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] - return [ - (self.map_tensor_name(name_gate), gate_proj_weight), - (self.map_tensor_name(name_up), up_proj_weight) - ] + # de-interleave and concatenate: [n_expert, 2*n_ff_interleaved, n_embd] -> [n_expert, 2*n_ff_concatenated, n_embd] + gate_weight = data_torch[:, ::2, :] # gate at even indices + up_weight = data_torch[:, 1::2, :] # up at odd indices + data_torch = torch.cat([gate_weight, up_weight], dim=1) + return [(self.map_tensor_name(name), data_torch)] else: # otherwise, it should already be repacked to ggml MXFP4 format return [] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 64c227799f4..0063ccf17bf 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -507,6 +507,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() + FFN_GATE_UP_EXP = auto() FFN_GATE_SHEXP = auto() FFN_DOWN_SHEXP = auto() FFN_UP_SHEXP = auto() @@ -912,6 +913,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps", MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n @@ -3017,6 +3019,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_SINKS, MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 64dd4ddca50..892d563e1be 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -520,6 +520,10 @@ class TensorNameMap: "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe ), + MODEL_TENSOR.FFN_GATE_UP_EXP: ( + "model.layers.{bid}.mlp.experts.gate_up_proj", # gpt-oss + ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 2ead965469a..99b271d4d54 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -334,6 +334,7 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" }, { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, @@ -2053,6 +2054,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_OUT, LLM_TENSOR_ATTN_SINKS, LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, @@ -2399,6 +2401,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, + {LLM_TENSOR_FFN_GATE_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 68ec6a18b18..1d100019ef8 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -356,6 +356,7 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN_EXPS, // merged experts LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_DOWN_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_UP_SHEXP, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 374ff1ebf3a..3cd98f83c46 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1027,7 +1027,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in) const { + ggml_tensor * probs_in, + ggml_tensor * gate_up_exps, + ggml_tensor * gate_up_exps_b) const { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN @@ -1166,30 +1168,52 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_weighted", il); } - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); + ggml_tensor * up = nullptr; + ggml_tensor * experts = nullptr; - if (up_exps_b) { - up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); - cb(up, "ffn_moe_up_biased", il); - } + if (gate_up_exps) { + // merged gate_up path: one mul_mat_id, then split into gate and up views + ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens] + cb(gate_up, "ffn_moe_gate_up", il); - ggml_tensor * experts = nullptr; - if (gate_exps) { - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + if (gate_up_exps_b) { + gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts); + cb(gate_up, "ffn_moe_gate_up_biased", il); + } + + const int64_t n_ff = gate_up->ne[0] / 2; + cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0); cb(cur, "ffn_moe_gate", il); + up = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], n_ff * gate_up->nb[0]); + cb(up, "ffn_moe_up", il); } else { - cur = up; - } + // separate gate and up path + up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); - if (gate_exps_b) { - cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); - cb(cur, "ffn_moe_gate_biased", il); + if (up_exps_b) { + up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); + cb(up, "ffn_moe_up_biased", il); + } + + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } + + if (gate_exps_b) { + cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); + cb(cur, "ffn_moe_gate_biased", il); + } } + const bool has_gate = gate_exps || gate_up_exps; + switch (type_op) { case LLM_FFN_SILU: - if (gate_exps) { + if (has_gate) { cur = ggml_swiglu_split(ctx0, cur, up); cb(cur, "ffn_moe_swiglu", il); } else { @@ -1197,7 +1221,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: - if (gate_exps) { + if (has_gate) { cur = ggml_geglu_split(ctx0, cur, up); cb(cur, "ffn_moe_geglu", il); } else { @@ -1213,7 +1237,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_swiglu_oai", il); } break; case LLM_FFN_RELU: - if (gate_exps) { + if (has_gate) { cur = ggml_reglu_split(ctx0, cur, up); cb(cur, "ffn_moe_reglu", il); } else { @@ -1221,7 +1245,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_relu", il); } break; case LLM_FFN_RELU_SQR: - if (gate_exps) { + if (has_gate) { // TODO: add support for gated squared relu GGML_ABORT("fatal error: gated squared relu not implemented"); } else { diff --git a/src/llama-graph.h b/src/llama-graph.h index 503ffd695aa..68f111fd6f4 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -742,7 +742,9 @@ struct llm_graph_context { float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in = nullptr) const; + ggml_tensor * probs_in = nullptr, + ggml_tensor * gate_up_exps = nullptr, + ggml_tensor * gate_up_exps_b = nullptr) const; // // inputs diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5de6493b9e9..ab76e9265dc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6412,9 +6412,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0); layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + + // try merged gate_up first, fall back to separate gate and up + layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, TENSOR_NOT_REQUIRED); + if (layer.ffn_gate_up_exps == nullptr) { + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + } // bias layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0); @@ -6423,9 +6428,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0); - layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0); layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0); - layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0); + + // try merged gate_up bias first, fall back to separate gate and up + layer.ffn_gate_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "bias", i), {n_ff_exp * 2, n_expert}, TENSOR_NOT_REQUIRED); + if (layer.ffn_gate_up_exps_b == nullptr) { + layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0); + layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0); + } } } break; case LLM_ARCH_LFM2: diff --git a/src/llama-model.h b/src/llama-model.h index 79200a0d97a..8bdd4670a47 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -274,14 +274,16 @@ struct llama_layer { struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE - struct ggml_tensor * ffn_gate_inp = nullptr; - struct ggml_tensor * ffn_gate_exps = nullptr; - struct ggml_tensor * ffn_down_exps = nullptr; - struct ggml_tensor * ffn_up_exps = nullptr; - struct ggml_tensor * ffn_gate_inp_b = nullptr; - struct ggml_tensor * ffn_gate_exps_b = nullptr; - struct ggml_tensor * ffn_down_exps_b = nullptr; - struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_inp = nullptr; + struct ggml_tensor * ffn_gate_exps = nullptr; + struct ggml_tensor * ffn_down_exps = nullptr; + struct ggml_tensor * ffn_up_exps = nullptr; + struct ggml_tensor * ffn_gate_up_exps = nullptr; + struct ggml_tensor * ffn_gate_inp_b = nullptr; + struct ggml_tensor * ffn_gate_exps_b = nullptr; + struct ggml_tensor * ffn_down_exps_b = nullptr; + struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_up_exps_b = nullptr; // ff shared expert (shexp) struct ggml_tensor * ffn_gate_inp_shexp = nullptr; diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp index dbe3ca1851f..8f0096a30df 100644 --- a/src/models/openai-moe-iswa.cpp +++ b/src/models/openai-moe-iswa.cpp @@ -88,16 +88,18 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, // MoE branch cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b, - model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b, - model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b, + model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b, + model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b, + model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b, + model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b, nullptr, n_expert, n_expert_used, LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT, - il); + il, + nullptr, // probs_in + model.layers[il].ffn_gate_up_exps, model.layers[il].ffn_gate_up_exps_b); cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp);