From 771c07882de184c48b1ed0f18dec20fde97610f4 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Sat, 10 Jan 2026 15:39:58 +0800
Subject: [PATCH 1/7] update

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/qwen3_moe.py | 41 ++++++++++++++++++-------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index f2f3546047aa..092f21c3e6f6 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -42,7 +42,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -160,7 +160,30 @@
             self.physical_expert_start + self.n_local_physical_experts
         )
 
-        self.experts = FusedMoE(
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate",
+        )
+
+        if config.shared_expert_intermediate_size > 0:
+            self.shared_expert = Qwen3MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.shared_expert_intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                expert_gate=self.shared_expert_gate,
+                prefix=f"{prefix}.shared_expert",
+            )
+        else:
+            self.shared_expert = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_expert,
+            gate=self.gate,
             num_experts=self.n_routed_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
@@ -175,14 +198,6 @@
             routing_method_type=RoutingMethodType.Renormalize,
         )
 
-        self.gate = ReplicatedLinear(
-            config.hidden_size,
-            config.num_experts,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate",
-        )
-
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         assert hidden_states.dim() <= 2, (
             "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs"
         )
@@ -205,6 +220,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                 final_hidden_states, 0
             )
             final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
+                final_hidden_states
+            )
 
         # return to 1d if input is 1d
         return final_hidden_states.squeeze(0) if is_input_1d else final_hidden_states
@@ -469,7 +488,7 @@ def forward(
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",

From 52354dec85e7900a5f8a4d3860bf2c818bc14e57 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Sun, 11 Jan 2026 15:15:29 +0800
Subject: [PATCH 2/7] fix

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/qwen3_moe.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 092f21c3e6f6..2ce40c1d4bad 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -168,10 +168,13 @@ def __init__(
             prefix=f"{prefix}.gate",
         )
 
-        if config.shared_expert_intermediate_size > 0:
+        shared_expert_intermediate_size = getattr(
+            config, "shared_expert_intermediate_size", 0
+        )
+        if shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen3MoeMLP(
                 hidden_size=config.hidden_size,
-                intermediate_size=config.shared_expert_intermediate_size,
+                intermediate_size=shared_expert_intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 reduce_results=False,
@@ -211,9 +214,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(
+        shared_out, fused_out = self.experts(
             hidden_states=hidden_states, router_logits=router_logits
         )
+        final_hidden_states = (
+            shared_out + fused_out if shared_out is not None else fused_out
+        )
 
         if self.is_sequence_parallel:
             final_hidden_states = tensor_model_parallel_all_gather(

From 5c868999907a13a5fa7b128c961a0d890aa3d381 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Sun, 11 Jan 2026 21:11:45 +0800
Subject: [PATCH 3/7] fix

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/qwen3_moe.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 2ce40c1d4bad..968002df4bcf 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -29,6 +29,7 @@
 from typing import Any
 
 import torch
+import torch.nn.functional as F
 from torch import nn
 
 from vllm.attention.layer import Attention
@@ -87,6 +88,7 @@ def __init__(
         hidden_act: str,
         quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
+        expert_gate: torch.nn.Linear | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -110,11 +112,15 @@ def __init__(
             f"Unsupported activation: {hidden_act}. Only silu is supported for now."
         )
         self.act_fn = SiluAndMul()
+        self.expert_gate = expert_gate
 
     def forward(self, x):
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)
+
+        if self.expert_gate is not None:
+            x = F.sigmoid(self.expert_gate(x)[0]) * x
         return x
 
 
@@ -172,6 +178,13 @@ def __init__(
             config, "shared_expert_intermediate_size", 0
         )
         if shared_expert_intermediate_size > 0:
+            self.shared_expert_gate = ReplicatedLinear(
+                config.hidden_size,
+                1,
+                bias=False,
+                quant_config=None,
+                prefix=f"{prefix}.shared_expert_gate",
+            )
             self.shared_expert = Qwen3MoeMLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=shared_expert_intermediate_size,
@@ -182,6 +195,7 @@ def __init__(
                 prefix=f"{prefix}.shared_expert",
             )
         else:
+            self.shared_expert_gate = None
             self.shared_expert = None
 
         self.experts = SharedFusedMoE(

From e3c875d582a665cd923d1c1369f773cd02fea3bc Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Sun, 11 Jan 2026 23:59:30 +0800
Subject: [PATCH 4/7] fix

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/qwen3_moe.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 968002df4bcf..daeecf8209f2 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -44,7 +44,6 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
-from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -116,12 +115,13 @@ def __init__(
 
     def forward(self, x):
         gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
+        out = self.act_fn(gate_up)
+        out, _ = self.down_proj(out)
 
         if self.expert_gate is not None:
-            x = F.sigmoid(self.expert_gate(x)[0]) * x
-        return x
+            out = F.sigmoid(self.expert_gate(x)[0]) * out
+
+        return out
 
 
 class Qwen3MoeSparseMoeBlock(nn.Module):
@@ -212,7 +212,6 @@ def __init__(
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
-            routing_method_type=RoutingMethodType.Renormalize,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:

From de59bc0265b40de984937e306162f31b2022f8b2 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Mon, 12 Jan 2026 00:07:29 +0800
Subject: [PATCH 5/7] reduce_results=False

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/qwen3_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index daeecf8209f2..567c031938f5 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -205,7 +205,7 @@ def __init__(
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.moe_intermediate_size,
-            reduce_results=True,
+            reduce_results=False,
             renormalize=config.norm_topk_prob,
             quant_config=quant_config,
             prefix=f"{prefix}.experts",

From f4ebf484ae0ee86b82e0ca1d6d62f73735080be7 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Mon, 12 Jan 2026 00:40:03 +0800
Subject: [PATCH 6/7] revert accidental change

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/qwen3_moe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 567c031938f5..e7ed469194b3 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -44,6 +44,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -212,6 +213,7 @@ def __init__(
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
+            routing_method_type=RoutingMethodType.Renormalize,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:

From 658eceab0eefea57b55208a1fe79b7e7450928e1 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Thu, 22 Jan 2026 22:48:50 +0800
Subject: [PATCH 7/7] pre-commit

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/qwen3_moe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 9b4dc5f5b99d..567c031938f5 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -44,7 +44,6 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
-from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
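
Read end to end, the series moves Qwen3MoeSparseMoeBlock from FusedMoE onto SharedFusedMoE: the router gate and a sigmoid-gated shared expert are built first and handed to SharedFusedMoE, which returns a (shared_out, fused_out) pair that the block's forward() sums (patch 2). The shared-expert gate is meant to be computed from the block input, which is why patch 4 renames the MLP-internal x to out — so self.expert_gate(x) sees the original input rather than the down_proj output. Setting reduce_results=False on both the shared expert and the fused experts (patches 1 and 5) defers the tensor-parallel all-reduce so the combined output is reduced once, in the maybe_all_reduce_tensor_model_parallel branch added in patch 1.

Below is a minimal, self-contained sketch of that data flow in plain PyTorch. It is illustrative only, not vLLM's SharedFusedMoE: the class name and sizes are made up, the per-expert Python loop stands in for the fused kernels, and all parallelism and quantization plumbing is omitted.

# toy_shared_moe.py -- illustrative sketch; names and sizes are assumptions.
import torch
import torch.nn.functional as F
from torch import nn


class ToySharedMoE(nn.Module):
    def __init__(self, hidden: int = 64, n_experts: int = 8, top_k: int = 2):
        super().__init__()
        self.top_k = top_k
        # Router (the `gate` ReplicatedLinear in the patches).
        self.router = nn.Linear(hidden, n_experts, bias=False)
        # Shared expert (the Qwen3MoeMLP) plus its scalar sigmoid gate.
        self.shared_expert = nn.Sequential(
            nn.Linear(hidden, 4 * hidden, bias=False),
            nn.SiLU(),
            nn.Linear(4 * hidden, hidden, bias=False),
        )
        self.shared_expert_gate = nn.Linear(hidden, 1, bias=False)
        # Routed experts; a Python loop stands in for the fused expert weights.
        self.experts = nn.ModuleList(
            nn.Linear(hidden, hidden, bias=False) for _ in range(n_experts)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Routed path: softmax over router logits, keep top-k, renormalize
        # (the RoutingMethodType.Renormalize behaviour).
        weights, idx = torch.topk(self.router(x).softmax(dim=-1), self.top_k, dim=-1)
        weights = weights / weights.sum(dim=-1, keepdim=True)
        fused_out = torch.zeros_like(x)
        for slot in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, slot] == e
                if mask.any():
                    fused_out[mask] += weights[mask, slot, None] * expert(x[mask])

        # Shared path: gate computed from the block *input* x, mirroring
        # `out = F.sigmoid(self.expert_gate(x)[0]) * out` after patch 4
        # (the [0] there unpacks ReplicatedLinear's (output, bias) tuple).
        shared_out = F.sigmoid(self.shared_expert_gate(x)) * self.shared_expert(x)

        # SharedFusedMoE hands back (shared_out, fused_out); the block sums them.
        return shared_out + fused_out


if __name__ == "__main__":
    block = ToySharedMoE()
    print(block(torch.randn(5, 64)).shape)  # torch.Size([5, 64])

The toy runs on CPU with the default sizes; the printed shape matches the input because the shared and routed paths are summed elementwise, exactly the shared_out + fused_out combination that patch 2 adds to the block's forward().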