From 6af0337c94eba439ad0d3f00ba1011c3b1baccce Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 28 Apr 2026 14:58:05 -0500 Subject: [PATCH 1/2] fix input_ids and expert_map args for quark w4a8 gptoss Signed-off-by: Rohan138 --- vllm/model_executor/layers/quantization/quark/quark_moe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index d92acb85c26..51b09263b74 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1577,10 +1577,10 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - expert_map: torch.Tensor | None = None, + input_ids: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if layer.enable_eplb: raise NotImplementedError( @@ -1601,7 +1601,7 @@ def apply_monolithic( topk=layer.top_k, renormalize=layer.renormalize, global_num_experts=layer.global_num_experts, - expert_map=expert_map, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, apply_router_weight_on_input=layer.apply_router_weight_on_input, unpadded_N_w1=self.moe.intermediate_size_per_partition_unpadded * 2, From f9f72aa775833ed1905c1a6d6156a1f427eea520 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 28 Apr 2026 15:50:36 -0500 Subject: [PATCH 2/2] fix return and error string Signed-off-by: Rohan138 --- vllm/model_executor/layers/quantization/quark/quark_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 51b09263b74..fd28d3c33f0 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1581,10 +1581,10 @@ def apply_monolithic( x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: if layer.enable_eplb: raise NotImplementedError( - "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet." + f"EPLB not supported for {self.__class__.__name__} yet." ) from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501