From 6af0337c94eba439ad0d3f00ba1011c3b1baccce Mon Sep 17 00:00:00 2001
From: Rohan138 <rohanpotdar138@gmail.com>
Date: Tue, 28 Apr 2026 14:58:05 -0500
Subject: [PATCH 1/2] Fix apply_monolithic input_ids/expert_map args for quark w4a8 gptoss

Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
---
 vllm/model_executor/layers/quantization/quark/quark_moe.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index d92acb85c26..51b09263b74 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1577,10 +1577,10 @@ def is_monolithic(self) -> bool:
 
     def apply_monolithic(
         self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        expert_map: torch.Tensor | None = None,
+        input_ids: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if layer.enable_eplb:
             raise NotImplementedError(
@@ -1601,7 +1601,7 @@ def apply_monolithic(
             topk=layer.top_k,
             renormalize=layer.renormalize,
             global_num_experts=layer.global_num_experts,
-            expert_map=expert_map,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
             unpadded_N_w1=self.moe.intermediate_size_per_partition_unpadded * 2,

From f9f72aa775833ed1905c1a6d6156a1f427eea520 Mon Sep 17 00:00:00 2001
From: Rohan138 <rohanpotdar138@gmail.com>
Date: Tue, 28 Apr 2026 15:50:36 -0500
Subject: [PATCH 2/2] Fix apply_monolithic return annotation and EPLB error class name

Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
---
 vllm/model_executor/layers/quantization/quark/quark_moe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 51b09263b74..fd28d3c33f0 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1581,10 +1581,10 @@ def apply_monolithic(
         x: torch.Tensor,
         router_logits: torch.Tensor,
         input_ids: torch.Tensor | None = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+    ) -> torch.Tensor:
         if layer.enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet."
+                f"EPLB not supported for {self.__class__.__name__} yet."
             )
 
         from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (  # noqa: E501