diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index d92acb85c265..fd28d3c33f04 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1577,14 +1577,14 @@ def is_monolithic(self) -> bool:
 
     def apply_monolithic(
         self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        expert_map: torch.Tensor | None = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        input_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         if layer.enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet."
+                f"EPLB not supported for {self.__class__.__name__} yet."
             )
 
         from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (  # noqa: E501
@@ -1601,7 +1601,7 @@ def apply_monolithic(
             topk=layer.top_k,
             renormalize=layer.renormalize,
             global_num_experts=layer.global_num_experts,
-            expert_map=expert_map,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
             unpadded_N_w1=self.moe.intermediate_size_per_partition_unpadded * 2,
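
Note (not part of the diff): after this change, `apply_monolithic` reads `expert_map` from the `FusedMoE` layer itself instead of taking it as a parameter, so a caller only passes the layer, the activations, and the router logits. A minimal sketch of the updated call under that assumption; the names `method`, `moe_layer`, `hidden_states`, and `router_logits` are hypothetical, not taken from the diff:

    # Hypothetical caller sketch; variable names are illustrative only.
    # The expert map is now taken from the FusedMoE layer rather than
    # being threaded through as an explicit argument.
    output = method.apply_monolithic(
        layer=moe_layer,              # FusedMoE; carries expert_map, top_k, etc.
        x=hidden_states,              # per-token activations
        router_logits=router_logits,  # routing scores over global experts
    )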