diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
index 254153a27c48..e49d8b2624ab 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
@@ -67,7 +67,7 @@ def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
     def output_is_reduced(self) -> bool:
-        return False
+        return True
 
     def topk_indices_dtype(self) -> torch.dtype | None:
         return torch.int32
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
index b975e6cc5e15..f89f375709e3 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
@@ -50,7 +50,7 @@ def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
     def output_is_reduced(self) -> bool:
-        return False
+        return True
 
     def _apply_router_weight_on_input(
         self,