diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
index 00be12780a16..d6a6c0502c77 100644
--- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
+++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
@@ -335,11 +335,16 @@ def _maybe_reduce_shared_expert_output(
         """All-reduce shared expert output when the combine kernel already
         reduced fused output.
 
-        This is the "early" all-reduce path. When the combine kernel produces
-        already-reduced fused output, shared output must be reduced separately
-        to match.
+        * If the combine kernel does the reduction for fused_output, reduce
+          shared_output separately. Otherwise, reduce both together later.
+        * If we have SP (sequence parallel: TP=N, DP=M, EP), there is a
+          separate all-gather step handled in the model.
         """
-        if shared_output is not None and self._fused_output_is_reduced:
+        if (
+            shared_output is not None
+            and not self.moe_config.is_sequence_parallel
+            and self._fused_output_is_reduced
+        ):
             shared_output = tensor_model_parallel_all_reduce(shared_output)
         return shared_output
 
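
For readers outside the vLLM codebase, here is a minimal, self-contained sketch of the decision this patch implements. It is not vLLM's actual code: `maybe_reduce_shared_expert_output`, its keyword parameters, `fake_all_reduce`, and the demo values are hypothetical stand-ins for the real `tensor_model_parallel_all_reduce`, `self.moe_config.is_sequence_parallel`, and `self._fused_output_is_reduced`.

```python
# Hypothetical sketch of the early-reduce decision; all names here are
# simplified stand-ins, not vLLM's API.
from typing import Callable, Optional


def maybe_reduce_shared_expert_output(
    shared_output: Optional[list[float]],
    *,
    fused_output_is_reduced: bool,
    is_sequence_parallel: bool,
    all_reduce_fn: Callable[[list[float]], list[float]],
) -> Optional[list[float]]:
    # Early all-reduce only when all three hold:
    #   1. there is a shared expert output to reduce,
    #   2. SP is off -- under SP the model performs a separate all-gather
    #      later, so the early all-reduce is skipped here,
    #   3. the combine kernel already reduced the fused output, so the
    #      shared output must be reduced separately to match it.
    if (
        shared_output is not None
        and not is_sequence_parallel
        and fused_output_is_reduced
    ):
        shared_output = all_reduce_fn(shared_output)
    # Otherwise, fused and shared outputs are reduced together later.
    return shared_output


# Demo: fake a 2-way TP group where each rank holds a partial sum.
rank_partials = [[1.0, 2.0], [3.0, 4.0]]
fake_all_reduce = lambda _local: [sum(col) for col in zip(*rank_partials)]

print(maybe_reduce_shared_expert_output(
    rank_partials[0],
    fused_output_is_reduced=True,
    is_sequence_parallel=False,
    all_reduce_fn=fake_all_reduce,
))  # [4.0, 6.0] -- reduced now, matching the already-reduced fused output

print(maybe_reduce_shared_expert_output(
    rank_partials[0],
    fused_output_is_reduced=True,
    is_sequence_parallel=True,
    all_reduce_fn=fake_all_reduce,
))  # [1.0, 2.0] -- left as-is; the model's all-gather handles SP later
```

The second call shows the behavior the patch adds: under sequence parallelism the shared output is no longer all-reduced early, since aggregation is deferred to the model's separate all-gather step.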