diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 4dacf4980ef7..f753647081c8 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -121,7 +121,13 @@ def enable_act_fusion(cfg: "VllmConfig") -> bool: def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool: - """Enable if TP > 1 and Hopper/Blackwell and flashinfer installed.""" + """Enable if TP > 1, PP == 1, Hopper/Blackwell, and flashinfer installed. + + Gated off for PP > 1: the fused op's GPU-side peer-signal spin-wait + assumes byte-identical kernel launches across TP peers, but concurrent + independent warmup of multiple TP subgroups lets ranks pick divergent + FlashInfer launch configs and deadlock. + """ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer @@ -134,6 +140,7 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool: return ( cfg.parallel_config.tensor_parallel_size > 1 + and cfg.parallel_config.pipeline_parallel_size == 1 and current_platform.is_cuda() and has_flashinfer() and (