From ad3c3ab89e205552c397555947dd69eb2232eb74 Mon Sep 17 00:00:00 2001 From: haosdent Date: Mon, 23 Feb 2026 16:03:24 +0800 Subject: [PATCH 1/2] [Bugfix] Gracefully disable AllReduceFusionPass on GPUs without multicast support Fixes #34891: Wrap flashinfer workspace creation in try/except to prevent crash on GPUs without NVSwitch (e.g., NVLink bridge-only or PCIe topologies). The pass is gracefully disabled with a warning instead of crashing the process. Signed-off-by: haosdent --- .../passes/fusion/allreduce_rms_fusion.py | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py index b613d4424ee3..111b72b03bbf 100644 --- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py @@ -729,14 +729,24 @@ def __init__(self, config: VllmConfig) -> None: scope="global", ) - self.workspace = flashinfer_comm.create_allreduce_fusion_workspace( - backend="trtllm", - world_size=self.tp_size, - rank=rank, - max_token_num=self.max_token_num, - hidden_dim=self.hidden_dim, - dtype=self.model_dtype, - ) + try: + self.workspace = flashinfer_comm.create_allreduce_fusion_workspace( + backend="trtllm", + world_size=self.tp_size, + rank=rank, + max_token_num=self.max_token_num, + hidden_dim=self.hidden_dim, + dtype=self.model_dtype, + ) + except RuntimeError as e: + logger.warning_once( + "AllReduce fusion pass is disabled: flashinfer workspace " + "creation failed: %s. This is expected on GPUs without " + "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). " + "Falling back to non-fused allreduce.", + str(e), + ) + return global _FI_WORKSPACE _FI_WORKSPACE = self.workspace From 74d894960b4585171685440a403a208a36f703f1 Mon Sep 17 00:00:00 2001 From: haosdent Date: Tue, 24 Feb 2026 10:06:14 +0800 Subject: [PATCH 2/2] [Bugfix] Gracefully disable AllReduceFusionPass on GPUs without multicast support Fixes #34891: Wrap flashinfer workspace creation in try/except to prevent crash on GPUs without NVSwitch (e.g., NVLink bridge-only or PCIe topologies). Only the specific multicast-related RuntimeError is caught; other RuntimeErrors are re-raised. The pass is gracefully disabled with a warning instead of crashing the process. Signed-off-by: haosdent --- vllm/compilation/passes/fusion/allreduce_rms_fusion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py index 111b72b03bbf..b6a1314af9ef 100644 --- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py @@ -739,6 +739,8 @@ def __init__(self, config: VllmConfig) -> None: dtype=self.model_dtype, ) except RuntimeError as e: + if "multicast" not in str(e).lower(): + raise logger.warning_once( "AllReduce fusion pass is disabled: flashinfer workspace " "creation failed: %s. This is expected on GPUs without "