From ad3c3ab89e205552c397555947dd69eb2232eb74 Mon Sep 17 00:00:00 2001
From: haosdent <haosdent@gmail.com>
Date: Mon, 23 Feb 2026 16:03:24 +0800
Subject: [PATCH 1/2] [Bugfix] Gracefully disable AllReduceFusionPass on GPUs
 without multicast support

Fixes #34891: Wrap flashinfer workspace creation in try/except to
prevent crash on GPUs without NVSwitch (e.g., NVLink bridge-only or
PCIe topologies). The pass is gracefully disabled with a warning
instead of crashing the process.

Signed-off-by: haosdent <haosdent@gmail.com>
---
 .../passes/fusion/allreduce_rms_fusion.py     | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index b613d4424ee3..111b72b03bbf 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -729,14 +729,24 @@ def __init__(self, config: VllmConfig) -> None:
             scope="global",
         )
 
-        self.workspace = flashinfer_comm.create_allreduce_fusion_workspace(
-            backend="trtllm",
-            world_size=self.tp_size,
-            rank=rank,
-            max_token_num=self.max_token_num,
-            hidden_dim=self.hidden_dim,
-            dtype=self.model_dtype,
-        )
+        try:
+            self.workspace = flashinfer_comm.create_allreduce_fusion_workspace(
+                backend="trtllm",
+                world_size=self.tp_size,
+                rank=rank,
+                max_token_num=self.max_token_num,
+                hidden_dim=self.hidden_dim,
+                dtype=self.model_dtype,
+            )
+        except RuntimeError as e:
+            logger.warning_once(
+                "AllReduce fusion pass is disabled: flashinfer workspace "
+                "creation failed: %s. This is expected on GPUs without "
+                "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). "
+                "Falling back to non-fused allreduce.",
+                str(e),
+            )
+            return
 
         global _FI_WORKSPACE
         _FI_WORKSPACE = self.workspace

From 74d894960b4585171685440a403a208a36f703f1 Mon Sep 17 00:00:00 2001
From: haosdent <haosdent@gmail.com>
Date: Tue, 24 Feb 2026 10:06:14 +0800
Subject: [PATCH 2/2] [Bugfix] Gracefully disable AllReduceFusionPass on GPUs
 without multicast support

Fixes #34891: Narrow the try/except around flashinfer workspace
creation so that only the multicast-related RuntimeError seen on GPUs
without NVSwitch (e.g., NVLink bridge-only or PCIe topologies) is
caught; other RuntimeErrors are re-raised. In the multicast case the
pass is gracefully disabled with a warning instead of crashing the
process.

Signed-off-by: haosdent <haosdent@gmail.com>
---
 vllm/compilation/passes/fusion/allreduce_rms_fusion.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index 111b72b03bbf..b6a1314af9ef 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -739,6 +739,8 @@ def __init__(self, config: VllmConfig) -> None:
                 dtype=self.model_dtype,
             )
         except RuntimeError as e:
+            if "multicast" not in str(e).lower():
+                raise
             logger.warning_once(
                 "AllReduce fusion pass is disabled: flashinfer workspace "
                 "creation failed: %s. This is expected on GPUs without "