From 06ba3ab777013ae2554dbf816a1810a78bbd8b30 Mon Sep 17 00:00:00 2001
From: Jeffrey Wang <jeffreywang@anyscale.com>
Date: Tue, 14 Apr 2026 17:04:53 -0700
Subject: [PATCH 1/5] Prevent orphaned process on NCCL destroy

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
---
 vllm/distributed/device_communicators/pynccl.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 6ac3b9ea3c7c..0de727522c53 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -147,8 +147,14 @@ def __init__(
 
     def destroy(self):
         if self.available and not self.disabled:
-            with torch.accelerator.device_index(self.device.index):
-                self.nccl.ncclCommDestroy(self.comm)
+            # Note: we intentionally do not call ncclCommDestroy here.
+            # ncclCommDestroy is a blocking collective that requires all
+            # ranks to participate. During uncoordinated shutdown, peer
+            # ranks may already be gone, causing ncclCommDestroy to hang
+            # indefinitely and orphan GPU worker processes. The OS reclaims
+            # NCCL resources when the process exits, so explicitly destruction
+            # is unnecessary.
+            self.comm = None
             self.available = False
             self.disabled = True
 

From 7271f7dd34770bdf3ba123330b5bfe11206963f8 Mon Sep 17 00:00:00 2001
From: Jeffrey Wang <jeffreywang@anyscale.com>
Date: Tue, 14 Apr 2026 21:07:25 -0700
Subject: [PATCH 2/5] Address Gemini feedback: fix comment typo, reorder state resets in destroy()

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
---
 vllm/distributed/device_communicators/pynccl.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 0de727522c53..85aa0baa2596 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -152,11 +152,11 @@ def destroy(self):
             # ranks to participate. During uncoordinated shutdown, peer
             # ranks may already be gone, causing ncclCommDestroy to hang
             # indefinitely and orphan GPU worker processes. The OS reclaims
-            # NCCL resources when the process exits, so explicitly destruction
+            # NCCL resources when the process exits, so explicit destruction
             # is unnecessary.
-            self.comm = None
-            self.available = False
             self.disabled = True
+            self.available = False
+            self.comm = None
 
     def all_reduce(
         self,

From 68a5cf88f28841d759c4033a465b068097a21c04 Mon Sep 17 00:00:00 2001
From: Jeffrey Wang <jeffreywang@anyscale.com>
Date: Tue, 14 Apr 2026 21:23:04 -0700
Subject: [PATCH 3/5] Restore original order of state resets in destroy()

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
---
 vllm/distributed/device_communicators/pynccl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 85aa0baa2596..3347c7284e39 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -154,9 +154,9 @@ def destroy(self):
             # indefinitely and orphan GPU worker processes. The OS reclaims
             # NCCL resources when the process exits, so explicit destruction
             # is unnecessary.
-            self.disabled = True
-            self.available = False
             self.comm = None
+            self.available = False
+            self.disabled = True
 
     def all_reduce(
         self,

From 7c9a1f393c2a266639eacfbeb2f609305eb2dc5d Mon Sep 17 00:00:00 2001
From: Jeffrey Wang <jeffreywang@anyscale.com>
Date: Wed, 15 Apr 2026 11:57:33 -0700
Subject: [PATCH 4/5] Fix mypy: reset comm to ncclComm_t() instead of None

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
---
 vllm/distributed/device_communicators/pynccl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 3347c7284e39..2bbe49c8e2d8 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -154,7 +154,7 @@ def destroy(self):
             # indefinitely and orphan GPU worker processes. The OS reclaims
             # NCCL resources when the process exits, so explicit destruction
             # is unnecessary.
-            self.comm = None
+            self.comm = ncclComm_t()
             self.available = False
             self.disabled = True
 

From 63f9254661ed9477d8c782cbdea323560023a5f2 Mon Sep 17 00:00:00 2001
From: Jeffrey Wang <jeffreywang@anyscale.com>
Date: Thu, 23 Apr 2026 16:14:56 -0700
Subject: [PATCH 5/5] Use NCCL abort to clean up collectives

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
---
 .../device_communicators/pynccl.py            | 23 ++++++++++++-------
 .../device_communicators/pynccl_wrapper.py    |  9 ++++++++
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 2bbe49c8e2d8..a3577b577f8f 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -3,6 +3,8 @@
 
 
 # ===================== import region =====================
+import threading
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup, ReduceOp
@@ -147,14 +149,19 @@ def __init__(
 
     def destroy(self):
         if self.available and not self.disabled:
-            # Note: we intentionally do not call ncclCommDestroy here.
-            # ncclCommDestroy is a blocking collective that requires all
-            # ranks to participate. During uncoordinated shutdown, peer
-            # ranks may already be gone, causing ncclCommDestroy to hang
-            # indefinitely and orphan GPU worker processes. The OS reclaims
-            # NCCL resources when the process exits, so explicit destruction
-            # is unnecessary.
-            self.comm = ncclComm_t()
+            # ncclCommAbort can block until all CUDA graphs that
+            # captured NCCL ops on this comm are destroyed — and
+            # those graphs are released later in this same main-
+            # thread teardown, so a direct call here self-deadlocks.
+            # Run it in a daemon thread with a timeout: the main
+            # thread proceeds, the graphs drop, and the abort returns.
+            def _abort():
+                with torch.accelerator.device_index(self.device.index):
+                    self.nccl.ncclCommAbort(self.comm)
+
+            abort_thread = threading.Thread(target=_abort, daemon=True)
+            abort_thread.start()
+            abort_thread.join(timeout=5.0)
             self.available = False
             self.disabled = True
 
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 57c7397e01b6..5ca8cc7c77f4 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -290,6 +290,12 @@ class NCCLLibrary:
         # it is better not to call it at all.
         # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
         Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
+        # ncclCommAbort frees resources associated with the communicator
+        # without requiring a collective synchronization. Unlike
+        # ncclCommDestroy, it is safe to call during an uncoordinated
+        # shutdown when peer ranks may already be gone.
+        # ncclResult_t  ncclCommAbort(ncclComm_t comm);
+        Function("ncclCommAbort", ncclResult_t, [ncclComm_t]),
         # ncclResult_t ncclGroupStart();
         Function("ncclGroupStart", ncclResult_t, []),
         # ncclResult_t ncclGroupEnd();
@@ -548,6 +554,9 @@ def ncclBroadcast(
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
+    def ncclCommAbort(self, comm: ncclComm_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommAbort"](comm))
+
     def ncclGroupStart(self) -> None:
         self.NCCL_CHECK(self._funcs["ncclGroupStart"]())