From 06ba3ab777013ae2554dbf816a1810a78bbd8b30 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Tue, 14 Apr 2026 17:04:53 -0700 Subject: [PATCH 1/5] Prevent orphaned process on NCCL destroy Signed-off-by: Jeffrey Wang --- vllm/distributed/device_communicators/pynccl.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 6ac3b9ea3c7c..0de727522c53 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -147,8 +147,14 @@ def __init__( def destroy(self): if self.available and not self.disabled: - with torch.accelerator.device_index(self.device.index): - self.nccl.ncclCommDestroy(self.comm) + # Note: we intentionally do not call ncclCommDestroy here. + # ncclCommDestroy is a blocking collective that requires all + # ranks to participate. During uncoordinated shutdown, peer + # ranks may already be gone, causing ncclCommDestroy to hang + # indefinitely and orphan GPU worker processes. The OS reclaims + # NCCL resources when the process exits, so explicitly destruction + # is unnecessary. + self.comm = None self.available = False self.disabled = True From 7271f7dd34770bdf3ba123330b5bfe11206963f8 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Tue, 14 Apr 2026 21:07:25 -0700 Subject: [PATCH 2/5] Gemini feedback Signed-off-by: Jeffrey Wang --- vllm/distributed/device_communicators/pynccl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 0de727522c53..85aa0baa2596 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -152,11 +152,11 @@ def destroy(self): # ranks to participate. 
During uncoordinated shutdown, peer # ranks may already be gone, causing ncclCommDestroy to hang # indefinitely and orphan GPU worker processes. The OS reclaims - # NCCL resources when the process exits, so explicitly destruction + # NCCL resources when the process exits, so explicit destruction # is unnecessary. - self.comm = None - self.available = False self.disabled = True + self.available = False + self.comm = None def all_reduce( self, From 68a5cf88f28841d759c4033a465b068097a21c04 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Tue, 14 Apr 2026 21:23:04 -0700 Subject: [PATCH 3/5] Restore original attribute reset order in destroy() Signed-off-by: Jeffrey Wang --- vllm/distributed/device_communicators/pynccl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 85aa0baa2596..3347c7284e39 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -154,9 +154,9 @@ def destroy(self): # indefinitely and orphan GPU worker processes. The OS reclaims # NCCL resources when the process exits, so explicit destruction # is unnecessary. - self.disabled = True - self.available = False self.comm = None + self.available = False + self.disabled = True def all_reduce( self, From 7c9a1f393c2a266639eacfbeb2f609305eb2dc5d Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Wed, 15 Apr 2026 11:57:33 -0700 Subject: [PATCH 4/5] Fix mypy Signed-off-by: Jeffrey Wang --- vllm/distributed/device_communicators/pynccl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 3347c7284e39..2bbe49c8e2d8 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -154,7 +154,7 @@ def destroy(self): # indefinitely and orphan GPU worker processes.
The OS reclaims # NCCL resources when the process exits, so explicit destruction # is unnecessary. - self.comm = None + self.comm = ncclComm_t() self.available = False self.disabled = True From 63f9254661ed9477d8c782cbdea323560023a5f2 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Thu, 23 Apr 2026 16:14:56 -0700 Subject: [PATCH 5/5] Use NCCL abort to clean up collectives Signed-off-by: Jeffrey Wang --- .../device_communicators/pynccl.py | 23 ++++++++++++------- .../device_communicators/pynccl_wrapper.py | 9 ++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 2bbe49c8e2d8..a3577b577f8f 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -3,6 +3,8 @@ # ===================== import region ===================== +import threading + import torch import torch.distributed as dist from torch.distributed import ProcessGroup, ReduceOp @@ -147,14 +149,19 @@ def __init__( def destroy(self): if self.available and not self.disabled: - # Note: we intentionally do not call ncclCommDestroy here. - # ncclCommDestroy is a blocking collective that requires all - # ranks to participate. During uncoordinated shutdown, peer - # ranks may already be gone, causing ncclCommDestroy to hang - # indefinitely and orphan GPU worker processes. The OS reclaims - # NCCL resources when the process exits, so explicit destruction - # is unnecessary. - self.comm = ncclComm_t() + # ncclCommAbort can block until all CUDA graphs that + # captured NCCL ops on this comm are destroyed — and + # those graphs are released later in this same main- + # thread teardown, so a direct call here self-deadlocks. + # Run it in a daemon thread with a timeout: the main + # thread proceeds, the graphs drop, and the abort returns. 
+ def _abort(): + with torch.accelerator.device_index(self.device.index): + self.nccl.ncclCommAbort(self.comm) + + abort_thread = threading.Thread(target=_abort, daemon=True) + abort_thread.start() + abort_thread.join(timeout=5.0) self.available = False self.disabled = True diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 57c7397e01b6..5ca8cc7c77f4 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -290,6 +290,12 @@ class NCCLLibrary: # it is better not to call it at all. # ncclResult_t ncclCommDestroy(ncclComm_t comm); Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]), + # ncclCommAbort frees resources associated with the communicator + # without requiring a collective synchronization. Unlike + # ncclCommDestroy, it is safe to call during an uncoordinated + # shutdown when peer ranks may already be gone. + # ncclResult_t ncclCommAbort(ncclComm_t comm); + Function("ncclCommAbort", ncclResult_t, [ncclComm_t]), # ncclResult_t ncclGroupStart(); Function("ncclGroupStart", ncclResult_t, []), # ncclResult_t ncclGroupEnd(); @@ -548,6 +554,9 @@ def ncclBroadcast( def ncclCommDestroy(self, comm: ncclComm_t) -> None: self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm)) + def ncclCommAbort(self, comm: ncclComm_t) -> None: + self.NCCL_CHECK(self._funcs["ncclCommAbort"](comm)) + def ncclGroupStart(self) -> None: self.NCCL_CHECK(self._funcs["ncclGroupStart"]())