Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions vllm/v1/engine/core_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,10 @@ def __init__(
)

with launch_core_engines(
vllm_config, executor_class, log_stats, addresses
vllm_config,
executor_class,
log_stats,
addresses,
) as (engine_manager, coordinator, addresses):
self.resources.coordinator = coordinator
self.resources.engine_manager = engine_manager
Expand Down Expand Up @@ -636,10 +639,10 @@ def __init__(

def shutdown(self, timeout: float | None = None) -> None:
"""Shutdown engine manager under timeout and clean up resources."""
if self._finalizer.detach() is not None:
if self.resources.engine_manager is not None:
self.resources.engine_manager.shutdown(timeout=timeout)
self.resources()
self._finalizer.detach()
if self.resources.engine_manager is not None:
self.resources.engine_manager.shutdown(timeout=timeout)
self.resources()
Comment on lines +642 to +645
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

By removing the check on the return value of self._finalizer.detach(), the shutdown method is no longer idempotent. If this method is called multiple times, it will attempt to shut down the engine manager and clean up resources repeatedly. This could lead to unexpected errors if the underlying shutdown and cleanup operations are not idempotent. It's safer to restore the check to ensure shutdown logic runs only once.

Suggested change
self._finalizer.detach()
if self.resources.engine_manager is not None:
self.resources.engine_manager.shutdown(timeout=timeout)
self.resources()
if self._finalizer.detach() is not None:
if self.resources.engine_manager is not None:
self.resources.engine_manager.shutdown(timeout=timeout)
self.resources()


def _format_exception(self, e: Exception) -> Exception:
"""If errored, use EngineDeadError so root cause is clear."""
Expand Down Expand Up @@ -683,7 +686,7 @@ def monitor_engine_cores():
sentinels = [proc.sentinel for proc in engine_processes]
died = multiprocessing.connection.wait(sentinels)
_self = self_ref()
if not _self or not _self._finalizer.alive or _self.resources.engine_dead:
if not _self or _self.resources.engine_dead:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Removing the not _self._finalizer.alive check re-introduces a race condition. During a normal shutdown, engine processes are terminated. This monitor thread can wake up and incorrectly interpret this as an unexpected crash, leading to spurious error logs. The check for _self._finalizer.alive is crucial to distinguish between a controlled shutdown and an actual failure.

Suggested change
if not _self or _self.resources.engine_dead:
if not _self or not _self._finalizer.alive or _self.resources.engine_dead:

return
_self.resources.engine_dead = True
proc_name = next(
Expand Down