From e5ac6d76485f6e78ba89fb2b58c369f3ab3e8aa5 Mon Sep 17 00:00:00 2001 From: RTCartist Date: Thu, 4 Jun 2026 13:27:39 +0800 Subject: [PATCH] [Bugfix] Fix ZMQ port TOCTOU race in shm_broadcast.py Replace `get_open_port()` with late binding (port 0) for the remote XPUB socket in `MessageQueue.__init__`, then read back the actual bound address via `zmq.LAST_ENDPOINT`. This eliminates the window between port discovery and socket bind where another process could claim the port. Follows the same pattern already used in the DP coordinator (PR #37452). Closes #28498 Signed-off-by: RTCartist --- .../distributed/device_communicators/shm_broadcast.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 9482568461c4..f7730c7b47f9 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -32,7 +32,6 @@ from vllm.platforms import current_platform from vllm.utils.network_utils import ( get_ip, - get_open_port, get_open_zmq_inproc_path, get_open_zmq_ipc_path, is_valid_ipv6_address, @@ -425,14 +424,16 @@ def __init__( connect_ip = get_ip() self.remote_socket = context.socket(XPUB) self.remote_socket.setsockopt(XPUB_VERBOSE, True) - remote_subscribe_port = get_open_port() if is_valid_ipv6_address(connect_ip): self.remote_socket.setsockopt(IPV6, 1) remote_addr_ipv6 = True connect_ip = f"[{connect_ip}]" - socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}" - self.remote_socket.bind(socket_addr) - remote_subscribe_addr = f"tcp://{connect_ip}:{remote_subscribe_port}" + # Bind to port 0 so the OS assigns a port atomically, + # avoiding TOCTOU race with get_open_port(). (See #28498) + self.remote_socket.bind(f"tcp://{connect_ip}:0") + remote_subscribe_addr = ( + self.remote_socket.getsockopt(zmq.LAST_ENDPOINT).decode() + ) else: remote_subscribe_addr = None self.remote_socket = None