vllm-project · Lidang-Jiang · Mar 4, 2026 · Mar 5, 2026 · Mar 12, 2026 · lishunyang12
@@ -9,6 +9,7 @@
 import asyncio
 import fcntl
 import importlib
+import logging
 import multiprocessing as mp
 import os
 import queue
@@ -63,14 +64,69 @@
 logger = init_logger(__name__)
 
 
+class _RepoUtilsLocalPathFilter(logging.Filter):
+    """Drop log records whose message contains one of the ``_SUPPRESSED_FRAGMENTS`` HF Hub errors."""
+
+    # Substrings of the repo_utils error messages treated as noise for local model paths.
+    _SUPPRESSED_FRAGMENTS = ("Error retrieving file list", "Error retrieving safetensors")
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        """Return False (discard) when the formatted message contains a suppressed fragment."""
+        message = record.getMessage()
+        return not any(fragment in message for fragment in self._SUPPRESSED_FRAGMENTS)
+
+
+def _suppress_repo_utils_errors_for_local_path(model: str) -> None:
+    """Install the noise filter on vLLM's repo_utils logger, at most once, if ``model`` exists locally."""
+    repo_logger = logging.getLogger("vllm.transformers_utils.repo_utils")
+    if os.path.exists(model) and not any(isinstance(f, _RepoUtilsLocalPathFilter) for f in repo_logger.filters):
+        repo_logger.addFilter(_RepoUtilsLocalPathFilter())  # addFilter only de-dupes identical objects
+
+
 @contextmanager
 def _sequential_init_lock(engine_args: dict[str, Any], stage_init_timeout: int = 300):
     """Acquire device locks for sequential init if NVML is unavailable.
 
     If process-scoped memory tracking is available (NVML works), stages can
     safely initialize concurrently — each measures only its own GPU memory.
     Otherwise, fall back to file-based locks to serialize initialization.
+
+    A global file lock is always taken first, best effort (failure to open or lock the file is
+    logged and ignored), to serialize get_open_port() across stages (EADDRINUSE TOCTOU races).
     """
+    # NOTE(review): the lock is held until the body exits, so stages init serially even with NVML.
+    _GLOBAL_INIT_LOCK_PATH = "/tmp/vllm_omni_engine_init.lock"
+    global_lock_fd = -1  # -1 means there is no descriptor to close
+    lock_held = False
+    try:
+        global_lock_fd = os.open(_GLOBAL_INIT_LOCK_PATH, os.O_CREAT | os.O_RDWR, 0o644)
+        logger.info("Waiting for global engine init lock (%s)...", _GLOBAL_INIT_LOCK_PATH)
+        fcntl.flock(global_lock_fd, fcntl.LOCK_EX)  # blocks until the current holder releases
+        lock_held = True
+        logger.info("Acquired global engine init lock")
+    except OSError as e:
+        logger.warning("Failed to acquire global init lock: %s, proceeding anyway", e)
+
+    try:
+        with _sequential_init_lock_inner(engine_args, stage_init_timeout):
+            yield
+    finally:
+        if lock_held:
+            try:
+                fcntl.flock(global_lock_fd, fcntl.LOCK_UN)
+                logger.info("Released global engine init lock")
+            except OSError as e:
+                logger.warning("Failed to release global init lock: %s", e)
+        if global_lock_fd >= 0:  # close even when locking or unlocking failed
+            try:
+                os.close(global_lock_fd)
+            except OSError:
+                pass
+
+
+@contextmanager
+def _sequential_init_lock_inner(engine_args: dict[str, Any], stage_init_timeout: int = 300):
+    """Inner logic for sequential init locks (per-device granularity)."""
     from vllm_omni.worker.gpu_memory_utils import is_process_scoped_memory_available
 
     nvml_available = is_process_scoped_memory_available()
@@ -711,6 +767,7 @@ def _stage_worker(
     from vllm_omni.plugins import load_omni_general_plugins
 
     load_omni_general_plugins()
+    _suppress_repo_utils_errors_for_local_path(model)
     # IMPORTANT: Ensure vLLM's internal multiprocessing workers (e.g., GPUARWorker /
     # GPUARModelRunner) are spawned with a fork-safe method.
     # Mooncake / gRPC / RDMA and CUDA/NCCL can deadlock under fork-with-threads.
@@ -1122,6 +1179,7 @@ async def _stage_worker_async(
     from vllm_omni.plugins import load_omni_general_plugins
 
     load_omni_general_plugins()
+    _suppress_repo_utils_errors_for_local_path(model)
     # IMPORTANT: Ensure vLLM's internal multiprocessing workers (e.g., GPUARWorker /
     # GPUARModelRunner) are spawned with a fork-safe method.
     if _os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn":