From 200ef3c8eba50076fcef4b7e52be8910c4af7caf Mon Sep 17 00:00:00 2001
From: Christina
Date: Mon, 8 Dec 2025 18:00:24 -0600
Subject: [PATCH] fix: Lazy tokenizer init in StructuredOutputManager to
 prevent semaphore leak

GGUF models without precomputed merges trigger `build_merges_on_the_fly`
in the transformers library, which uses multiprocessing primitives. When
this happens in both the APIServer process (for request validation) and
the EngineCore subprocess (via StructuredOutputManager), the subprocess
leaks a semaphore and the server hangs indefinitely.

This change makes tokenizer initialization lazy in StructuredOutputManager:
- The tokenizer is only loaded when grammar_init() is first called
- Most inference requests don't use structured output, so the tokenizer
  in EngineCore is typically never loaded
- Requests that do use structured output load the tokenizer on demand
- A RuntimeError is now raised when skip_tokenizer_init=True but
  structured output is requested, giving a clear error message instead
  of a later AttributeError

The fix resolves the following symptoms:
- Server hangs after "resource_tracker: There appear to be 1 leaked
  semaphore objects to clean up at shutdown"
- Tokenizer merges being built twice (once in APIServer, once in
  EngineCore)
- GGUF models failing to start even though weights load successfully

Tested with bartowski/Phi-3.5-mini-instruct-GGUF (Q5_K_M).
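The core of the change is a thread-safe lazy property. The sketch below
shows the pattern in isolation (illustrative only; LazyResource and
_load() are hypothetical names, not vLLM code):

    import threading

    class LazyResource:
        def __init__(self) -> None:
            self._resource = None
            self._initialized = False
            self._lock = threading.Lock()

        @property
        def resource(self):
            # Double-checked locking: the unlocked read keeps the hot
            # path cheap; the re-check under the lock guarantees the
            # resource is built exactly once across threads.
            if not self._initialized:
                with self._lock:
                    if not self._initialized:
                        self._resource = self._load()
                        self._initialized = True
            return self._resource

        def _load(self):
            # Stand-in for the expensive work; in this patch that is
            # cached_tokenizer_from_config() plus the executor and
            # reasoner setup.
            return object()

The flag is set only after the resource is fully constructed, so a
reader taking the unlocked fast path can never observe a half-built
tokenizer.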
Signed-off-by: Christina
---
 vllm/v1/structured_output/__init__.py | 88 ++++++++++++++++++---------
 1 file changed, 60 insertions(+), 28 deletions(-)

diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 4c1d38110d7e..7264bca063e0 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 import multiprocessing
+import threading
 from collections.abc import Iterable
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING
@@ -63,39 +64,65 @@ def __init__(self, vllm_config: VllmConfig):
         max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
         self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
 
-        if not self.vllm_config.model_config.skip_tokenizer_init:
-            # The default max_workers if not specified is the number of
-            # CPUs * 5, which is way too high since these tasks are CPU-bound,
-            # not I/O bound. We also know we would never dominate CPU usage
-            # with just grammar compilation, so we set it to half the number
-            # of CPUs.
-            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
-            self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = cached_tokenizer_from_config(
-                model_config=self.vllm_config.model_config
-            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            reasoning_parser_plugin = (
-                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
-            )
-            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
-                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
-
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+        # Tokenizer is loaded lazily to avoid duplicate tokenizer
+        # initialization in multiprocess mode. For GGUF models, this prevents
+        # a semaphore leak that causes server hangs (tokenizer builds merges
+        # on the fly, which uses multiprocessing primitives that don't clean
+        # up in subprocesses).
+        self._tokenizer = None
+        self._tokenizer_initialized = False
+        self._tokenizer_init_lock = threading.Lock()
+        self.executor: ThreadPoolExecutor | None = None
 
         self.enable_in_reasoning = (
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )
 
+    @property
+    def tokenizer(self):
+        """Lazily initialize tokenizer when first accessed (thread-safe)."""
+        # Double-checked locking pattern for thread-safe lazy initialization
+        if not self._tokenizer_initialized:
+            with self._tokenizer_init_lock:
+                if not self._tokenizer_initialized:
+                    self._init_tokenizer()
+        return self._tokenizer
+
+    def _init_tokenizer(self):
+        """Initialize tokenizer and related components on first use."""
+        if self._tokenizer_initialized:
+            return
+
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            raise RuntimeError(
+                "Structured output requires a tokenizer, but "
+                "skip_tokenizer_init is enabled. Either disable "
+                "skip_tokenizer_init or avoid using structured output."
+            )
+
+        # The default max_workers if not specified is the number of
+        # CPUs * 5, which is way too high since these tasks are CPU-bound,
+        # not I/O bound. We also know we would never dominate CPU usage
+        # with just grammar compilation, so we set it to half the number
+        # of CPUs.
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tokenizer = cached_tokenizer_from_config(
+            model_config=self.vllm_config.model_config
+        )
+
+        reasoning_parser = self.vllm_config.structured_outputs_config.reasoning_parser
+        reasoning_parser_plugin = (
+            self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+        )
+        if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
+        if reasoning_parser:
+            reasoner_cls = ReasoningParserManager.get_reasoning_parser(reasoning_parser)
+            self.reasoner = reasoner_cls(tokenizer=self._tokenizer)
+
+        self._tokenizer_initialized = True
+
     def grammar_init(self, request: "Request") -> None:
         if request.structured_output_request is None:
             return
@@ -149,6 +176,11 @@ def grammar_init(self, request: "Request") -> None:
             raise ValueError(f"Unsupported structured output backend: {backend}")
 
         if self._use_async_grammar_compilation:
+            # Ensure tokenizer (and executor) is initialized
+            _ = self.tokenizer
+            assert self.executor is not None, (
+                "Executor should be initialized with tokenizer"
+            )
             grammar = self.executor.submit(self._create_grammar, request)
         else:
             grammar = self._create_grammar(request)  # type: ignore[assignment]
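
Note for reviewers: the resource_tracker message quoted above is the
generic symptom of a multiprocessing semaphore that a subprocess never
released. A minimal, vLLM-independent sketch of how it arises
(illustrative only; os._exit stands in for a subprocess dying without
running its cleanup handlers):

    import multiprocessing as mp
    import os

    def child() -> None:
        # Creating a SemLock-backed primitive registers a named
        # semaphore with the multiprocessing resource_tracker.
        _sem = mp.Semaphore(1)
        # Exit without cleanup, so the semaphore is never unregistered.
        os._exit(0)

    if __name__ == "__main__":
        proc = mp.get_context("spawn").Process(target=child)
        proc.start()
        proc.join()
        # On POSIX this typically prints at shutdown:
        #   resource_tracker: There appear to be 1 leaked semaphore
        #   objects to clean up at shutdown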