From 200ef3c8eba50076fcef4b7e52be8910c4af7caf Mon Sep 17 00:00:00 2001
From: Christina
Date: Mon, 8 Dec 2025 18:00:24 -0600
Subject: [PATCH] fix: Lazy tokenizer init in StructuredOutputManager to
 prevent semaphore leak

GGUF models without precomputed merges trigger `build_merges_on_the_fly`
in the transformers library, which uses multiprocessing primitives. When
this happens in both the APIServer process (for request validation) and
the EngineCore subprocess (via StructuredOutputManager), the subprocess
leaks a semaphore and the server hangs indefinitely.

This change makes tokenizer initialization lazy in StructuredOutputManager:
- The tokenizer is only loaded when grammar_init() is first called
- Most inference requests don't use structured output, so the tokenizer
  in EngineCore is typically never loaded
- Requests that do use structured output load the tokenizer on demand
- A RuntimeError is now raised when skip_tokenizer_init=True but
  structured output is requested, giving a clear error message instead
  of a later AttributeError

The fix resolves the following symptoms:
- Server hangs after "resource_tracker: There appear to be 1 leaked
  semaphore objects to clean up at shutdown"
- Tokenizer merges being built twice (once in APIServer, once in
  EngineCore)
- GGUF models failing to start even though weights load successfully

Tested with bartowski/Phi-3.5-mini-instruct-GGUF (Q5_K_M).
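The core of the change is a thread-safe lazy property. The sketch below
shows the pattern in isolation (illustrative only; LazyResource and
_load() are hypothetical names, not vLLM code):

    import threading

    class LazyResource:
        def __init__(self) -> None:
            self._resource = None
            self._initialized = False
            self._lock = threading.Lock()

        @property
        def resource(self):
            # Double-checked locking: the unlocked read keeps the hot
            # path cheap; the re-check under the lock guarantees the
            # resource is built exactly once across threads.
            if not self._initialized:
                with self._lock:
                    if not self._initialized:
                        self._resource = self._load()
                        self._initialized = True
            return self._resource

        def _load(self):
            # Stand-in for the expensive work; in this patch that is
            # cached_tokenizer_from_config() plus the executor and
            # reasoner setup.
            return object()

The flag is set only after the resource is fully constructed, so a
reader taking the unlocked fast path can never observe a half-built
tokenizer.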
Signed-off-by: Christina
---
 vllm/v1/structured_output/__init__.py | 88 ++++++++++++++++++---------
 1 file changed, 60 insertions(+), 28 deletions(-)

diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 4c1d38110d7e..7264bca063e0 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 import multiprocessing
+import threading
 from collections.abc import Iterable
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING
@@ -63,39 +64,65 @@ def __init__(self, vllm_config: VllmConfig):
         max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
         self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
 
-        if not self.vllm_config.model_config.skip_tokenizer_init:
-            # The default max_workers if not specified is the number of
-            # CPUs * 5, which is way too high since these tasks are CPU-bound,
-            # not I/O bound. We also know we would never dominate CPU usage
-            # with just grammar compilation, so we set it to half the number
-            # of CPUs.
-            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
-            self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = cached_tokenizer_from_config(
-                model_config=self.vllm_config.model_config
-            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            reasoning_parser_plugin = (
-                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
-            )
-            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
-                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
-
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+        # Tokenizer is loaded lazily to avoid duplicate tokenizer
+        # initialization in multiprocess mode. For GGUF models, this prevents
+        # a semaphore leak that causes server hangs (tokenizer builds merges
+        # on the fly, which uses multiprocessing primitives that don't clean
+        # up in subprocesses).
+        self._tokenizer = None
+        self._tokenizer_initialized = False
+        self._tokenizer_init_lock = threading.Lock()
+        self.executor: ThreadPoolExecutor | None = None
 
         self.enable_in_reasoning = (
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )
 
+    @property
+    def tokenizer(self):
+        """Lazily initialize tokenizer when first accessed (thread-safe)."""
+        # Double-checked locking pattern for thread-safe lazy initialization
+        if not self._tokenizer_initialized:
+            with self._tokenizer_init_lock:
+                if not self._tokenizer_initialized:
+                    self._init_tokenizer()
+        return self._tokenizer
+
+    def _init_tokenizer(self):
+        """Initialize tokenizer and related components on first use."""
+        if self._tokenizer_initialized:
+            return
+
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            raise RuntimeError(
+                "Structured output requires a tokenizer, but "
+                "skip_tokenizer_init is enabled. Either disable "
+                "skip_tokenizer_init or avoid using structured output."
+            )
+
+        # The default max_workers if not specified is the number of
+        # CPUs * 5, which is way too high since these tasks are CPU-bound,
+        # not I/O bound. We also know we would never dominate CPU usage
+        # with just grammar compilation, so we set it to half the number
+        # of CPUs.
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tokenizer = cached_tokenizer_from_config(
+            model_config=self.vllm_config.model_config
+        )
+
+        reasoning_parser = self.vllm_config.structured_outputs_config.reasoning_parser
+        reasoning_parser_plugin = (
+            self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+        )
+        if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
+        if reasoning_parser:
+            reasoner_cls = ReasoningParserManager.get_reasoning_parser(reasoning_parser)
+            self.reasoner = reasoner_cls(tokenizer=self._tokenizer)
+
+        self._tokenizer_initialized = True
+
     def grammar_init(self, request: "Request") -> None:
         if request.structured_output_request is None:
             return
@@ -149,6 +176,11 @@ def grammar_init(self, request: "Request") -> None:
             raise ValueError(f"Unsupported structured output backend: {backend}")
 
         if self._use_async_grammar_compilation:
+            # Ensure tokenizer (and executor) is initialized
+            _ = self.tokenizer
+            assert self.executor is not None, (
+                "Executor should be initialized with tokenizer"
+            )
             grammar = self.executor.submit(self._create_grammar, request)
         else:
             grammar = self._create_grammar(request)  # type: ignore[assignment]
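
Note for reviewers: the resource_tracker message quoted above is the
generic symptom of a multiprocessing semaphore that a subprocess never
released. A minimal, vLLM-independent sketch of how it arises
(illustrative only; os._exit stands in for a subprocess dying without
running its cleanup handlers):

    import multiprocessing as mp
    import os

    def child() -> None:
        # Creating a SemLock-backed primitive registers a named
        # semaphore with the multiprocessing resource_tracker.
        _sem = mp.Semaphore(1)
        # Exit without cleanup, so the semaphore is never unregistered.
        os._exit(0)

    if __name__ == "__main__":
        proc = mp.get_context("spawn").Process(target=child)
        proc.start()
        proc.join()
        # On POSIX this typically prints at shutdown:
        #   resource_tracker: There appear to be 1 leaked semaphore
        #   objects to clean up at shutdown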