diff --git a/tests/entrypoints/openai/chat_completion/test_chat_error.py b/tests/entrypoints/openai/chat_completion/test_chat_error.py
index 582e0792156c..9aebacfdd740 100644
--- a/tests/entrypoints/openai/chat_completion/test_chat_error.py
+++ b/tests/entrypoints/openai/chat_completion/test_chat_error.py
@@ -55,7 +55,6 @@ class MockModelConfig:
     skip_tokenizer_init = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/entrypoints/openai/chat_completion/test_serving_chat.py b/tests/entrypoints/openai/chat_completion/test_serving_chat.py
index 39d59d28f854..acc68a1e3ceb 100644
--- a/tests/entrypoints/openai/chat_completion/test_serving_chat.py
+++ b/tests/entrypoints/openai/chat_completion/test_serving_chat.py
@@ -537,7 +537,6 @@ class MockModelConfig:
     skip_tokenizer_init: bool = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/entrypoints/openai/completion/test_completion_error.py b/tests/entrypoints/openai/completion/test_completion_error.py
index c95e47fa1b16..0e735fe9507e 100644
--- a/tests/entrypoints/openai/completion/test_completion_error.py
+++ b/tests/entrypoints/openai/completion/test_completion_error.py
@@ -54,7 +54,6 @@ class MockModelConfig:
     skip_tokenizer_init = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/entrypoints/openai/completion/test_lora_resolvers.py b/tests/entrypoints/openai/completion/test_lora_resolvers.py
index 6a0bec92516d..5d5d21b1a472 100644
--- a/tests/entrypoints/openai/completion/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/completion/test_lora_resolvers.py
@@ -54,7 +54,6 @@ class MockModelConfig:
     skip_tokenizer_init: bool = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/entrypoints/serve/disagg/test_generate_stream.py b/tests/entrypoints/serve/disagg/test_generate_stream.py
index 49349bf4ba50..2ae17e995117 100644
--- a/tests/entrypoints/serve/disagg/test_generate_stream.py
+++ b/tests/entrypoints/serve/disagg/test_generate_stream.py
@@ -58,7 +58,6 @@ class MockModelConfig:
     skip_tokenizer_init = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/entrypoints/serve/tokenize/test_serving_tokenization.py b/tests/entrypoints/serve/tokenize/test_serving_tokenization.py
index ba9d7989a865..58f390f42f52 100644
--- a/tests/entrypoints/serve/tokenize/test_serving_tokenization.py
+++ b/tests/entrypoints/serve/tokenize/test_serving_tokenization.py
@@ -52,7 +52,6 @@ class MockModelConfig:
     skip_tokenizer_init = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/renderers/test_completions.py b/tests/renderers/test_completions.py
index ccc806ba137d..3554940d4dbb 100644
--- a/tests/renderers/test_completions.py
+++ b/tests/renderers/test_completions.py
@@ -38,7 +38,6 @@ class MockModelConfig:
     skip_tokenizer_init: bool = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1


 @dataclass
diff --git a/tests/renderers/test_mistral.py b/tests/renderers/test_mistral.py
index 3a8b49a4db29..74e50d0843ee 100644
--- a/tests/renderers/test_mistral.py
+++ b/tests/renderers/test_mistral.py
@@ -37,7 +37,6 @@ class MockModelConfig:
     skip_tokenizer_init: bool = False
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
-    renderer_num_workers: int = 1


 @dataclass
diff --git a/tests/test_config.py b/tests/test_config.py
index 41d34a6cb06b..bab960533078 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1246,28 +1246,6 @@ def test_needs_dp_coordination(
     assert vllm_config.needs_dp_coordinator == expected_needs_coordinator


-def test_renderer_num_workers_with_mm_cache():
-    """Disallow renderer_num_workers > 1 when mm processor cache is enabled,
-    since neither cache type is thread-safe."""
-    mm_model = "Qwen/Qwen2-VL-2B-Instruct"
-
-    # Should raise: multi-worker + cache enabled (default cache_gb=4)
-    with pytest.raises(ValueError, match="renderer-num-workers"):
-        ModelConfig(mm_model, renderer_num_workers=4)
-
-    # Should raise: multi-worker + explicit cache size
-    with pytest.raises(ValueError, match="renderer-num-workers"):
-        ModelConfig(mm_model, renderer_num_workers=2, mm_processor_cache_gb=1.0)
-
-    # Should pass: multi-worker + cache disabled
-    config = ModelConfig(mm_model, renderer_num_workers=4, mm_processor_cache_gb=0)
-    assert config.renderer_num_workers == 4
-
-    # Should pass: single worker + cache enabled (default)
-    config = ModelConfig(mm_model, renderer_num_workers=1)
-    assert config.renderer_num_workers == 1
-
-
 def test_eagle_draft_model_config():
     """Test that EagleDraft model config is correctly set."""
     target_model_config = ModelConfig(
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 470e8091cc27..0f17385a3b7f 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -301,10 +301,6 @@ class ModelConfig:
     definitions"""
     io_processor_plugin: str | None = None
     """IOProcessor plugin name to load at model startup"""
-    renderer_num_workers: int = 1
-    """Number of worker threads in the renderer thread pool. This pool
-    handles async tokenization, chat template rendering, and multimodal
-    preprocessing."""

     # Pooler config
     pooler_config: PoolerConfig | None = None
@@ -667,19 +663,6 @@ def __post_init__(

             self.multimodal_config = MultiModalConfig(**mm_config_kwargs)  # type: ignore[arg-type]

-        if (
-            self.renderer_num_workers > 1
-            and self.multimodal_config.mm_processor_cache_gb > 0
-        ):
-            raise ValueError(
-                "Cannot use --renderer-num-workers > 1 with the "
-                "multimodal processor cache enabled. The cache is "
-                "not thread-safe and does not support concurrent "
-                "renderer workers. Please set "
-                "--renderer-num-workers 1 (the default), or "
-                "disable the cache with --mm-processor-cache-gb 0."
-            )
-
         # Multimodal GGUF models must use original repo for mm processing
         if is_gguf(self.tokenizer) and self.is_multimodal_model:
             raise ValueError(
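The guard deleted above only existed because a multi-worker renderer pool could race on the multimodal processor cache; with the pool now hard-wired to a single worker (see the `vllm/renderers/base.py` hunk below), the constraint is vacuous. As a minimal sketch of why a `max_workers=1` pool makes an unsynchronized cache safe — the `cache` dict and `preprocess` function here are illustrative stand-ins, not vLLM APIs:

```python
from concurrent.futures import ThreadPoolExecutor

cache: dict[str, int] = {}  # stand-in for the non-thread-safe mm processor cache

def preprocess(key: str) -> int:
    # Every submission runs on the one executor thread, one task at a
    # time, so this read-modify-write needs no locking.
    return cache.setdefault(key, len(cache))

executor = ThreadPoolExecutor(max_workers=1)
futures = [executor.submit(preprocess, k) for k in ("a", "b", "a")]
print([f.result() for f in futures])  # -> [0, 1, 0]
```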
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cd9551003339..ade5eddf8596 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -551,7 +551,6 @@ class EngineArgs:
         MultiModalConfig.mm_encoder_fp8_scale_save_margin
     )
     io_processor_plugin: str | None = None
-    renderer_num_workers: int = 1
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
     video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
     mm_tensor_ipc: MMTensorIPC = MultiModalConfig.mm_tensor_ipc
@@ -832,10 +831,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         model_group.add_argument(
             "--io-processor-plugin", **model_kwargs["io_processor_plugin"]
         )
-        model_group.add_argument(
-            "--renderer-num-workers",
-            **model_kwargs["renderer_num_workers"],
-        )

         # Model loading arguments
         load_kwargs = get_kwargs(LoadConfig)
@@ -1555,7 +1550,6 @@ def create_model_config(self) -> ModelConfig:
             video_pruning_rate=self.video_pruning_rate,
             mm_tensor_ipc=self.mm_tensor_ipc,
             io_processor_plugin=self.io_processor_plugin,
-            renderer_num_workers=self.renderer_num_workers,
         )

     def validate_tensorizer_args(self):
diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index 2f10302c0268..828cac4b9f44 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -82,12 +82,13 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:

         self.tokenizer = tokenizer

-        # Shared thread pool executor for blocking tokenizer and
-        # multimodal preprocessing operations. The multimodal processor
-        # receives a deep-copied tokenizer (see #36557) so it is safe to
-        # run tokenization and MM preprocessing concurrently.
-        pool_workers = config.model_config.renderer_num_workers
-        self._executor = ThreadPoolExecutor(max_workers=pool_workers)
+        # Shared single-worker thread pool for blocking tokenizer and
+        # multimodal preprocessing operations.
+        self._executor = ThreadPoolExecutor(max_workers=1)
+        # Tokenizer dedicated to the executor thread. Deep-copied so the
+        # tokenizer is never shared across threads, which would lead to
+        # "already borrowed" errors (see #36557).
+        self.executor_tokenizer = copy.deepcopy(tokenizer)

         # Multimodal preprocessing is always offloaded to the thread pool
         # to keep the asyncio event loop responsive under concurrent load.
@@ -108,17 +109,14 @@
         if config.model_config.is_multimodal_model:
             mm_processor_cache = mm_registry.processor_cache_from_config(config)

-            # Deep-copy the tokenizer so the multimodal processor gets its
-            # own Rust tokenizer backend. Without this, concurrent access
-            # from AsyncMicrobatchTokenizer and call_hf_processor causes
-            # "RuntimeError: Already borrowed" from the Rust RefCell.
-            # See: https://github.com/huggingface/tokenizers/issues/537
-            mm_tokenizer = copy.deepcopy(tokenizer)
+            # Cannot reuse self.executor_tokenizer because the mm processor
+            # might mutate the tokenizer, corrupting the shared tokenizer.
+            self.mm_tokenizer = copy.deepcopy(tokenizer)

             with set_default_torch_num_threads():
                 self.mm_processor = mm_registry.create_processor(
                     config.model_config,
-                    tokenizer=mm_tokenizer,
+                    tokenizer=self.mm_tokenizer,
                     cache=mm_processor_cache,
                 )

@@ -130,11 +128,10 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
             # requests don't pollute the sender cache.
             ro_cache = mm_registry.processor_only_cache_from_config(config)
             if ro_cache is not None:
-                ro_tokenizer = copy.deepcopy(tokenizer)
                 with set_default_torch_num_threads():
                     self._readonly_mm_processor = mm_registry.create_processor(
                         config.model_config,
-                        tokenizer=ro_tokenizer,
+                        tokenizer=self.mm_tokenizer,
                         cache=ro_cache,
                     )

@@ -152,10 +149,19 @@ def get_tokenizer(self) -> _T:

         return tokenizer

+    def get_executor_tokenizer(self) -> _T:
+        tokenizer = self.executor_tokenizer
+        if tokenizer is None:
+            raise ValueError(
+                "Executor tokenizer not available when `skip_tokenizer_init=True`"
+            )
+
+        return tokenizer
+
     def get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
         if self._async_tokenizer is None:
             self._async_tokenizer = AsyncMicrobatchTokenizer(
-                self.get_tokenizer(), executor=self._executor
+                self.get_executor_tokenizer(), executor=self._executor
             )

         return self._async_tokenizer
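To make the tokenizer-ownership scheme above concrete: the event loop keeps `self.tokenizer`, while everything submitted to the single-worker executor uses an independent deep copy, so the Rust-backed HF tokenizer is never borrowed from two threads at once. A self-contained sketch under assumed names (`RendererSketch`, `DummyTokenizer`, and `encode_async` are illustrative, not vLLM classes):

```python
import asyncio
import copy
from concurrent.futures import ThreadPoolExecutor

class RendererSketch:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer  # used only on the event-loop thread
        self._executor = ThreadPoolExecutor(max_workers=1)
        # Thread-pinned copy: only the executor thread ever touches it.
        self.executor_tokenizer = copy.deepcopy(tokenizer)

    async def encode_async(self, text: str):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._executor, self.executor_tokenizer.encode, text
        )

class DummyTokenizer:
    def encode(self, text: str) -> list[str]:
        return text.split()

renderer = RendererSketch(DummyTokenizer())
print(asyncio.run(renderer.encode_async("hello world")))  # ['hello', 'world']
```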
diff --git a/vllm/renderers/deepseek_v32.py b/vllm/renderers/deepseek_v32.py
index 45a46b23283a..aab3b959daff 100644
--- a/vllm/renderers/deepseek_v32.py
+++ b/vllm/renderers/deepseek_v32.py
@@ -33,7 +33,7 @@ def __init__(
         )

     def _apply_chat_template(self, *args, **kwargs):
-        return self.get_tokenizer().apply_chat_template(*args, **kwargs)
+        return self.get_executor_tokenizer().apply_chat_template(*args, **kwargs)

     def render_messages(
         self,
diff --git a/vllm/renderers/deepseek_v4.py b/vllm/renderers/deepseek_v4.py
index 3dc82b9622e5..9710067a0062 100644
--- a/vllm/renderers/deepseek_v4.py
+++ b/vllm/renderers/deepseek_v4.py
@@ -33,7 +33,7 @@ def __init__(
         )

     def _apply_chat_template(self, *args, **kwargs):
-        return self.get_tokenizer().apply_chat_template(*args, **kwargs)
+        return self.get_executor_tokenizer().apply_chat_template(*args, **kwargs)

     def render_messages(
         self,
diff --git a/vllm/renderers/grok2.py b/vllm/renderers/grok2.py
index 665d9a98e94f..7e9e8c944c9c 100644
--- a/vllm/renderers/grok2.py
+++ b/vllm/renderers/grok2.py
@@ -33,7 +33,7 @@ def __init__(
         )

     def _apply_chat_template(self, *args, **kwargs):
-        return self.get_tokenizer().apply_chat_template(*args, **kwargs)
+        return self.get_executor_tokenizer().apply_chat_template(*args, **kwargs)

     def render_messages(
         self,
diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py
index 690ffb2a8954..9d1c67402d70 100644
--- a/vllm/renderers/hf.py
+++ b/vllm/renderers/hf.py
@@ -684,7 +684,6 @@ async def render_messages_async(
         params: ChatParams,
     ) -> tuple[list[ConversationMessage], DictPrompt]:
         model_config = self.model_config
-        tokenizer = self.get_tokenizer()

         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
@@ -693,7 +692,7 @@
                 chat_template=params.chat_template,
                 tools=params.chat_template_kwargs.get("tools"),
                 given_format=params.chat_template_content_format,
-                tokenizer=tokenizer,
+                tokenizer=self.get_tokenizer(),
                 model_config=model_config,
             ),
             media_io_kwargs=params.media_io_kwargs,
@@ -702,7 +701,7 @@

         prompt_raw = await self._apply_chat_template_async(
             model_config,
-            tokenizer,
+            self.get_executor_tokenizer(),
             conversation,
             **params.get_apply_chat_template_kwargs(),
         )
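Note the split inside `render_messages_async` above: `parse_chat_messages_async` runs on the event loop and keeps `self.get_tokenizer()`, while the chat-template call is handed `self.get_executor_tokenizer()` because it executes on the pool thread. The body of `_apply_chat_template_async` is not part of this diff; a plausible shape, assuming it simply offloads to the shared executor (the free-standing function below is a hypothetical equivalent, not vLLM's actual helper):

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

async def apply_chat_template_async(
    executor: ThreadPoolExecutor, tokenizer, conversation, **kwargs
):
    # Offload the blocking template render to the single-worker pool;
    # `tokenizer` must therefore be the thread-pinned executor copy.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        executor, lambda: tokenizer.apply_chat_template(conversation, **kwargs)
    )
```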
diff --git a/vllm/renderers/mistral.py b/vllm/renderers/mistral.py
index da2275afc6c1..13c57d57b88f 100644
--- a/vllm/renderers/mistral.py
+++ b/vllm/renderers/mistral.py
@@ -64,7 +64,6 @@ def render_messages(
         messages: list[ChatCompletionMessageParam],
         params: ChatParams,
     ) -> tuple[list[ConversationMessage], DictPrompt]:
-        tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = parse_chat_messages(
             messages,
             self.model_config,
@@ -74,7 +73,7 @@
         )

         prompt_raw = safe_apply_chat_template(
-            tokenizer,
+            self.get_tokenizer(),
             messages,
             **params.get_apply_chat_template_kwargs(),
         )
@@ -92,7 +91,6 @@ async def render_messages_async(
         messages: list[ChatCompletionMessageParam],
         params: ChatParams,
     ) -> tuple[list[ConversationMessage], DictPrompt]:
-        tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
             self.model_config,
@@ -102,7 +100,7 @@
         )

         prompt_raw = await self._apply_chat_template_async(
-            tokenizer,
+            self.get_executor_tokenizer(),
             messages,
             **params.get_apply_chat_template_kwargs(),
         )
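The Mistral renderer ends up with a deliberate asymmetry: the synchronous `render_messages` keeps `get_tokenizer()` because it already runs on the caller's thread, while `render_messages_async` passes `get_executor_tokenizer()` since the template render happens on the pool thread. A runnable toy demonstration of that rule (not vLLM code; the executor setup mirrors the base-renderer hunk above):

```python
import asyncio
import threading
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="renderer")

def render_sync() -> str:
    # Sync path: stays on the calling thread.
    return threading.current_thread().name

async def render_async() -> str:
    # Async path: offloaded to the single renderer worker.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        executor, lambda: threading.current_thread().name
    )

print(render_sync())                # e.g. MainThread
print(asyncio.run(render_async()))  # renderer_0
```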