@@ -55,7 +55,6 @@ class MockModelConfig:
    skip_tokenizer_init = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
@@ -537,7 +537,6 @@ class MockModelConfig:
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
@@ -54,7 +54,6 @@ class MockModelConfig:
    skip_tokenizer_init = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
@@ -54,7 +54,6 @@ class MockModelConfig:
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
1 change: 0 additions & 1 deletion tests/entrypoints/serve/disagg/test_generate_stream.py
@@ -58,7 +58,6 @@ class MockModelConfig:
    skip_tokenizer_init = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
@@ -52,7 +52,6 @@ class MockModelConfig:
    skip_tokenizer_init = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
1 change: 0 additions & 1 deletion tests/renderers/test_completions.py
@@ -38,7 +38,6 @@ class MockModelConfig:
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1


@dataclass
1 change: 0 additions & 1 deletion tests/renderers/test_mistral.py
@@ -37,7 +37,6 @@ class MockModelConfig:
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
-    renderer_num_workers: int = 1


@dataclass
22 changes: 0 additions & 22 deletions tests/test_config.py
@@ -1246,28 +1246,6 @@ def test_needs_dp_coordination(
    assert vllm_config.needs_dp_coordinator == expected_needs_coordinator


-def test_renderer_num_workers_with_mm_cache():
-    """Disallow renderer_num_workers > 1 when mm processor cache is enabled,
-    since neither cache type is thread-safe."""
-    mm_model = "Qwen/Qwen2-VL-2B-Instruct"
-
-    # Should raise: multi-worker + cache enabled (default cache_gb=4)
-    with pytest.raises(ValueError, match="renderer-num-workers"):
-        ModelConfig(mm_model, renderer_num_workers=4)
-
-    # Should raise: multi-worker + explicit cache size
-    with pytest.raises(ValueError, match="renderer-num-workers"):
-        ModelConfig(mm_model, renderer_num_workers=2, mm_processor_cache_gb=1.0)
-
-    # Should pass: multi-worker + cache disabled
-    config = ModelConfig(mm_model, renderer_num_workers=4, mm_processor_cache_gb=0)
-    assert config.renderer_num_workers == 4
-
-    # Should pass: single worker + cache enabled (default)
-    config = ModelConfig(mm_model, renderer_num_workers=1)
-    assert config.renderer_num_workers == 1
-
-
def test_eagle_draft_model_config():
    """Test that EagleDraft model config is correctly set."""
    target_model_config = ModelConfig(
17 changes: 0 additions & 17 deletions vllm/config/model.py
@@ -301,10 +301,6 @@ class ModelConfig:
    definitions"""
    io_processor_plugin: str | None = None
    """IOProcessor plugin name to load at model startup"""
-    renderer_num_workers: int = 1
-    """Number of worker threads in the renderer thread pool. This pool
-    handles async tokenization, chat template rendering, and multimodal
-    preprocessing."""

    # Pooler config
    pooler_config: PoolerConfig | None = None
@@ -667,19 +663,6 @@ def __post_init__(

        self.multimodal_config = MultiModalConfig(**mm_config_kwargs)  # type: ignore[arg-type]

-        if (
-            self.renderer_num_workers > 1
-            and self.multimodal_config.mm_processor_cache_gb > 0
-        ):
-            raise ValueError(
-                "Cannot use --renderer-num-workers > 1 with the "
-                "multimodal processor cache enabled. The cache is "
-                "not thread-safe and does not support concurrent "
-                "renderer workers. Please set "
-                "--renderer-num-workers 1 (the default), or "
-                "disable the cache with --mm-processor-cache-gb 0."
-            )
-
        # Multimodal GGUF models must use original repo for mm processing
        if is_gguf(self.tokenizer) and self.is_multimodal_model:
            raise ValueError(
6 changes: 0 additions & 6 deletions vllm/engine/arg_utils.py
@@ -551,7 +551,6 @@ class EngineArgs:
        MultiModalConfig.mm_encoder_fp8_scale_save_margin
    )
    io_processor_plugin: str | None = None
-    renderer_num_workers: int = 1
Contributor (review comment, severity: high):

Removing renderer_num_workers from EngineArgs is a breaking change for the public Python API. Any user code that constructs EngineArgs with this parameter will now fail with a TypeError.

Additionally, removing the corresponding CLI argument (previously at line 836) breaks existing deployment scripts. While forcing a single worker fixes the borrowing issue, consider keeping the argument for backward compatibility (potentially ignoring it with a warning) and to allow for future scaling improvements (e.g., using a pool of tokenizers).
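
For illustration, a rough sketch of the kind of backward-compatible shim being suggested here: keep the field, ignore it, and warn when a non-default value is passed. The placement and wording below are hypothetical and not part of this PR.

```python
import warnings
from dataclasses import dataclass


@dataclass
class EngineArgs:
    # ... existing fields ...

    # Deprecated: the renderer now always uses a single worker thread.
    # Kept only so existing callers and deployment scripts keep working.
    renderer_num_workers: int = 1

    def __post_init__(self):
        if self.renderer_num_workers != 1:
            warnings.warn(
                "--renderer-num-workers is deprecated and ignored; the "
                "renderer always uses a single worker thread.",
                DeprecationWarning,
                stacklevel=2,
            )
            self.renderer_num_workers = 1
```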

Contributor (author):

Good point. This is a draft for illustrative purposes, will update before readying

    skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
    video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
    mm_tensor_ipc: MMTensorIPC = MultiModalConfig.mm_tensor_ipc
@@ -832,10 +831,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        model_group.add_argument(
            "--io-processor-plugin", **model_kwargs["io_processor_plugin"]
        )
-        model_group.add_argument(
-            "--renderer-num-workers",
-            **model_kwargs["renderer_num_workers"],
-        )

        # Model loading arguments
        load_kwargs = get_kwargs(LoadConfig)
@@ -1555,7 +1550,6 @@ def create_model_config(self) -> ModelConfig:
            video_pruning_rate=self.video_pruning_rate,
            mm_tensor_ipc=self.mm_tensor_ipc,
            io_processor_plugin=self.io_processor_plugin,
-            renderer_num_workers=self.renderer_num_workers,
        )

    def validate_tensorizer_args(self):
38 changes: 22 additions & 16 deletions vllm/renderers/base.py
@@ -82,12 +82,13 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:

        self.tokenizer = tokenizer

-        # Shared thread pool executor for blocking tokenizer and
-        # multimodal preprocessing operations. The multimodal processor
-        # receives a deep-copied tokenizer (see #36557) so it is safe to
-        # run tokenization and MM preprocessing concurrently.
-        pool_workers = config.model_config.renderer_num_workers
-        self._executor = ThreadPoolExecutor(max_workers=pool_workers)
+        # Shared single-worker thread pool for blocking tokenizer and
+        # multimodal preprocessing operations.
+        self._executor = ThreadPoolExecutor(max_workers=1)
+        # Tokenizer to be used in the executor thread.
+        # Deep copy to avoid sharing the tokenizer leading to
+        # "already borrowed" errors (see #36557).
+        self.executor_tokenizer = copy.deepcopy(tokenizer)
Comment on lines +87 to +91 (Collaborator):
Since we can use a deep copy to avoid sharing the tokenizer and prevent "already borrowed" errors (see #36557), we should find a way to make all tokenizers use a deep-copied instance (perhaps a tokenizer pool), rather than enforcing one tokenizer per thread.

We use multithreading in many places, and enforcing one tokenizer per thread here does not completely solve all the problems.

Contributor (author):

> we should find a way to make all tokenizers use a deep-copied instance, rather than enforcing one tokenizer per thread.

You're saying we want to create a deep copy of the tokenizer for each thread in the thread pool, rather than limiting the thread pool to one thread, right?

I agree that this is the most flexible solution. I was referring to this in #40949 as "Keep allowing --renderer-num-workers > 1 but use thread-local tokenizer". I'm working on a draft PR implementing this.

> We use multithreading in many places, and enforcing one tokenizer per thread here does not completely solve all the problems.

I'm not sure I understand this. IIUC we use multithreading in many places (for the mm processor, pooling IO, applying chat templates, etc.), but they all execute on the same underlying thread pool in renderer._executor. If we ensure a unique tokenizer for each thread in that pool (either by keeping many copies of the tokenizer or by limiting the pool to a single thread), won't we avoid all the problems?
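
For illustration, "many copies of the tokenizer" could look roughly like the sketch below: deep-copy the tokenizer into thread-local storage from the executor's initializer so that each pool thread owns its own Rust backend. The names here are made up and not from this PR.

```python
import copy
import threading
from concurrent.futures import ThreadPoolExecutor

_tls = threading.local()


def _init_worker(tokenizer):
    # Runs once per pool thread; each thread gets a private copy, so
    # concurrent encode/decode calls cannot hit "Already borrowed".
    _tls.tokenizer = copy.deepcopy(tokenizer)


def thread_local_tokenizer():
    return _tls.tokenizer


def make_renderer_executor(tokenizer, num_workers: int) -> ThreadPoolExecutor:
    return ThreadPoolExecutor(
        max_workers=num_workers,
        initializer=_init_worker,
        initargs=(tokenizer,),
    )
```

Work submitted to such an executor would then call thread_local_tokenizer() instead of touching the shared self.tokenizer directly.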

Collaborator:

> You're saying we want to create a deep copy of the tokenizer for each thread in the thread pool, rather than limiting the thread pool to one thread, right?

This is the most flexible solution. +1

> I'm not sure I understand this. IIUC we use multithreading in many places (for the mm processor, pooling IO, applying chat templates, etc.), but they all execute on the same underlying thread pool in renderer._executor. If we ensure a unique tokenizer for each thread in that pool (either by keeping many copies of the tokenizer or by limiting the pool to a single thread), won't we avoid all the problems?

We are refactoring the preprocessing part so that the thread pool and a tokenizer pool can live in one place. However, we don't yet know how many tokenizer instances are in use across the codebase that have so far gone unnoticed.

Contributor (author):

#41181 adds a thread-safety wrapper. I tried implementations with a mutex and with a thread-local copy; not a huge perf difference in my admittedly limited testing.
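
Roughly, the mutex variant of such a wrapper could look like the sketch below (an illustration of the general idea, not the actual #41181 implementation):

```python
import threading


class LockedTokenizer:
    """Serialize all calls into a single, non-thread-safe tokenizer."""

    def __init__(self, tokenizer):
        self._tokenizer = tokenizer
        self._lock = threading.Lock()

    def __getattr__(self, name):
        attr = getattr(self._tokenizer, name)
        if not callable(attr):
            return attr

        def locked_call(*args, **kwargs):
            # Only one thread at a time may enter the Rust backend.
            with self._lock:
                return attr(*args, **kwargs)

        return locked_call
```

The trade-off against the thread-local-copy variant is memory (a single tokenizer instance) versus potential lock contention, which is consistent with the small perf difference observed in limited testing.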


        # Multimodal preprocessing is always offloaded to the thread pool
        # to keep the asyncio event loop responsive under concurrent load.
@@ -108,17 +109,14 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
        if config.model_config.is_multimodal_model:
            mm_processor_cache = mm_registry.processor_cache_from_config(config)

-            # Deep-copy the tokenizer so the multimodal processor gets its
-            # own Rust tokenizer backend. Without this, concurrent access
-            # from AsyncMicrobatchTokenizer and call_hf_processor causes
-            # "RuntimeError: Already borrowed" from the Rust RefCell.
-            # See: https://github.com/huggingface/tokenizers/issues/537
-            mm_tokenizer = copy.deepcopy(tokenizer)
+            # Cannot use self.executor_tokenizer because the mm processor might
+            # mutate the tokenizer, corrupting the shared tokenizer.
+            self.mm_tokenizer = copy.deepcopy(tokenizer)
Contributor (author):

Unfortunately, we can't use self.executor_tokenizer here because the mm processor may mutate its tokenizer; these processors were not designed with thread-safety in mind.

For example, DeepseekVLV2Processor:

if image_token_id is None:
    special_tokens = [image_token]
    special_tokens_dict = {"additional_special_tokens": special_tokens}
    self.tokenizer.add_special_tokens(special_tokens_dict)
self.image_token_id = self.tokenizer.vocab.get(image_token)

# add five special tokens for grounding-related tasks
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
special_tokens_dict = {"additional_special_tokens": special_tokens}
self.tokenizer.add_special_tokens(special_tokens_dict)

# add special tokens for SFT data
special_tokens = ["<|User|>", "<|Assistant|>"]
special_tokens_dict = {"additional_special_tokens": special_tokens}
self.tokenizer.add_special_tokens(special_tokens_dict)


            with set_default_torch_num_threads():
                self.mm_processor = mm_registry.create_processor(
                    config.model_config,
-                    tokenizer=mm_tokenizer,
+                    tokenizer=self.mm_tokenizer,
                    cache=mm_processor_cache,
                )

@@ -130,11 +128,10 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
            # requests don't pollute the sender cache.
            ro_cache = mm_registry.processor_only_cache_from_config(config)
            if ro_cache is not None:
-                ro_tokenizer = copy.deepcopy(tokenizer)
                with set_default_torch_num_threads():
                    self._readonly_mm_processor = mm_registry.create_processor(
                        config.model_config,
-                        tokenizer=ro_tokenizer,
+                        tokenizer=self.mm_tokenizer,
                        cache=ro_cache,
                    )

@@ -152,10 +149,19 @@ def get_tokenizer(self) -> _T:

        return tokenizer

+    def get_executor_tokenizer(self) -> _T:
+        tokenizer = self.executor_tokenizer
+        if tokenizer is None:
+            raise ValueError(
+                "Executor tokenizer not available when `skip_tokenizer_init=True`"
+            )
+
+        return tokenizer
+
    def get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
        if self._async_tokenizer is None:
            self._async_tokenizer = AsyncMicrobatchTokenizer(
-                self.get_tokenizer(), executor=self._executor
+                self.get_executor_tokenizer(), executor=self._executor
            )

        return self._async_tokenizer
2 changes: 1 addition & 1 deletion vllm/renderers/deepseek_v32.py
@@ -33,7 +33,7 @@ def __init__(
        )

    def _apply_chat_template(self, *args, **kwargs):
-        return self.get_tokenizer().apply_chat_template(*args, **kwargs)
+        return self.get_executor_tokenizer().apply_chat_template(*args, **kwargs)

    def render_messages(
        self,
2 changes: 1 addition & 1 deletion vllm/renderers/deepseek_v4.py
@@ -33,7 +33,7 @@ def __init__(
        )

    def _apply_chat_template(self, *args, **kwargs):
-        return self.get_tokenizer().apply_chat_template(*args, **kwargs)
+        return self.get_executor_tokenizer().apply_chat_template(*args, **kwargs)

    def render_messages(
        self,
2 changes: 1 addition & 1 deletion vllm/renderers/grok2.py
@@ -33,7 +33,7 @@ def __init__(
        )

    def _apply_chat_template(self, *args, **kwargs):
-        return self.get_tokenizer().apply_chat_template(*args, **kwargs)
+        return self.get_executor_tokenizer().apply_chat_template(*args, **kwargs)

    def render_messages(
        self,
5 changes: 2 additions & 3 deletions vllm/renderers/hf.py
@@ -684,7 +684,6 @@ async def render_messages_async(
        params: ChatParams,
    ) -> tuple[list[ConversationMessage], DictPrompt]:
        model_config = self.model_config
-        tokenizer = self.get_tokenizer()

        conversation, mm_data, mm_uuids = await parse_chat_messages_async(
            messages,
@@ -693,7 +692,7 @@
                chat_template=params.chat_template,
                tools=params.chat_template_kwargs.get("tools"),
                given_format=params.chat_template_content_format,
-                tokenizer=tokenizer,
+                tokenizer=self.get_tokenizer(),
                model_config=model_config,
            ),
            media_io_kwargs=params.media_io_kwargs,
@@ -702,7 +701,7 @@

        prompt_raw = await self._apply_chat_template_async(
            model_config,
-            tokenizer,
+            self.get_executor_tokenizer(),
            conversation,
            **params.get_apply_chat_template_kwargs(),
        )
6 changes: 2 additions & 4 deletions vllm/renderers/mistral.py
@@ -64,7 +64,6 @@ def render_messages(
        messages: list[ChatCompletionMessageParam],
        params: ChatParams,
    ) -> tuple[list[ConversationMessage], DictPrompt]:
-        tokenizer = self.get_tokenizer()
        conversation, mm_data, mm_uuids = parse_chat_messages(
            messages,
            self.model_config,
@@ -74,7 +73,7 @@
        )

        prompt_raw = safe_apply_chat_template(
-            tokenizer,
+            self.get_tokenizer(),
            messages,
            **params.get_apply_chat_template_kwargs(),
        )
@@ -92,7 +91,6 @@ async def render_messages_async(
        messages: list[ChatCompletionMessageParam],
        params: ChatParams,
    ) -> tuple[list[ConversationMessage], DictPrompt]:
-        tokenizer = self.get_tokenizer()
        conversation, mm_data, mm_uuids = await parse_chat_messages_async(
            messages,
            self.model_config,
@@ -102,7 +100,7 @@
        )

        prompt_raw = await self._apply_chat_template_async(
-            tokenizer,
+            self.get_executor_tokenizer(),
            messages,
            **params.get_apply_chat_template_kwargs(),
        )