From 8f80c301beceb88908b2df90df7e4da97c973671 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 9 Mar 2026 12:37:51 +0200 Subject: [PATCH 01/12] Delegate preprocessing to OpenAIServingRender Signed-off-by: Sage Ahrac --- .../openai/chat_completion/serving.py | 134 ++---------------- vllm/entrypoints/openai/completion/serving.py | 32 ++--- .../entrypoints/openai/generate/api_router.py | 50 ++++--- vllm/entrypoints/serve/render/serving.py | 31 +++- 4 files changed, 79 insertions(+), 168 deletions(-) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 08c783f87d83..253f40294cbb 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -6,12 +6,14 @@ import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence -from typing import Any, Final +from typing import TYPE_CHECKING, Any, Final + +if TYPE_CHECKING: + from vllm.entrypoints.serve.render.serving import OpenAIServingRender import partial_json_parser import regex as re from fastapi import Request -from openai_harmony import Message as OpenAIMessage from partial_json_parser.core.options import Allow from vllm.engine.protocol import EngineClient @@ -56,17 +58,13 @@ ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.parser.harmony_utils import ( - get_developer_message, get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, - get_system_message, - parse_chat_inputs_to_harmony_messages, parse_chat_output, - render_for_completion, ) from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import get_max_tokens, should_include_usage -from vllm.inputs.data import ProcessorInputs, TokensPrompt +from vllm.inputs.data import ProcessorInputs from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.outputs import CompletionOutput, RequestOutput @@ -79,7 +77,6 @@ from vllm.tool_parsers.utils import partial_json_loads from vllm.utils.collection_utils import as_list from vllm.utils.mistral import is_mistral_tokenizer -from vllm.utils.mistral import mt as _mt logger = init_logger(__name__) @@ -91,6 +88,7 @@ def __init__( models: OpenAIServingModels, response_role: str, *, + openai_serving_render: "OpenAIServingRender", request_logger: RequestLogger | None, chat_template: str | None, chat_template_content_format: ChatTemplateContentFormatOption, @@ -113,6 +111,7 @@ def __init__( return_tokens_as_token_ids=return_tokens_as_token_ids, ) + self.openai_serving_render = openai_serving_render self.response_role = response_role self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format @@ -215,7 +214,10 @@ async def render_chat_request( request: ChatCompletionRequest, ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse: """ - render chat request by validating and preprocessing inputs. + Validate the model and preprocess a chat completion request. + + Delegates preprocessing logic to OpenAIServingRender, adding the + engine-aware checks (LoRA model validation, engine health). Returns: A tuple of (conversation, engine_prompts) on success, @@ -232,78 +234,7 @@ async def render_chat_request( if self.engine_client.errored: raise self.engine_client.dead_error - tokenizer = self.renderer.tokenizer - - tool_parser = self.tool_parser - - if is_mistral_tokenizer(tokenizer): - # because of issues with pydantic we need to potentially - # re-serialize the tool_calls field of the request - # for more info: see comment in `maybe_serialize_tool_calls` - _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] - _mt.truncate_tool_call_ids(request) # type: ignore[arg-type] - _mt.validate_request_params(request) - - # Check if tool parsing is unavailable (common condition) - tool_parsing_unavailable = ( - tool_parser is None - and not is_mistral_tokenizer(tokenizer) - and not self.use_harmony - ) - - # Validate tool_choice when tool parsing is required but unavailable - if tool_parsing_unavailable and request.tool_choice not in ( - None, - "none", - ): - if request.tool_choice == "auto" and not self.enable_auto_tools: - # for hf tokenizers, "auto" tools requires - # --enable-auto-tool-choice and --tool-call-parser - return self.create_error_response( - '"auto" tool choice requires ' - "--enable-auto-tool-choice and --tool-call-parser to be set" - ) - elif request.tool_choice != "auto": - # "required" or named tool requires tool parser - return self.create_error_response( - f'tool_choice="{request.tool_choice}" requires ' - "--tool-call-parser to be set" - ) - - if request.tools is None or ( - request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none - ): - tool_dicts = None - else: - tool_dicts = [tool.model_dump() for tool in request.tools] - - if not self.use_harmony: - # Common case. - error_check_ret = self._validate_chat_template( - request_chat_template=request.chat_template, - chat_template_kwargs=request.chat_template_kwargs, - trust_request_chat_template=self.trust_request_chat_template, - ) - if error_check_ret is not None: - return error_check_ret - - conversation, engine_prompts = await self._preprocess_chat( - request, - request.messages, - default_template=self.chat_template, - default_template_content_format=self.chat_template_content_format, - default_template_kwargs=self.default_chat_template_kwargs, - tool_dicts=tool_dicts, - tool_parser=tool_parser, - ) - else: - # For GPT-OSS. - should_include_tools = tool_dicts is not None - conversation, engine_prompts = self._make_request_with_harmony( - request, should_include_tools - ) - - return conversation, engine_prompts + return await self.openai_serving_render._preprocess_chat_request(request) async def create_chat_completion( self, @@ -1920,42 +1851,7 @@ def _make_request_with_harmony( request: ChatCompletionRequest, should_include_tools: bool = True, ): - messages: list[OpenAIMessage] = [] - - # because of issues with pydantic we need to potentially - # re-serialize the tool_calls field of the request - # for more info: see comment in `maybe_serialize_tool_calls` - _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] - - # Add system message. - # NOTE: In Chat Completion API, browsing is enabled by default - # if the model supports it. TODO: Support browsing. - assert not self.supports_browsing - assert not self.supports_code_interpreter - sys_msg = get_system_message( - reasoning_effort=request.reasoning_effort, - browser_description=None, - python_description=None, - with_custom_tools=should_include_tools, + """Delegates to OpenAIServingRender._make_request_with_harmony.""" + return self.openai_serving_render._make_request_with_harmony( + request, should_include_tools ) - messages.append(sys_msg) - - # Add developer message. - if request.tools: - dev_msg = get_developer_message( - tools=request.tools if should_include_tools else None # type: ignore[arg-type] - ) - messages.append(dev_msg) - - # Add user message. - messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) - - # Render prompt token ids. - prompt_token_ids = render_for_completion(messages) - engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids) - - # Add cache_salt if provided in the request - if request.cache_salt is not None: - engine_prompt["cache_salt"] = request.cache_salt - - return messages, [engine_prompt] diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index 27320cbd0eba..8c5e05542575 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -5,7 +5,10 @@ import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence -from typing import cast +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + from vllm.entrypoints.serve.render.serving import OpenAIServingRender from fastapi import Request @@ -51,6 +54,7 @@ def __init__( engine_client: EngineClient, models: OpenAIServingModels, *, + openai_serving_render: "OpenAIServingRender", request_logger: RequestLogger | None, return_tokens_as_token_ids: bool = False, enable_prompt_tokens_details: bool = False, @@ -63,6 +67,7 @@ def __init__( return_tokens_as_token_ids=return_tokens_as_token_ids, ) + self.openai_serving_render = openai_serving_render self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage @@ -79,7 +84,10 @@ async def render_completion_request( request: CompletionRequest, ) -> list[ProcessorInputs] | ErrorResponse: """ - render completion request by validating and preprocessing inputs. + Validate the model and preprocess a completion request. + + Delegates preprocessing logic to OpenAIServingRender, adding the + engine-aware checks (LoRA model validation, engine health). Returns: A list of engine_prompts on success, @@ -95,25 +103,7 @@ async def render_completion_request( if self.engine_client.errored: raise self.engine_client.dead_error - # Return error for unsupported features. - if request.suffix is not None: - return self.create_error_response("suffix is not currently supported") - - if request.echo and request.prompt_embeds is not None: - return self.create_error_response("Echo is unsupported with prompt embeds.") - - if request.prompt_logprobs is not None and request.prompt_embeds is not None: - return self.create_error_response( - "prompt_logprobs is not compatible with prompt embeds." - ) - - engine_prompts = await self._preprocess_completion( - request, - prompt_input=request.prompt, - prompt_embeds=request.prompt_embeds, - ) - - return engine_prompts + return await self.openai_serving_render._preprocess_completion_request(request) async def create_completion( self, diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py index f07f42f0c07d..35548313c9cd 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/openai/generate/api_router.py @@ -72,6 +72,31 @@ async def init_generate_state( tool_server = None resolved_chat_template = load_chat_template(args.chat_template) + # Render endpoints are always backed by OpenAIServingRender so that + # /v1/chat/completions/render and /v1/completions/render work on both + # generate-mode and render-only servers. + # It is created first so that OpenAIServingChat and OpenAIServingCompletion + # can delegate their preprocessing logic to it. + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + + state.openai_serving_render = OpenAIServingRender( + model_config=engine_client.model_config, + renderer=engine_client.renderer, + io_processor=engine_client.io_processor, + served_model_names=[ + mp.name for mp in state.openai_serving_models.base_model_paths + ], + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + enable_auto_tools=args.enable_auto_tool_choice, + exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, + tool_parser=args.tool_call_parser, + default_chat_template_kwargs=args.default_chat_template_kwargs, + log_error_stack=args.log_error_stack, + ) + state.openai_serving_responses = ( OpenAIServingResponses( engine_client, @@ -96,6 +121,7 @@ async def init_generate_state( engine_client, state.openai_serving_models, args.response_role, + openai_serving_render=state.openai_serving_render, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -121,6 +147,7 @@ async def init_generate_state( OpenAIServingCompletion( engine_client, state.openai_serving_models, + openai_serving_render=state.openai_serving_render, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_prompt_tokens_details=args.enable_prompt_tokens_details, @@ -160,26 +187,3 @@ async def init_generate_state( if "generate" in supported_tasks else None ) - - # Render endpoints are always backed by OpenAIServingRender so that - # /v1/chat/completions/render and /v1/completions/render work on both - # generate-mode and render-only servers. - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - - state.openai_serving_render = OpenAIServingRender( - model_config=engine_client.model_config, - renderer=engine_client.renderer, - io_processor=engine_client.io_processor, - served_model_names=[ - mp.name for mp in state.openai_serving_models.base_model_paths - ], - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, - trust_request_chat_template=args.trust_request_chat_template, - enable_auto_tools=args.enable_auto_tool_choice, - exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, - tool_parser=args.tool_call_parser, - default_chat_template_kwargs=args.default_chat_template_kwargs, - log_error_stack=args.log_error_stack, - ) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index c0e32be7ea5e..b424e24d92ca 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -93,15 +93,26 @@ async def render_chat_request( self, request: ChatCompletionRequest, ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse: - """Copied from OpenAIServingChat.render_chat_request. + """Validate the model and preprocess a chat completion request. - Differences: engine_client.errored check removed (no engine client). + This is the authoritative implementation used directly by the + GPU-less render server and delegated to by OpenAIServingChat. """ error_check_ret = await self._check_model(request) if error_check_ret is not None: logger.error("Error with model %s", error_check_ret) return error_check_ret + return await self._preprocess_chat_request(request) + async def _preprocess_chat_request( + self, + request: ChatCompletionRequest, + ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse: + """Core preprocessing logic for chat requests (no model/engine check). + + Called directly by render_chat_request and delegated to by + OpenAIServingChat.render_chat_request after its engine-aware checks. + """ try: tokenizer = self.renderer.tokenizer @@ -184,14 +195,25 @@ async def render_completion_request( self, request: CompletionRequest, ) -> list[ProcessorInputs] | ErrorResponse: - """Copied from OpenAIServingCompletion.render_completion_request. + """Validate the model and preprocess a completion request. - Differences: engine_client.errored check removed (no engine client). + This is the authoritative implementation used directly by the + GPU-less render server and delegated to by OpenAIServingCompletion. """ error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret + return await self._preprocess_completion_request(request) + async def _preprocess_completion_request( + self, + request: CompletionRequest, + ) -> list[ProcessorInputs] | ErrorResponse: + """Core preprocessing logic for completion requests (no model/engine check). + + Called directly by render_completion_request and delegated to by + OpenAIServingCompletion.render_completion_request after its engine-aware checks. + """ # Return error for unsupported features. if request.suffix is not None: return self.create_error_response("suffix is not currently supported") @@ -221,7 +243,6 @@ def _make_request_with_harmony( request: ChatCompletionRequest, should_include_tools: bool = True, ): - """Copied from OpenAIServingChat._make_request_with_harmony.""" messages: list[OpenAIMessage] = [] # because of issues with pydantic we need to potentially From 69975fb51a79e56e5a41c90cfa3879da89478f5c Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 9 Mar 2026 12:46:42 +0200 Subject: [PATCH 02/12] update tests Signed-off-by: Sage Ahrac --- tests/entrypoints/openai/test_chat_error.py | 16 ++++++++++- .../openai/test_completion_error.py | 12 ++++++++ .../entrypoints/openai/test_lora_resolvers.py | 13 ++++++++- tests/entrypoints/openai/test_serving_chat.py | 28 +++++++++++++++++++ tests/v1/engine/test_async_llm.py | 14 ++++++++++ 5 files changed, 81 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index d6f32bab7008..9985bdfb7088 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -80,14 +80,26 @@ def _build_renderer(model_config: MockModelConfig): def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + models = OpenAIServingModels( engine_client=engine, base_model_paths=BASE_MODEL_PATHS, ) + serving_render = OpenAIServingRender( + model_config=engine.model_config, + renderer=engine.renderer, + io_processor=engine.io_processor, + served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + request_logger=None, + chat_template=None, + chat_template_content_format="auto", + ) serving_chat = OpenAIServingChat( engine, models, response_role="assistant", + openai_serving_render=serving_render, request_logger=None, chat_template=None, chat_template_content_format="auto", @@ -100,7 +112,9 @@ async def _fake_preprocess_chat(*args, **kwargs): [{"prompt_token_ids": [1, 2, 3]}], ) - serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat) + serving_chat.openai_serving_render._preprocess_chat = AsyncMock( + side_effect=_fake_preprocess_chat + ) return serving_chat diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index 2372126d91f3..90f2aafa478b 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ b/tests/entrypoints/openai/test_completion_error.py @@ -70,13 +70,25 @@ class MockVllmConfig: def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + models = OpenAIServingModels( engine_client=engine, base_model_paths=BASE_MODEL_PATHS, ) + serving_render = OpenAIServingRender( + model_config=engine.model_config, + renderer=engine.renderer, + io_processor=engine.io_processor, + served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + request_logger=None, + chat_template=None, + chat_template_content_format="auto", + ) return OpenAIServingCompletion( engine, models, + openai_serving_render=serving_render, request_logger=None, ) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index b0eda4b7d002..4842495b33b0 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -145,8 +145,19 @@ async def mock_generate(*args, **kwargs): base_model_paths=BASE_MODEL_PATHS, ) + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + + serving_render = OpenAIServingRender( + model_config=mock_engine.model_config, + renderer=mock_engine.renderer, + io_processor=mock_engine.io_processor, + served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + request_logger=None, + chat_template=None, + chat_template_content_format="auto", + ) serving_completion = OpenAIServingCompletion( - mock_engine, models, request_logger=None + mock_engine, models, openai_serving_render=serving_render, request_logger=None ) return mock_engine, serving_completion diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 49e4894ca8c8..9dcc8c420c9b 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -557,6 +557,20 @@ def _build_renderer(model_config: MockModelConfig): ) +def _build_serving_render(engine): + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + + return OpenAIServingRender( + model_config=engine.model_config, + renderer=engine.renderer, + io_processor=engine.io_processor, + served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + request_logger=None, + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + ) + + def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: models = OpenAIServingModels( engine_client=engine, @@ -566,6 +580,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: engine, models, response_role="assistant", + openai_serving_render=_build_serving_render(engine), chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", request_logger=None, @@ -585,11 +600,23 @@ class MockEngine: async def _async_serving_chat_init(): engine = MockEngine() + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + + serving_render = OpenAIServingRender( + model_config=engine.model_config, + renderer=engine.renderer, + io_processor=engine.io_processor, + served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + request_logger=None, + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + ) models = OpenAIServingModels(engine, BASE_MODEL_PATHS) serving_completion = OpenAIServingChat( engine, models, response_role="assistant", + openai_serving_render=serving_render, chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", request_logger=None, @@ -1694,6 +1721,7 @@ async def test_tool_choice_validation_without_parser(): mock_engine, models, response_role="assistant", + openai_serving_render=_build_serving_render(mock_engine), chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", request_logger=None, diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 9fd95d0c5782..5c8ce5a893ff 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -508,11 +508,25 @@ async def test_header_dp_rank_argument(): base_model_paths=BASE_MODEL_PATHS, ) + # Create render serving instance (required by OpenAIServingChat) + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + + serving_render = OpenAIServingRender( + model_config=engine.model_config, + renderer=engine.renderer, + io_processor=engine.io_processor, + served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + request_logger=None, + chat_template=None, + chat_template_content_format="auto", + ) + # Create serving chat instance serving_chat = OpenAIServingChat( engine_client=engine, models=models, response_role="assistant", + openai_serving_render=serving_render, chat_template=None, chat_template_content_format="auto", request_logger=None, From 6e286432c0cd397c06208187eed802eb18a2cece Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 9 Mar 2026 12:57:33 +0200 Subject: [PATCH 03/12] fix tests format Signed-off-by: Sage Ahrac --- tests/entrypoints/openai/test_chat_error.py | 3 +-- .../openai/test_completion_error.py | 3 +-- .../entrypoints/openai/test_lora_resolvers.py | 3 +-- tests/entrypoints/openai/test_serving_chat.py | 23 ++++++++----------- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index 9985bdfb7088..88c5cd5dceab 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -13,6 +13,7 @@ from vllm.entrypoints.openai.engine.protocol import GenerationError from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.outputs import CompletionOutput, RequestOutput from vllm.renderers.hf import HfRenderer from vllm.tokenizers.registry import tokenizer_args_from_config @@ -80,8 +81,6 @@ def _build_renderer(model_config: MockModelConfig): def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - models = OpenAIServingModels( engine_client=engine, base_model_paths=BASE_MODEL_PATHS, diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index 90f2aafa478b..335e8d67d531 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ b/tests/entrypoints/openai/test_completion_error.py @@ -13,6 +13,7 @@ from vllm.entrypoints.openai.engine.protocol import GenerationError from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.outputs import CompletionOutput, RequestOutput from vllm.renderers.hf import HfRenderer from vllm.tokenizers.registry import tokenizer_args_from_config @@ -70,8 +71,6 @@ class MockVllmConfig: def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - models = OpenAIServingModels( engine_client=engine, base_model_paths=BASE_MODEL_PATHS, diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 4842495b33b0..a36c594af5e9 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -14,6 +14,7 @@ from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.renderers.hf import HfRenderer @@ -145,8 +146,6 @@ async def mock_generate(*args, **kwargs): base_model_paths=BASE_MODEL_PATHS, ) - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - serving_render = OpenAIServingRender( model_config=mock_engine.model_config, renderer=mock_engine.renderer, diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 9dcc8c420c9b..90e94b1891bb 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -576,11 +576,13 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: engine_client=engine, base_model_paths=BASE_MODEL_PATHS, ) + openai_serving_render = _build_serving_render(engine) + serving_chat = OpenAIServingChat( engine, models, response_role="assistant", - openai_serving_render=_build_serving_render(engine), + openai_serving_render=openai_serving_render, chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", request_logger=None, @@ -600,23 +602,14 @@ class MockEngine: async def _async_serving_chat_init(): engine = MockEngine() - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - - serving_render = OpenAIServingRender( - model_config=engine.model_config, - renderer=engine.renderer, - io_processor=engine.io_processor, - served_model_names=[mp.name for mp in BASE_MODEL_PATHS], - request_logger=None, - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", - ) models = OpenAIServingModels(engine, BASE_MODEL_PATHS) + openai_serving_render = _build_serving_render(engine) + serving_completion = OpenAIServingChat( engine, models, response_role="assistant", - openai_serving_render=serving_render, + openai_serving_render=openai_serving_render, chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", request_logger=None, @@ -1716,12 +1709,14 @@ async def test_tool_choice_validation_without_parser(): engine_client=mock_engine, base_model_paths=BASE_MODEL_PATHS, ) + openai_serving_render = _build_serving_render(mock_engine) + # Create serving_chat without tool_parser (enable_auto_tools=False) serving_chat = OpenAIServingChat( mock_engine, models, response_role="assistant", - openai_serving_render=_build_serving_render(mock_engine), + openai_serving_render=openai_serving_render, chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", request_logger=None, From db4d4794aecb2e1a62dead52d9c9307e02e77ce3 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 9 Mar 2026 13:12:27 +0200 Subject: [PATCH 04/12] init OpenAIServingRender in AnthropicServingMessages Signed-off-by: Sage Ahrac --- vllm/entrypoints/anthropic/serving.py | 7 ++++++- vllm/entrypoints/openai/generate/api_router.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py index 85232e9185f5..e067c4587ae6 100644 --- a/vllm/entrypoints/anthropic/serving.py +++ b/vllm/entrypoints/anthropic/serving.py @@ -10,7 +10,7 @@ import time import uuid from collections.abc import AsyncGenerator -from typing import Any +from typing import TYPE_CHECKING, Any from fastapi import Request @@ -43,6 +43,9 @@ ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels +if TYPE_CHECKING: + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + logger = logging.getLogger(__name__) @@ -59,6 +62,7 @@ def __init__( models: OpenAIServingModels, response_role: str, *, + openai_serving_render: "OpenAIServingRender", request_logger: RequestLogger | None, chat_template: str | None, chat_template_content_format: ChatTemplateContentFormatOption, @@ -73,6 +77,7 @@ def __init__( engine_client=engine_client, models=models, response_role=response_role, + openai_serving_render=openai_serving_render, request_logger=request_logger, chat_template=chat_template, chat_template_content_format=chat_template_content_format, diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py index 35548313c9cd..a1f1cbfe24e6 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/openai/generate/api_router.py @@ -161,6 +161,7 @@ async def init_generate_state( engine_client, state.openai_serving_models, args.response_role, + openai_serving_render=state.openai_serving_render, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, From ef5b12ad26c3d7247964df3dc97a8d8ba9fd351a Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 9 Mar 2026 13:16:07 +0200 Subject: [PATCH 05/12] type check Signed-off-by: Sage Ahrac --- vllm/entrypoints/openai/chat_completion/serving.py | 6 +++--- vllm/entrypoints/openai/completion/serving.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 253f40294cbb..55a76657c2ba 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -8,9 +8,6 @@ from collections.abc import Sequence as GenericSequence from typing import TYPE_CHECKING, Any, Final -if TYPE_CHECKING: - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - import partial_json_parser import regex as re from fastapi import Request @@ -78,6 +75,9 @@ from vllm.utils.collection_utils import as_list from vllm.utils.mistral import is_mistral_tokenizer +if TYPE_CHECKING: + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index 8ff90a71ac38..c4a5bf1dcf73 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -7,9 +7,6 @@ from collections.abc import Sequence as GenericSequence from typing import TYPE_CHECKING, cast -if TYPE_CHECKING: - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - from fastapi import Request from vllm.engine.protocol import EngineClient @@ -45,6 +42,9 @@ from vllm.utils.async_utils import merge_async_iterators from vllm.utils.collection_utils import as_list +if TYPE_CHECKING: + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + logger = init_logger(__name__) From 7c7278351f21a73b6c9fbc4fd07b052987ff601b Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 9 Mar 2026 14:00:36 +0200 Subject: [PATCH 06/12] remove redundant try except Signed-off-by: Sage Ahrac --- vllm/entrypoints/serve/render/serving.py | 154 +++++++++++------------ 1 file changed, 72 insertions(+), 82 deletions(-) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index b424e24d92ca..7cba8b377c9a 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -6,7 +6,6 @@ from http import HTTPStatus from typing import Any -import jinja2 from openai_harmony import Message as OpenAIMessage from vllm.config import ModelConfig @@ -113,81 +112,76 @@ async def _preprocess_chat_request( Called directly by render_chat_request and delegated to by OpenAIServingChat.render_chat_request after its engine-aware checks. """ - try: - tokenizer = self.renderer.tokenizer - - tool_parser = self.tool_parser - - if is_mistral_tokenizer(tokenizer): - # because of issues with pydantic we need to potentially - # re-serialize the tool_calls field of the request - # for more info: see comment in `maybe_serialize_tool_calls` - _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] - _mt.truncate_tool_call_ids(request) # type: ignore[arg-type] - _mt.validate_request_params(request) - - # Check if tool parsing is unavailable (common condition) - tool_parsing_unavailable = ( - tool_parser is None - and not is_mistral_tokenizer(tokenizer) - and not self.use_harmony - ) - - # Validate tool_choice when tool parsing is required but unavailable - if tool_parsing_unavailable and request.tool_choice not in ( - None, - "none", - ): - if request.tool_choice == "auto" and not self.enable_auto_tools: - # for hf tokenizers, "auto" tools requires - # --enable-auto-tool-choice and --tool-call-parser - return self.create_error_response( - '"auto" tool choice requires ' - "--enable-auto-tool-choice and --tool-call-parser to be set" - ) - elif request.tool_choice != "auto": - # "required" or named tool requires tool parser - return self.create_error_response( - f'tool_choice="{request.tool_choice}" requires ' - "--tool-call-parser to be set" - ) + tokenizer = self.renderer.tokenizer + + tool_parser = self.tool_parser + + if is_mistral_tokenizer(tokenizer): + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` + _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] + _mt.truncate_tool_call_ids(request) # type: ignore[arg-type] + _mt.validate_request_params(request) + + # Check if tool parsing is unavailable (common condition) + tool_parsing_unavailable = ( + tool_parser is None + and not is_mistral_tokenizer(tokenizer) + and not self.use_harmony + ) - if request.tools is None or ( - request.tool_choice == "none" - and self.exclude_tools_when_tool_choice_none - ): - tool_dicts = None - else: - tool_dicts = [tool.model_dump() for tool in request.tools] - - if not self.use_harmony: - # Common case. - error_check_ret = self._validate_chat_template( - request_chat_template=request.chat_template, - chat_template_kwargs=request.chat_template_kwargs, - trust_request_chat_template=self.trust_request_chat_template, + # Validate tool_choice when tool parsing is required but unavailable + if tool_parsing_unavailable and request.tool_choice not in ( + None, + "none", + ): + if request.tool_choice == "auto" and not self.enable_auto_tools: + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + '"auto" tool choice requires ' + "--enable-auto-tool-choice and --tool-call-parser to be set" ) - if error_check_ret is not None: - return error_check_ret - - conversation, engine_prompts = await self._preprocess_chat( - request, - request.messages, - default_template=self.chat_template, - default_template_content_format=self.chat_template_content_format, - default_template_kwargs=self.default_chat_template_kwargs, - tool_dicts=tool_dicts, - tool_parser=tool_parser, + elif request.tool_choice != "auto": + # "required" or named tool requires tool parser + return self.create_error_response( + f'tool_choice="{request.tool_choice}" requires ' + "--tool-call-parser to be set" ) - else: - # For GPT-OSS. - should_include_tools = tool_dicts is not None - conversation, engine_prompts = self._make_request_with_harmony( - request, should_include_tools - ) - except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(e) + + if request.tools is None or ( + request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none + ): + tool_dicts = None + else: + tool_dicts = [tool.model_dump() for tool in request.tools] + + if not self.use_harmony: + # Common case. + error_check_ret = self._validate_chat_template( + request_chat_template=request.chat_template, + chat_template_kwargs=request.chat_template_kwargs, + trust_request_chat_template=self.trust_request_chat_template, + ) + if error_check_ret is not None: + return error_check_ret + + conversation, engine_prompts = await self._preprocess_chat( + request, + request.messages, + default_template=self.chat_template, + default_template_content_format=self.chat_template_content_format, + default_template_kwargs=self.default_chat_template_kwargs, + tool_dicts=tool_dicts, + tool_parser=tool_parser, + ) + else: + # For GPT-OSS. + should_include_tools = tool_dicts is not None + conversation, engine_prompts = self._make_request_with_harmony( + request, should_include_tools + ) return conversation, engine_prompts @@ -226,15 +220,11 @@ async def _preprocess_completion_request( "prompt_logprobs is not compatible with prompt embeds." ) - try: - engine_prompts = await self._preprocess_completion( - request, - prompt_input=request.prompt, - prompt_embeds=request.prompt_embeds, - ) - except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(e) + engine_prompts = await self._preprocess_completion( + request, + prompt_input=request.prompt, + prompt_embeds=request.prompt_embeds, + ) return engine_prompts From 5a6d93e870bc29473482fa59e7bffa5f38c357b5 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Thu, 12 Mar 2026 13:12:38 +0200 Subject: [PATCH 07/12] raise error instead of assert Signed-off-by: Sage Ahrac --- vllm/entrypoints/serve/render/serving.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index 95d7be3ac4ba..67e95401930d 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -240,11 +240,10 @@ def _make_request_with_harmony( # if the model supports it. TODO: Support browsing. assert not self.supports_browsing assert not self.supports_code_interpreter - assert request.reasoning_effort != "none", ( - "Harmony does not support reasoning_effort='none'" - ) + if (reasoning_effort := request.reasoning_effort) == "none": + raise ValueError(f"Harmony does not support {reasoning_effort=}") sys_msg = get_system_message( - reasoning_effort=request.reasoning_effort, + reasoning_effort=reasoning_effort, browser_description=None, python_description=None, with_custom_tools=should_include_tools, From 712158a2dff90bf3ea42f4c7bd9aaba726a97935 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Thu, 12 Mar 2026 13:32:27 +0200 Subject: [PATCH 08/12] CR changes - public namings Signed-off-by: Sage Ahrac --- vllm/entrypoints/openai/chat_completion/serving.py | 12 +----------- vllm/entrypoints/openai/completion/serving.py | 2 +- vllm/entrypoints/serve/render/serving.py | 9 +++++---- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index ba328677109e..bf8beb9b97ab 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -205,7 +205,7 @@ async def render_chat_request( if self.engine_client.errored: raise self.engine_client.dead_error - return await self.openai_serving_render._preprocess_chat_request(request) + return await self.openai_serving_render.render_chat(request) async def create_chat_completion( self, @@ -1806,13 +1806,3 @@ def _create_remaining_args_delta( ) ] ) - - def _make_request_with_harmony( - self, - request: ChatCompletionRequest, - should_include_tools: bool = True, - ): - """Delegates to OpenAIServingRender._make_request_with_harmony.""" - return self.openai_serving_render._make_request_with_harmony( - request, should_include_tools - ) diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index c4a5bf1dcf73..96cd7797c14d 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -103,7 +103,7 @@ async def render_completion_request( if self.engine_client.errored: raise self.engine_client.dead_error - return await self.openai_serving_render._preprocess_completion_request(request) + return await self.openai_serving_render.render_completion(request) async def create_completion( self, diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index 67e95401930d..0ff737824596 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -96,9 +96,9 @@ async def render_chat_request( if error_check_ret is not None: logger.error("Error with model %s", error_check_ret) return error_check_ret - return await self._preprocess_chat_request(request) + return await self.render_chat(request) - async def _preprocess_chat_request( + async def render_chat( self, request: ChatCompletionRequest, ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse: @@ -192,9 +192,9 @@ async def render_completion_request( error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret - return await self._preprocess_completion_request(request) + return await self.render_completion(request) - async def _preprocess_completion_request( + async def render_completion( self, request: CompletionRequest, ) -> list[ProcessorInputs] | ErrorResponse: @@ -228,6 +228,7 @@ def _make_request_with_harmony( request: ChatCompletionRequest, should_include_tools: bool = True, ): + """Build Harmony (GPT-OSS) messages and engine prompt from a chat request.""" messages: list[OpenAIMessage] = [] # because of issues with pydantic we need to potentially From 9dac904ea4485ce54f91b1b233a078fb69e082d2 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Thu, 12 Mar 2026 13:36:37 +0200 Subject: [PATCH 09/12] fix tests Signed-off-by: Sage Ahrac --- tests/entrypoints/openai/test_serving_chat.py | 52 ++++++++++++++----- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 90e94b1891bb..dd98e093d90b 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1202,7 +1202,9 @@ async def test_simple_chat(self, serving_chat, stream): # Test the Harmony messages for the first turn's input req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) - input_messages, _ = serving_chat._make_request_with_harmony(req) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) verify_harmony_messages( input_messages, [ @@ -1229,7 +1231,9 @@ async def test_simple_chat(self, serving_chat, stream): # Test the Harmony messages for the second turn's input req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) - input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) + input_messages_2, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req_2) + ) verify_harmony_messages( input_messages_2, [ @@ -1250,7 +1254,9 @@ async def test_tool_call_response_with_content( # Test the Harmony messages for the first turn's input req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages, _ = serving_chat._make_request_with_harmony(req) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) verify_harmony_messages( input_messages, [ @@ -1294,7 +1300,9 @@ async def test_tool_call_response_with_content( # Test the Harmony messages for the second turn's input req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) + input_messages_2, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req_2) + ) verify_harmony_messages( input_messages_2, [ @@ -1331,7 +1339,9 @@ async def test_tools_and_reasoning( # Test the Harmony messages for the first turn's input req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages, _ = serving_chat._make_request_with_harmony(req) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) verify_harmony_messages( input_messages, [ @@ -1375,7 +1385,9 @@ async def test_tools_and_reasoning( # Test the Harmony messages for the second turn's input req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) + input_messages_2, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req_2) + ) verify_harmony_messages( input_messages_2, [ @@ -1412,7 +1424,9 @@ async def test_multi_turn_tools_and_reasoning( # Test the Harmony messages for the first turn's input req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages, _ = serving_chat._make_request_with_harmony(req) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) verify_harmony_messages( input_messages, [ @@ -1456,7 +1470,9 @@ async def test_multi_turn_tools_and_reasoning( # Test the Harmony messages for the second turn's input req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) + input_messages_2, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req_2) + ) verify_harmony_messages( input_messages_2, [ @@ -1506,7 +1522,9 @@ async def test_multi_turn_tools_and_reasoning( # Test the Harmony messages for the third turn's input req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages_3, _ = serving_chat._make_request_with_harmony(req_3) + input_messages_3, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req_3) + ) verify_harmony_messages( input_messages_3, [ @@ -1569,7 +1587,9 @@ async def test_multi_turn_tools_and_reasoning( # Test the Harmony messages for the fourth turn's input req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) - input_messages_4, _ = serving_chat._make_request_with_harmony(req_4) + input_messages_4, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req_4) + ) verify_harmony_messages( input_messages_4, [ @@ -1618,7 +1638,9 @@ async def test_non_tool_reasoning(self, serving_chat): }, ] req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) - input_messages, _ = serving_chat._make_request_with_harmony(req) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) verify_harmony_messages( input_messages, @@ -1649,7 +1671,9 @@ async def test_non_tool_reasoning_empty_content(self, serving_chat): }, ] req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) - input_messages, _ = serving_chat._make_request_with_harmony(req) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) verify_harmony_messages( input_messages, @@ -1678,7 +1702,9 @@ async def test_non_tool_reasoning_empty_content_list(self, serving_chat): }, ] req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) - input_messages, _ = serving_chat._make_request_with_harmony(req) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) verify_harmony_messages( input_messages, From 26d9106b3da3960e48809e24fa8e48c11972a3a2 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Thu, 12 Mar 2026 13:58:48 +0200 Subject: [PATCH 10/12] pass model registry to openai renderer Signed-off-by: Sage Ahrac --- tests/entrypoints/openai/test_chat_error.py | 2 +- tests/entrypoints/openai/test_completion_error.py | 2 +- tests/entrypoints/openai/test_lora_resolvers.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 10 ++++++++-- tests/v1/engine/test_async_llm.py | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index 88c5cd5dceab..0739765639e9 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -89,7 +89,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: model_config=engine.model_config, renderer=engine.renderer, io_processor=engine.io_processor, - served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + model_registry=models.registry, request_logger=None, chat_template=None, chat_template_content_format="auto", diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index 335e8d67d531..c914e427d59c 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ b/tests/entrypoints/openai/test_completion_error.py @@ -79,7 +79,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: model_config=engine.model_config, renderer=engine.renderer, io_processor=engine.io_processor, - served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + model_registry=models.registry, request_logger=None, chat_template=None, chat_template_content_format="auto", diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index a36c594af5e9..4bcfff56072d 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -150,7 +150,7 @@ async def mock_generate(*args, **kwargs): model_config=mock_engine.model_config, renderer=mock_engine.renderer, io_processor=mock_engine.io_processor, - served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + model_registry=models.registry, request_logger=None, chat_template=None, chat_template_content_format="auto", diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index dd98e093d90b..91aff21cb2b7 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -21,7 +21,11 @@ ErrorResponse, RequestResponseMetadata, ) -from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels +from vllm.entrypoints.openai.models.serving import ( + BaseModelPath, + OpenAIModelRegistry, + OpenAIServingModels, +) from vllm.entrypoints.openai.parser.harmony_utils import get_encoding from vllm.exceptions import VLLMValidationError from vllm.inputs import TokensPrompt @@ -560,11 +564,13 @@ def _build_renderer(model_config: MockModelConfig): def _build_serving_render(engine): from vllm.entrypoints.serve.render.serving import OpenAIServingRender + model_registry = OpenAIModelRegistry(engine.model_config, BASE_MODEL_PATHS) + return OpenAIServingRender( model_config=engine.model_config, renderer=engine.renderer, io_processor=engine.io_processor, - served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + model_registry=model_registry, request_logger=None, chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 5c8ce5a893ff..69a1c38a453d 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -515,7 +515,7 @@ async def test_header_dp_rank_argument(): model_config=engine.model_config, renderer=engine.renderer, io_processor=engine.io_processor, - served_model_names=[mp.name for mp in BASE_MODEL_PATHS], + model_registry=models.registry, request_logger=None, chat_template=None, chat_template_content_format="auto", From 1d45de34ca96703f62e948b057d9f120c92a9f12 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Thu, 12 Mar 2026 14:04:01 +0200 Subject: [PATCH 11/12] reuse models.registry Signed-off-by: Sage Ahrac --- tests/entrypoints/openai/test_serving_chat.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 91aff21cb2b7..89affc561376 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -23,7 +23,6 @@ ) from vllm.entrypoints.openai.models.serving import ( BaseModelPath, - OpenAIModelRegistry, OpenAIServingModels, ) from vllm.entrypoints.openai.parser.harmony_utils import get_encoding @@ -561,11 +560,9 @@ def _build_renderer(model_config: MockModelConfig): ) -def _build_serving_render(engine): +def _build_serving_render(engine, model_registry): from vllm.entrypoints.serve.render.serving import OpenAIServingRender - model_registry = OpenAIModelRegistry(engine.model_config, BASE_MODEL_PATHS) - return OpenAIServingRender( model_config=engine.model_config, renderer=engine.renderer, @@ -582,7 +579,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: engine_client=engine, base_model_paths=BASE_MODEL_PATHS, ) - openai_serving_render = _build_serving_render(engine) + openai_serving_render = _build_serving_render(engine, models.registry) serving_chat = OpenAIServingChat( engine, @@ -609,7 +606,7 @@ async def _async_serving_chat_init(): engine = MockEngine() models = OpenAIServingModels(engine, BASE_MODEL_PATHS) - openai_serving_render = _build_serving_render(engine) + openai_serving_render = _build_serving_render(engine, models.registry) serving_completion = OpenAIServingChat( engine, @@ -1741,7 +1738,7 @@ async def test_tool_choice_validation_without_parser(): engine_client=mock_engine, base_model_paths=BASE_MODEL_PATHS, ) - openai_serving_render = _build_serving_render(mock_engine) + openai_serving_render = _build_serving_render(mock_engine, models.registry) # Create serving_chat without tool_parser (enable_auto_tools=False) serving_chat = OpenAIServingChat( From 1c9b09a555ad9125bee671e5156a167c970b06cb Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Thu, 12 Mar 2026 14:17:28 +0200 Subject: [PATCH 12/12] type hints Signed-off-by: Sage Ahrac --- tests/entrypoints/openai/test_serving_chat.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 89affc561376..3791faa386f3 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -23,9 +23,11 @@ ) from vllm.entrypoints.openai.models.serving import ( BaseModelPath, + OpenAIModelRegistry, OpenAIServingModels, ) from vllm.entrypoints.openai.parser.harmony_utils import get_encoding +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.exceptions import VLLMValidationError from vllm.inputs import TokensPrompt from vllm.outputs import CompletionOutput, RequestOutput @@ -560,9 +562,9 @@ def _build_renderer(model_config: MockModelConfig): ) -def _build_serving_render(engine, model_registry): - from vllm.entrypoints.serve.render.serving import OpenAIServingRender - +def _build_serving_render( + engine, model_registry: OpenAIModelRegistry +) -> OpenAIServingRender: return OpenAIServingRender( model_config=engine.model_config, renderer=engine.renderer,