From 95341082494112d58fdec7a582d360a045a20ae3 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 09:46:32 +0200 Subject: [PATCH 1/8] [Frontend] Delegate tokenization serving preprocessing to OpenAIServingRender Signed-off-by: Sage Ahrac --- vllm/entrypoints/openai/api_server.py | 19 +++++++++++++++++ .../entrypoints/openai/generate/api_router.py | 21 +------------------ vllm/entrypoints/serve/tokenize/serving.py | 9 +++++--- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 126e2b4024e8..39e9076a7cc6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -46,6 +46,7 @@ from vllm.entrypoints.serve.elastic_ep.middleware import ( ScalingMiddleware, ) +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization from vllm.entrypoints.utils import ( cli_env_setup, @@ -365,9 +366,27 @@ async def init_app_state( lora_modules=lora_modules, ) await state.openai_serving_models.init_static_loras() + + state.openai_serving_render = OpenAIServingRender( + model_config=engine_client.model_config, + renderer=engine_client.renderer, + io_processor=engine_client.io_processor, + model_registry=state.openai_serving_models.registry, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + enable_auto_tools=args.enable_auto_tool_choice, + exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, + tool_parser=args.tool_call_parser, + default_chat_template_kwargs=args.default_chat_template_kwargs, + log_error_stack=args.log_error_stack, + ) + state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, state.openai_serving_models, + state.openai_serving_render, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py index 88a059661c55..bda83fbe0f66 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/openai/generate/api_router.py @@ -74,26 +74,7 @@ async def init_generate_state( # Render endpoints are always backed by OpenAIServingRender so that # /v1/chat/completions/render and /v1/completions/render work on both - # generate-mode and render-only servers. - # It is created first so that OpenAIServingChat and OpenAIServingCompletion - # can delegate their preprocessing logic to it. 
- from vllm.entrypoints.serve.render.serving import OpenAIServingRender - - state.openai_serving_render = OpenAIServingRender( - model_config=engine_client.model_config, - renderer=engine_client.renderer, - io_processor=engine_client.io_processor, - model_registry=state.openai_serving_models.registry, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, - trust_request_chat_template=args.trust_request_chat_template, - enable_auto_tools=args.enable_auto_tool_choice, - exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, - tool_parser=args.tool_call_parser, - default_chat_template_kwargs=args.default_chat_template_kwargs, - log_error_stack=args.log_error_stack, - ) + # generate-mode and render-only servers. Created in init_app_state. state.openai_serving_responses = ( OpenAIServingResponses( diff --git a/vllm/entrypoints/serve/tokenize/serving.py b/vllm/entrypoints/serve/tokenize/serving.py index 233674aff6cd..18e908104d09 100644 --- a/vllm/entrypoints/serve/tokenize/serving.py +++ b/vllm/entrypoints/serve/tokenize/serving.py @@ -11,6 +11,7 @@ from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.entrypoints.serve.tokenize.protocol import ( DetokenizeRequest, DetokenizeResponse, @@ -31,6 +32,7 @@ def __init__( self, engine_client: EngineClient, models: OpenAIServingModels, + openai_serving_render: OpenAIServingRender, *, request_logger: RequestLogger | None, chat_template: str | None, @@ -44,6 +46,7 @@ def __init__( request_logger=request_logger, ) + self.openai_serving_render = openai_serving_render self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format self.default_chat_template_kwargs = default_chat_template_kwargs or {} @@ -68,7 +71,7 @@ async def create_tokenize( if request.tools is None else [tool.model_dump() for tool in request.tools] ) - error_check_ret = self._validate_chat_template( + error_check_ret = self.openai_serving_render._validate_chat_template( request_chat_template=request.chat_template, chat_template_kwargs=request.chat_template_kwargs, trust_request_chat_template=self.trust_request_chat_template, @@ -76,7 +79,7 @@ async def create_tokenize( if error_check_ret is not None: return error_check_ret - _, engine_prompts = await self._preprocess_chat( + _, engine_prompts = await self.openai_serving_render._preprocess_chat( request, request.messages, default_template=self.chat_template, @@ -85,7 +88,7 @@ async def create_tokenize( tool_dicts=tool_dicts, ) else: - engine_prompts = await self._preprocess_completion( + engine_prompts = await self.openai_serving_render._preprocess_completion( request, prompt_input=request.prompt, prompt_embeds=None, From f538d00d6921aeb63288abb0073eb5da5b5731ea Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 10:26:02 +0200 Subject: [PATCH 2/8] cr fix Signed-off-by: Sage Ahrac --- .../openai/chat_completion/test_chat_error.py | 2 +- vllm/entrypoints/serve/render/serving.py | 12 ++++++------ vllm/entrypoints/serve/tokenize/serving.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai/chat_completion/test_chat_error.py b/tests/entrypoints/openai/chat_completion/test_chat_error.py index 
0739765639e9..5fd7bc09c273 100644 --- a/tests/entrypoints/openai/chat_completion/test_chat_error.py +++ b/tests/entrypoints/openai/chat_completion/test_chat_error.py @@ -111,7 +111,7 @@ async def _fake_preprocess_chat(*args, **kwargs): [{"prompt_token_ids": [1, 2, 3]}], ) - serving_chat.openai_serving_render._preprocess_chat = AsyncMock( + serving_chat.openai_serving_render.preprocess_chat = AsyncMock( side_effect=_fake_preprocess_chat ) return serving_chat diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index 9dc410c9e34c..c54852fca8a4 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -226,7 +226,7 @@ async def render_chat( if not self.use_harmony: # Common case. - error_check_ret = self._validate_chat_template( + error_check_ret = self.validate_chat_template( request_chat_template=request.chat_template, chat_template_kwargs=request.chat_template_kwargs, trust_request_chat_template=self.trust_request_chat_template, @@ -234,7 +234,7 @@ async def render_chat( if error_check_ret is not None: return error_check_ret - conversation, engine_prompts = await self._preprocess_chat( + conversation, engine_prompts = await self.preprocess_chat( request, request.messages, default_template=self.chat_template, @@ -328,7 +328,7 @@ async def render_completion( "prompt_logprobs is not compatible with prompt embeds." ) - engine_prompts = await self._preprocess_completion( + engine_prompts = await self.preprocess_completion( request, prompt_input=request.prompt, prompt_embeds=request.prompt_embeds, @@ -426,7 +426,7 @@ async def _check_model( ) -> ErrorResponse | None: return await self.model_registry.check_model(request.model) - def _validate_chat_template( + def validate_chat_template( self, request_chat_template: str | None, chat_template_kwargs: dict[str, Any] | None, @@ -447,7 +447,7 @@ def _validate_chat_template( ) return None - async def _preprocess_completion( + async def preprocess_completion( self, request: Any, prompt_input: str | list[str] | list[int] | list[list[int]] | None, @@ -490,7 +490,7 @@ async def _preprocess_cmpl( }, ) - async def _preprocess_chat( + async def preprocess_chat( self, request: Any, messages: list[Any], diff --git a/vllm/entrypoints/serve/tokenize/serving.py b/vllm/entrypoints/serve/tokenize/serving.py index 18e908104d09..d68651da828d 100644 --- a/vllm/entrypoints/serve/tokenize/serving.py +++ b/vllm/entrypoints/serve/tokenize/serving.py @@ -71,7 +71,7 @@ async def create_tokenize( if request.tools is None else [tool.model_dump() for tool in request.tools] ) - error_check_ret = self.openai_serving_render._validate_chat_template( + error_check_ret = self.openai_serving_render.validate_chat_template( request_chat_template=request.chat_template, chat_template_kwargs=request.chat_template_kwargs, trust_request_chat_template=self.trust_request_chat_template, @@ -79,7 +79,7 @@ async def create_tokenize( if error_check_ret is not None: return error_check_ret - _, engine_prompts = await self.openai_serving_render._preprocess_chat( + _, engine_prompts = await self.openai_serving_render.preprocess_chat( request, request.messages, default_template=self.chat_template, @@ -88,7 +88,7 @@ async def create_tokenize( tool_dicts=tool_dicts, ) else: - engine_prompts = await self.openai_serving_render._preprocess_completion( + engine_prompts = await self.openai_serving_render.preprocess_completion( request, prompt_input=request.prompt, prompt_embeds=None, From 
b7e985e200fa72f00c2954ad4c20ad44ab9a87ed Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 10:42:49 +0200 Subject: [PATCH 3/8] responses delegation Signed-off-by: Sage Ahrac --- tests/entrypoints/openai/test_serving_responses.py | 4 ++++ vllm/entrypoints/openai/generate/api_router.py | 1 + vllm/entrypoints/openai/responses/serving.py | 5 ++++- vllm/entrypoints/serve/render/serving.py | 13 +++++-------- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py index 0ad1e1c93094..b5d2b24a63a5 100644 --- a/tests/entrypoints/openai/test_serving_responses.py +++ b/tests/entrypoints/openai/test_serving_responses.py @@ -159,6 +159,7 @@ async def serving_responses_instance(self): instance = OpenAIServingResponses( engine_client=engine_client, models=models, + openai_serving_render=MagicMock(), request_logger=None, chat_template=None, chat_template_content_format="auto", @@ -245,6 +246,7 @@ async def serving_responses_instance(self): instance = OpenAIServingResponses( engine_client=engine_client, models=models, + openai_serving_render=MagicMock(), request_logger=None, chat_template=None, chat_template_content_format="auto", @@ -308,6 +310,7 @@ def get_vocab(self): serving = OpenAIServingResponses( engine_client=engine_client, models=models, + openai_serving_render=MagicMock(), request_logger=None, chat_template=None, chat_template_content_format="auto", @@ -607,6 +610,7 @@ def _make_serving_instance_with_reasoning(): serving = OpenAIServingResponses( engine_client=engine_client, models=models, + openai_serving_render=MagicMock(), request_logger=None, chat_template=None, chat_template_content_format="auto", diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py index bda83fbe0f66..6b8796f3a08c 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/openai/generate/api_router.py @@ -80,6 +80,7 @@ async def init_generate_state( OpenAIServingResponses( engine_client, state.openai_serving_models, + state.openai_serving_render, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 6d0041813e35..be4d511d9335 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -105,6 +105,7 @@ construct_tool_dicts, extract_tool_types, ) +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.entrypoints.utils import get_max_tokens from vllm.exceptions import VLLMValidationError from vllm.inputs.data import ProcessorInputs, token_inputs @@ -165,6 +166,7 @@ def __init__( self, engine_client: EngineClient, models: OpenAIServingModels, + openai_serving_render: OpenAIServingRender, *, request_logger: RequestLogger | None, chat_template: str | None, @@ -185,6 +187,7 @@ def __init__( return_tokens_as_token_ids=return_tokens_as_token_ids, ) + self.openai_serving_render = openai_serving_render self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format self.enable_log_outputs = enable_log_outputs @@ -587,7 +590,7 @@ async def _make_request( prev_response_output=prev_response.output if prev_response else None, ) - _, engine_prompts = await self._preprocess_chat( + _, engine_prompts = await 
self.openai_serving_render.preprocess_chat( request, messages, default_template=self.chat_template, diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index c54852fca8a4..b0ed7d284d8c 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -24,6 +24,7 @@ parse_chat_inputs_to_harmony_messages, render_for_completion, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.entrypoints.serve.disagg.protocol import ( GenerateRequest, MultiModalFeatures, @@ -500,11 +501,7 @@ async def preprocess_chat( tool_dicts: list[dict[str, Any]] | None = None, tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, ) -> tuple[list[ConversationMessage], list[ProcessorInputs]]: - """Copied from OpenAIServing._preprocess_chat. - - Differences: isinstance check is ChatCompletionRequest-only - (ResponsesRequest not supported here); TODO comment dropped accordingly. - """ + """Copied from OpenAIServing._preprocess_chat.""" renderer = self.renderer mm_config = self.model_config.multimodal_config @@ -542,11 +539,11 @@ async def preprocess_chat( if tool_parser is not None: tool_choice = getattr(request, "tool_choice", "none") if tool_choice != "none": - if not isinstance(request, ChatCompletionRequest): + if not isinstance(request, ChatCompletionRequest | ResponsesRequest): msg = ( "Tool usage is only supported " - " for ChatCompletionRequest, but got " - f"{type(request).__name__}" + "for Chat Completions API or Responses API requests, " + f"but got {type(request).__name__}" ) raise NotImplementedError(msg) tokenizer = renderer.get_tokenizer() From 908f7ecea2cd11a8853d6bb8a9ae6e6b0824a45d Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 11:52:25 +0200 Subject: [PATCH 4/8] Expose preprocess_cmpl as public method in OpenAIServingRender Signed-off-by: Sage Ahrac --- vllm/entrypoints/serve/render/serving.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index b0ed7d284d8c..d1c5acad8c72 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -460,9 +460,9 @@ async def preprocess_completion( prompts.extend(prompt_to_seq(prompt_embeds)) if prompt_input is not None: prompts.extend(prompt_to_seq(prompt_input)) - return await self._preprocess_cmpl(request, prompts) + return await self.preprocess_cmpl(request, prompts) - async def _preprocess_cmpl( + async def preprocess_cmpl( self, request: Any, prompts: Sequence[PromptType | bytes], From 4a156c3fd1cf880adf1337974996422b2a09de7b Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 11:52:34 +0200 Subject: [PATCH 5/8] Delegate ServingTokens preprocessing to OpenAIServingRender Signed-off-by: Sage Ahrac --- vllm/entrypoints/openai/generate/api_router.py | 1 + vllm/entrypoints/serve/disagg/serving.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py index 6b8796f3a08c..c81c295e4597 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/openai/generate/api_router.py @@ -158,6 +158,7 @@ async def init_generate_state( ServingTokens( engine_client, state.openai_serving_models, + state.openai_serving_render, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, 
enable_prompt_tokens_details=args.enable_prompt_tokens_details, diff --git a/vllm/entrypoints/serve/disagg/serving.py b/vllm/entrypoints/serve/disagg/serving.py index 322314907dd8..46f68d535253 100644 --- a/vllm/entrypoints/serve/disagg/serving.py +++ b/vllm/entrypoints/serve/disagg/serving.py @@ -29,6 +29,7 @@ GenerateResponse, GenerateResponseChoice, ) +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.outputs import RequestOutput @@ -45,6 +46,7 @@ def __init__( self, engine_client: EngineClient, models: OpenAIServingModels, + openai_serving_render: OpenAIServingRender, *, request_logger: RequestLogger | None, force_no_detokenize: bool = False, @@ -58,6 +60,7 @@ def __init__( request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, ) + self.openai_serving_render = openai_serving_render self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_log_outputs = enable_log_outputs self.force_no_detokenize = force_no_detokenize @@ -96,7 +99,7 @@ async def serve_tokens( if raw_request: raw_request.state.request_metadata = request_metadata - engine_prompts = await self._preprocess_completion( + engine_prompts = await self.openai_serving_render.preprocess_completion( request, prompt_input=request.token_ids, prompt_embeds=None, From c7115199e0f11aceee0d620361393e2ab4489b7f Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 11:53:01 +0200 Subject: [PATCH 6/8] Delegate OpenAIServingPooling preprocessing to OpenAIServingRender Signed-off-by: Sage Ahrac --- vllm/entrypoints/pooling/__init__.py | 1 + vllm/entrypoints/pooling/pooling/serving.py | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/pooling/__init__.py b/vllm/entrypoints/pooling/__init__.py index f64675e56b68..d2baea8959d2 100644 --- a/vllm/entrypoints/pooling/__init__.py +++ b/vllm/entrypoints/pooling/__init__.py @@ -68,6 +68,7 @@ def init_pooling_state( OpenAIServingPooling( engine_client, state.openai_serving_models, + state.openai_serving_render, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index bcd331b01435..54151ccb7130 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -32,6 +32,7 @@ encode_pooling_output_base64, encode_pooling_output_float, ) +from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.inputs import ProcessorInputs from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput @@ -47,6 +48,7 @@ def __init__( self, engine_client: EngineClient, models: OpenAIServingModels, + openai_serving_render: OpenAIServingRender, *, request_logger: RequestLogger | None, chat_template: str | None, @@ -59,6 +61,7 @@ def __init__( request_logger=request_logger, ) + self.openai_serving_render = openai_serving_render self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format self.trust_request_chat_template = trust_request_chat_template @@ -101,12 +104,12 @@ async def create_pooling( raw_prompts = await self.io_processor.pre_process_async( prompt=validated_prompt, request_id=request_id ) - engine_prompts = await self._preprocess_cmpl( + engine_prompts = await self.openai_serving_render.preprocess_cmpl( 
request, prompt_to_seq(raw_prompts), ) elif isinstance(request, PoolingChatRequest): - error_check_ret = self._validate_chat_template( + error_check_ret = self.openai_serving_render.validate_chat_template( request_chat_template=request.chat_template, chat_template_kwargs=request.chat_template_kwargs, trust_request_chat_template=self.trust_request_chat_template, @@ -114,7 +117,7 @@ async def create_pooling( if error_check_ret is not None: return error_check_ret - _, engine_prompts = await self._preprocess_chat( + _, engine_prompts = await self.openai_serving_render.preprocess_chat( request, request.messages, default_template=self.chat_template, @@ -122,7 +125,7 @@ async def create_pooling( default_template_kwargs=None, ) elif isinstance(request, PoolingCompletionRequest): - engine_prompts = await self._preprocess_completion( + engine_prompts = await self.openai_serving_render.preprocess_completion( request, prompt_input=request.input, prompt_embeds=None, From 7071929db7bec0c779a53cfa16edb2f77888709e Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 11:53:09 +0200 Subject: [PATCH 7/8] Move _render_next_turn and _generate_with_builtin_tools to OpenAIServingResponses Signed-off-by: Sage Ahrac --- vllm/entrypoints/openai/responses/serving.py | 110 ++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index be4d511d9335..dd42a6a56600 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -5,11 +5,11 @@ import time import uuid from collections import deque -from collections.abc import AsyncGenerator, AsyncIterator, Callable, Sequence +from collections.abc import AsyncGenerator, AsyncIterator, Callable, Mapping, Sequence from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus -from typing import Final +from typing import Any, Final from fastapi import Request from openai.types.responses import ( @@ -86,6 +86,7 @@ ResponseCompletedEvent, ResponseCreatedEvent, ResponseInProgressEvent, + ResponseInputOutputItem, ResponseInputOutputMessage, ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent, @@ -112,10 +113,12 @@ from vllm.logger import init_logger from vllm.logprobs import Logprob as SampleLogprob from vllm.logprobs import SampleLogprobs +from vllm.lora.request import LoRARequest from vllm.outputs import CompletionOutput from vllm.parser import ParserManager from vllm.sampling_params import SamplingParams, StructuredOutputsParams from vllm.tokenizers import TokenizerLike +from vllm.tool_parsers import ToolParser from vllm.utils import random_uuid from vllm.utils.collection_utils import as_list @@ -601,6 +604,109 @@ async def _make_request( ) return messages, engine_prompts + async def _render_next_turn( + self, + request: ResponsesRequest, + messages: list[ResponseInputOutputItem], + tool_dicts: list[dict[str, Any]] | None, + tool_parser: Callable[[TokenizerLike], ToolParser] | None, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, + ): + new_messages = construct_input_messages( + request_input=messages, + ) + + _, engine_prompts = await self.openai_serving_render.preprocess_chat( + request, + new_messages, + default_template=chat_template, + default_template_content_format=chat_template_content_format, + default_template_kwargs=None, + tool_dicts=tool_dicts, + tool_parser=tool_parser, + ) + return engine_prompts + + async 
def _generate_with_builtin_tools( + self, + request_id: str, + engine_prompt: ProcessorInputs, + sampling_params: SamplingParams, + context: ConversationContext, + lora_request: LoRARequest | None = None, + priority: int = 0, + trace_headers: Mapping[str, str] | None = None, + ): + max_model_len = self.model_config.max_model_len + + orig_priority = priority + sub_request = 0 + while True: + # Ensure that each sub-request has a unique request id. + sub_request_id = f"{request_id}_{sub_request}" + + self._log_inputs( + sub_request_id, + engine_prompt, + params=sampling_params, + lora_request=lora_request, + ) + + generator = self.engine_client.generate( + engine_prompt, + sampling_params, + sub_request_id, + lora_request=lora_request, + trace_headers=trace_headers, + priority=priority, + ) + + async for res in generator: + context.append_output(res) + # NOTE(woosuk): The stop condition is handled by the engine. + yield context + + if not context.need_builtin_tool_call(): + # The model did not ask for a tool call, so we're done. + break + + # Call the tool and update the context with the result. + tool_output = await context.call_tool() + context.append_tool_output(tool_output) + + # TODO: uncomment this and enable tool output streaming + # yield context + + # Create inputs for the next turn. + # Render the next prompt token ids and update sampling_params. + if isinstance(context, (HarmonyContext, StreamingHarmonyContext)): + token_ids = context.render_for_completion() + engine_prompt = token_inputs(token_ids) + + sampling_params.max_tokens = max_model_len - len(token_ids) + elif isinstance(context, ParsableContext): + (engine_prompt,) = await self._render_next_turn( + context.request, + context.parser.response_messages, + context.tool_dicts, + context.tool_parser_cls, + context.chat_template, + context.chat_template_content_format, + ) + + sampling_params.max_tokens = get_max_tokens( + max_model_len, + context.request.max_output_tokens, + self._extract_prompt_len(engine_prompt), + self.default_sampling_params, # type: ignore + self.override_max_tokens, # type: ignore + ) + + # OPTIMIZATION + priority = orig_priority - 1 + sub_request += 1 + def _make_request_with_harmony( self, request: ResponsesRequest, From 1f5b68c5837474ef624afc00a6b6e34745f9db00 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Tue, 17 Mar 2026 11:53:19 +0200 Subject: [PATCH 8/8] Remove preprocessing methods from OpenAIServing Signed-off-by: Sage Ahrac --- vllm/entrypoints/openai/engine/serving.py | 229 +--------------------- 1 file changed, 3 insertions(+), 226 deletions(-) diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 2049b3adfd3c..405db1a134c1 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -4,7 +4,7 @@ import contextlib import json import time -from collections.abc import AsyncGenerator, Callable, Mapping, Sequence +from collections.abc import AsyncGenerator, Callable, Mapping from dataclasses import dataclass, field from http import HTTPStatus from typing import Any, ClassVar, Generic, Protocol, TypeAlias, TypeVar @@ -22,9 +22,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( - ChatCompletionMessageParam, ChatTemplateContentFormatOption, - ConversationMessage, ) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.chat_completion.protocol import ( @@ -43,19 +41,9 @@ GenerationError, ) from 
vllm.entrypoints.openai.models.serving import OpenAIServingModels -from vllm.entrypoints.openai.responses.context import ( - ConversationContext, - HarmonyContext, - ParsableContext, - StreamingHarmonyContext, -) from vllm.entrypoints.openai.responses.protocol import ( - ResponseInputOutputItem, ResponsesRequest, ) -from vllm.entrypoints.openai.responses.utils import ( - construct_input_messages, -) from vllm.entrypoints.openai.speech_to_text.protocol import ( TranscriptionRequest, TranscriptionResponse, @@ -82,26 +70,22 @@ TokenizeCompletionRequest, TokenizeResponse, ) -from vllm.entrypoints.utils import create_error_response, get_max_tokens +from vllm.entrypoints.utils import create_error_response from vllm.exceptions import VLLMValidationError from vllm.inputs.data import ( ProcessorInputs, PromptType, - SingletonPrompt, TokensPrompt, - token_inputs, ) from vllm.logger import init_logger from vllm.logprobs import Logprob, PromptLogprobs from vllm.lora.request import LoRARequest from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams -from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs +from vllm.renderers import ChatParams, TokenizeParams from vllm.renderers.inputs.preprocess import ( extract_prompt_components, extract_prompt_len, - parse_model_prompt, - prompt_to_seq, ) from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.tokenizers import TokenizerLike @@ -116,7 +100,6 @@ collect_from_async_generator, merge_async_iterators, ) -from vllm.utils.mistral import is_mistral_tokenizer logger = init_logger(__name__) @@ -823,109 +806,6 @@ def _prepare_extra_chat_template_kwargs( # Apply server defaults first, then request kwargs override. return default_chat_template_kwargs | request_chat_template_kwargs - async def _preprocess_completion( - self, - request: RendererRequest, - prompt_input: str | list[str] | list[int] | list[list[int]] | None, - prompt_embeds: bytes | list[bytes] | None, - ) -> list[ProcessorInputs]: - prompts = list[SingletonPrompt | bytes]() - if prompt_embeds is not None: # embeds take higher priority - prompts.extend(prompt_to_seq(prompt_embeds)) - if prompt_input is not None: - prompts.extend(prompt_to_seq(prompt_input)) - - return await self._preprocess_cmpl(request, prompts) - - async def _preprocess_cmpl( - self, - request: RendererRequest, - prompts: Sequence[PromptType | bytes], - ) -> list[ProcessorInputs]: - renderer = self.renderer - model_config = self.model_config - - parsed_prompts = [ - ( - prompt - if isinstance(prompt, bytes) - else parse_model_prompt(model_config, prompt) - ) - for prompt in prompts - ] - tok_params = request.build_tok_params(model_config) - - return await renderer.render_cmpl_async( - parsed_prompts, - tok_params, - prompt_extras={ - k: v - for k in ("mm_processor_kwargs", "cache_salt") - if (v := getattr(request, k, None)) is not None - }, - ) - - async def _preprocess_chat( - self, - request: RendererChatRequest, - messages: list[ChatCompletionMessageParam], - default_template: str | None, - default_template_content_format: ChatTemplateContentFormatOption, - default_template_kwargs: dict[str, Any] | None, - tool_dicts: list[dict[str, Any]] | None = None, - tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, - ) -> tuple[list[ConversationMessage], list[ProcessorInputs]]: - renderer = self.renderer - - default_template_kwargs = merge_kwargs( - default_template_kwargs, - dict( - tools=tool_dicts, - 
tokenize=is_mistral_tokenizer(renderer.tokenizer), - ), - ) - - mm_config = self.model_config.multimodal_config - - tok_params = request.build_tok_params(self.model_config) - chat_params = request.build_chat_params( - default_template, default_template_content_format - ).with_defaults( - default_template_kwargs, - default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None), - default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None), - ) - - (conversation,), (engine_prompt,) = await renderer.render_chat_async( - [messages], - chat_params, - tok_params, - prompt_extras={ - k: v - for k in ("mm_processor_kwargs", "cache_salt") - if (v := getattr(request, k, None)) is not None - }, - ) - - # tool parsing is done only if a tool_parser has been set and if - # tool_choice is not "none" (if tool_choice is "none" but a tool_parser - # is set, we want to prevent parsing a tool_call hallucinated by the LLM - if tool_parser is not None: - tool_choice = getattr(request, "tool_choice", "none") - if tool_choice != "none": - if not isinstance(request, ChatCompletionRequest | ResponsesRequest): - msg = ( - "Tool usage is only supported for Chat Completions API " - "or Responses API requests." - ) - raise NotImplementedError(msg) - - # TODO: Update adjust_request to accept ResponsesRequest - tokenizer = renderer.get_tokenizer() - request = tool_parser(tokenizer).adjust_request(request=request) # type: ignore[arg-type] - - return conversation, [engine_prompt] - def _extract_prompt_components(self, prompt: PromptType | ProcessorInputs): return extract_prompt_components(self.model_config, prompt) @@ -935,109 +815,6 @@ def _extract_prompt_text(self, prompt: ProcessorInputs): def _extract_prompt_len(self, prompt: ProcessorInputs): return extract_prompt_len(self.model_config, prompt) - async def _render_next_turn( - self, - request: ResponsesRequest, - messages: list[ResponseInputOutputItem], - tool_dicts: list[dict[str, Any]] | None, - tool_parser: Callable[[TokenizerLike], ToolParser] | None, - chat_template: str | None, - chat_template_content_format: ChatTemplateContentFormatOption, - ): - new_messages = construct_input_messages( - request_input=messages, - ) - - _, engine_prompts = await self._preprocess_chat( - request, - new_messages, - default_template=chat_template, - default_template_content_format=chat_template_content_format, - default_template_kwargs=None, - tool_dicts=tool_dicts, - tool_parser=tool_parser, - ) - return engine_prompts - - async def _generate_with_builtin_tools( - self, - request_id: str, - engine_prompt: ProcessorInputs, - sampling_params: SamplingParams, - context: ConversationContext, - lora_request: LoRARequest | None = None, - priority: int = 0, - trace_headers: Mapping[str, str] | None = None, - ): - max_model_len = self.model_config.max_model_len - - orig_priority = priority - sub_request = 0 - while True: - # Ensure that each sub-request has a unique request id. - sub_request_id = f"{request_id}_{sub_request}" - - self._log_inputs( - sub_request_id, - engine_prompt, - params=sampling_params, - lora_request=lora_request, - ) - - generator = self.engine_client.generate( - engine_prompt, - sampling_params, - sub_request_id, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - async for res in generator: - context.append_output(res) - # NOTE(woosuk): The stop condition is handled by the engine. - yield context - - if not context.need_builtin_tool_call(): - # The model did not ask for a tool call, so we're done. 
- break - - # Call the tool and update the context with the result. - tool_output = await context.call_tool() - context.append_tool_output(tool_output) - - # TODO: uncomment this and enable tool output streaming - # yield context - - # Create inputs for the next turn. - # Render the next prompt token ids and update sampling_params. - if isinstance(context, (HarmonyContext, StreamingHarmonyContext)): - token_ids = context.render_for_completion() - engine_prompt = token_inputs(token_ids) - - sampling_params.max_tokens = max_model_len - len(token_ids) - elif isinstance(context, ParsableContext): - (engine_prompt,) = await self._render_next_turn( - context.request, - context.parser.response_messages, - context.tool_dicts, - context.tool_parser_cls, - context.chat_template, - context.chat_template_content_format, - ) - - sampling_params.max_tokens = get_max_tokens( - max_model_len, - context.request.max_output_tokens, - self._extract_prompt_len(engine_prompt), - self.default_sampling_params, # type: ignore - self.override_max_tokens, # type: ignore - ) - - # OPTIMIZATION - priority = orig_priority - 1 - sub_request += 1 - def _log_inputs( self, request_id: str,
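Note (illustrative, not part of the patch series): the patches above converge on a single pattern — serving classes receive an OpenAIServingRender instance through their constructor and delegate preprocessing to its public validate_chat_template / preprocess_chat / preprocess_completion methods (made public in PATCH 2/8 and 4/8), instead of inheriting the private helpers that PATCH 8/8 removes from OpenAIServing. A minimal sketch of a consumer under that assumption follows; the MyServing class and its create() method are hypothetical stand-ins for the real serving classes, and only the OpenAIServingRender method names and keyword arguments shown in the diffs are taken from the series.

    from vllm.entrypoints.serve.render.serving import OpenAIServingRender


    class MyServing:  # hypothetical consumer, mirroring OpenAIServingTokenization
        def __init__(self, render: OpenAIServingRender) -> None:
            # Constructor injection: preprocessing is delegated to the shared
            # render component rather than inherited from OpenAIServing.
            self.openai_serving_render = render

        async def create(self, request):
            # Validate any request-supplied chat template first, as the
            # tokenize/pooling/responses endpoints do after this series.
            err = self.openai_serving_render.validate_chat_template(
                request_chat_template=request.chat_template,
                chat_template_kwargs=request.chat_template_kwargs,
                trust_request_chat_template=False,
            )
            if err is not None:
                return err
            # Delegate chat preprocessing; returns (conversation, engine_prompts).
            _, engine_prompts = await self.openai_serving_render.preprocess_chat(
                request,
                request.messages,
                default_template=None,
                default_template_content_format="auto",
                default_template_kwargs=None,
            )
            return engine_prompts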