diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py
index d6f32bab7008..0739765639e9 100644
--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -13,6 +13,7 @@
 from vllm.entrypoints.openai.engine.protocol import GenerationError
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.renderers.hf import HfRenderer
 from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -84,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
         engine_client=engine,
         base_model_paths=BASE_MODEL_PATHS,
     )
+    serving_render = OpenAIServingRender(
+        model_config=engine.model_config,
+        renderer=engine.renderer,
+        io_processor=engine.io_processor,
+        model_registry=models.registry,
+        request_logger=None,
+        chat_template=None,
+        chat_template_content_format="auto",
+    )
     serving_chat = OpenAIServingChat(
         engine,
         models,
         response_role="assistant",
+        openai_serving_render=serving_render,
         request_logger=None,
         chat_template=None,
         chat_template_content_format="auto",
@@ -100,7 +111,9 @@ async def _fake_preprocess_chat(*args, **kwargs):
             [{"prompt_token_ids": [1, 2, 3]}],
         )
 
-    serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
+    serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
+        side_effect=_fake_preprocess_chat
+    )
 
     return serving_chat
diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py
index 2372126d91f3..c914e427d59c 100644
--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
@@ -13,6 +13,7 @@
 from vllm.entrypoints.openai.engine.protocol import GenerationError
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.renderers.hf import HfRenderer
 from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -74,9 +75,19 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
         engine_client=engine,
         base_model_paths=BASE_MODEL_PATHS,
     )
+    serving_render = OpenAIServingRender(
+        model_config=engine.model_config,
+        renderer=engine.renderer,
+        io_processor=engine.io_processor,
+        model_registry=models.registry,
+        request_logger=None,
+        chat_template=None,
+        chat_template_content_format="auto",
+    )
     return OpenAIServingCompletion(
         engine,
         models,
+        openai_serving_render=serving_render,
         request_logger=None,
     )
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index b0eda4b7d002..4bcfff56072d 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -14,6 +14,7 @@
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
 from vllm.renderers.hf import HfRenderer
@@ -145,8 +146,17 @@ async def mock_generate(*args, **kwargs):
         base_model_paths=BASE_MODEL_PATHS,
     )
 
+    serving_render = OpenAIServingRender(
+        model_config=mock_engine.model_config,
+        renderer=mock_engine.renderer,
+        io_processor=mock_engine.io_processor,
+        model_registry=models.registry,
+        request_logger=None,
+        chat_template=None,
+        chat_template_content_format="auto",
+    )
     serving_completion = OpenAIServingCompletion(
-        mock_engine, models, request_logger=None
+        mock_engine, models, openai_serving_render=serving_render, request_logger=None
     )
 
     return mock_engine, serving_completion
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 49e4894ca8c8..3791faa386f3 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -21,8 +21,13 @@
     ErrorResponse,
     RequestResponseMetadata,
 )
-from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
+from vllm.entrypoints.openai.models.serving import (
+    BaseModelPath,
+    OpenAIModelRegistry,
+    OpenAIServingModels,
+)
 from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.exceptions import VLLMValidationError
 from vllm.inputs import TokensPrompt
 from vllm.outputs import CompletionOutput, RequestOutput
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
     )
 
 
+def _build_serving_render(
+    engine, model_registry: OpenAIModelRegistry
+) -> OpenAIServingRender:
+    return OpenAIServingRender(
+        model_config=engine.model_config,
+        renderer=engine.renderer,
+        io_processor=engine.io_processor,
+        model_registry=model_registry,
+        request_logger=None,
+        chat_template=CHAT_TEMPLATE,
+        chat_template_content_format="auto",
+    )
+
+
 def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
     models = OpenAIServingModels(
         engine_client=engine,
         base_model_paths=BASE_MODEL_PATHS,
     )
+    openai_serving_render = _build_serving_render(engine, models.registry)
+
     serving_chat = OpenAIServingChat(
         engine,
         models,
         response_role="assistant",
+        openai_serving_render=openai_serving_render,
         chat_template=CHAT_TEMPLATE,
         chat_template_content_format="auto",
         request_logger=None,
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
     engine = MockEngine()
     models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
 
+    openai_serving_render = _build_serving_render(engine, models.registry)
+
     serving_completion = OpenAIServingChat(
         engine,
         models,
         response_role="assistant",
+        openai_serving_render=openai_serving_render,
         chat_template=CHAT_TEMPLATE,
         chat_template_content_format="auto",
         request_logger=None,
@@ -1182,7 +1207,9 @@ async def test_simple_chat(self, serving_chat, stream):
 
         # Test the Harmony messages for the first turn's input
         req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
         verify_harmony_messages(
             input_messages,
             [
@@ -1209,7 +1236,9 @@
 
         # Test the Harmony messages for the second turn's input
         req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
         verify_harmony_messages(
             input_messages_2,
             [
@@ -1230,7 +1259,9 @@ async def test_tool_call_response_with_content(
 
         # Test the Harmony messages for the first turn's input
         req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
         verify_harmony_messages(
             input_messages,
             [
@@ -1274,7 +1305,9 @@ async def test_tool_call_response_with_content(
 
         # Test the Harmony messages for the second turn's input
         req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
         verify_harmony_messages(
             input_messages_2,
             [
@@ -1311,7 +1344,9 @@ async def test_tools_and_reasoning(
 
         # Test the Harmony messages for the first turn's input
         req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
         verify_harmony_messages(
             input_messages,
             [
@@ -1355,7 +1390,9 @@ async def test_tools_and_reasoning(
 
         # Test the Harmony messages for the second turn's input
         req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
         verify_harmony_messages(
             input_messages_2,
             [
@@ -1392,7 +1429,9 @@ async def test_multi_turn_tools_and_reasoning(
 
         # Test the Harmony messages for the first turn's input
         req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
         verify_harmony_messages(
             input_messages,
             [
@@ -1436,7 +1475,9 @@ async def test_multi_turn_tools_and_reasoning(
 
         # Test the Harmony messages for the second turn's input
         req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
         verify_harmony_messages(
             input_messages_2,
             [
@@ -1486,7 +1527,9 @@ async def test_multi_turn_tools_and_reasoning(
 
         # Test the Harmony messages for the third turn's input
         req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
+        input_messages_3, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_3)
+        )
         verify_harmony_messages(
             input_messages_3,
             [
@@ -1549,7 +1592,9 @@ async def test_multi_turn_tools_and_reasoning(
 
         # Test the Harmony messages for the fourth turn's input
         req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
+        input_messages_4, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_4)
+        )
         verify_harmony_messages(
             input_messages_4,
             [
@@ -1598,7 +1643,9 @@ async def test_non_tool_reasoning(self, serving_chat):
             },
         ]
         req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
 
         verify_harmony_messages(
             input_messages,
@@ -1629,7 +1676,9 @@ async def test_non_tool_reasoning_empty_content(self, serving_chat):
             },
         ]
         req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
 
         verify_harmony_messages(
             input_messages,
@@ -1658,7 +1707,9 @@ async def test_non_tool_reasoning_empty_content_list(self, serving_chat):
             },
         ]
         req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
 
         verify_harmony_messages(
             input_messages,
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
         engine_client=mock_engine,
         base_model_paths=BASE_MODEL_PATHS,
     )
+    openai_serving_render = _build_serving_render(mock_engine, models.registry)
+
     # Create serving_chat without tool_parser (enable_auto_tools=False)
     serving_chat = OpenAIServingChat(
         mock_engine,
         models,
         response_role="assistant",
+        openai_serving_render=openai_serving_render,
         chat_template=CHAT_TEMPLATE,
         chat_template_content_format="auto",
         request_logger=None,
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 9fd95d0c5782..69a1c38a453d 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -508,11 +508,25 @@ async def test_header_dp_rank_argument():
         base_model_paths=BASE_MODEL_PATHS,
     )
 
+    # Create render serving instance (required by OpenAIServingChat)
+    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+
+    serving_render = OpenAIServingRender(
+        model_config=engine.model_config,
+        renderer=engine.renderer,
+        io_processor=engine.io_processor,
+        model_registry=models.registry,
+        request_logger=None,
+        chat_template=None,
+        chat_template_content_format="auto",
+    )
+
     # Create serving chat instance
     serving_chat = OpenAIServingChat(
         engine_client=engine,
         models=models,
         response_role="assistant",
+        openai_serving_render=serving_render,
         chat_template=None,
         chat_template_content_format="auto",
         request_logger=None,
diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index a536ae77ad0f..f301ed499f86 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -10,7 +10,7 @@
 import time
 import uuid
 from collections.abc import AsyncGenerator
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from fastapi import Request
 
@@ -43,6 +43,9 @@
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+
 logger = logging.getLogger(__name__)
 
 
@@ -59,6 +62,7 @@ def __init__(
         models: OpenAIServingModels,
         response_role: str,
         *,
+        openai_serving_render: "OpenAIServingRender",
         request_logger: RequestLogger | None,
         chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
@@ -73,6 +77,7 @@ def __init__(
             engine_client=engine_client,
             models=models,
             response_role=response_role,
+            openai_serving_render=openai_serving_render,
             request_logger=request_logger,
             chat_template=chat_template,
             chat_template_content_format=chat_template_content_format,
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index 802eee1ccbb4..bf8beb9b97ab 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -6,12 +6,11 @@
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Any, Final
+from typing import TYPE_CHECKING, Any, Final
 
 import partial_json_parser
 import regex as re
 from fastapi import Request
-from openai_harmony import Message as OpenAIMessage
 from partial_json_parser.core.options import Allow
 
 from vllm.engine.protocol import EngineClient
@@ -56,17 +55,13 @@
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.parser.harmony_utils import (
-    get_developer_message,
     get_stop_tokens_for_assistant_actions,
     get_streamable_parser_for_assistant,
-    get_system_message,
-    parse_chat_inputs_to_harmony_messages,
     parse_chat_output,
-    render_for_completion,
 )
 from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage
-from vllm.inputs.data import ProcessorInputs, TokensPrompt
+from vllm.inputs.data import ProcessorInputs
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
@@ -80,7 +75,9 @@
 from vllm.tool_parsers.utils import partial_json_loads
 from vllm.utils.collection_utils import as_list
 from vllm.utils.mistral import is_mistral_tokenizer
-from vllm.utils.mistral import mt as _mt
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 
 logger = init_logger(__name__)
@@ -92,6 +89,7 @@ def __init__(
         models: OpenAIServingModels,
         response_role: str,
         *,
+        openai_serving_render: "OpenAIServingRender",
         request_logger: RequestLogger | None,
         chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
@@ -114,6 +112,7 @@
             return_tokens_as_token_ids=return_tokens_as_token_ids,
         )
 
+        self.openai_serving_render = openai_serving_render
         self.response_role = response_role
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
@@ -186,7 +185,10 @@ async def render_chat_request(
         request: ChatCompletionRequest,
     ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
         """
-        render chat request by validating and preprocessing inputs.
+        Validate the model and preprocess a chat completion request.
+
+        Delegates preprocessing logic to OpenAIServingRender, adding the
+        engine-aware checks (LoRA model validation, engine health).
 
         Returns:
             A tuple of (conversation, engine_prompts) on success,
@@ -203,78 +205,7 @@
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        tokenizer = self.renderer.tokenizer
-
-        tool_parser = self.tool_parser
-
-        if is_mistral_tokenizer(tokenizer):
-            # because of issues with pydantic we need to potentially
-            # re-serialize the tool_calls field of the request
-            # for more info: see comment in `maybe_serialize_tool_calls`
-            _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]
-            _mt.truncate_tool_call_ids(request)  # type: ignore[arg-type]
-            _mt.validate_request_params(request)
-
-        # Check if tool parsing is unavailable (common condition)
-        tool_parsing_unavailable = (
-            tool_parser is None
-            and not is_mistral_tokenizer(tokenizer)
-            and not self.use_harmony
-        )
-
-        # Validate tool_choice when tool parsing is required but unavailable
-        if tool_parsing_unavailable and request.tool_choice not in (
-            None,
-            "none",
-        ):
-            if request.tool_choice == "auto" and not self.enable_auto_tools:
-                # for hf tokenizers, "auto" tools requires
-                # --enable-auto-tool-choice and --tool-call-parser
-                return self.create_error_response(
-                    '"auto" tool choice requires '
-                    "--enable-auto-tool-choice and --tool-call-parser to be set"
-                )
-            elif request.tool_choice != "auto":
-                # "required" or named tool requires tool parser
-                return self.create_error_response(
-                    f'tool_choice="{request.tool_choice}" requires '
-                    "--tool-call-parser to be set"
-                )
-
-        if request.tools is None or (
-            request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
-        ):
-            tool_dicts = None
-        else:
-            tool_dicts = [tool.model_dump() for tool in request.tools]
-
-        if not self.use_harmony:
-            # Common case.
-            error_check_ret = self._validate_chat_template(
-                request_chat_template=request.chat_template,
-                chat_template_kwargs=request.chat_template_kwargs,
-                trust_request_chat_template=self.trust_request_chat_template,
-            )
-            if error_check_ret is not None:
-                return error_check_ret
-
-            conversation, engine_prompts = await self._preprocess_chat(
-                request,
-                request.messages,
-                default_template=self.chat_template,
-                default_template_content_format=self.chat_template_content_format,
-                default_template_kwargs=self.default_chat_template_kwargs,
-                tool_dicts=tool_dicts,
-                tool_parser=tool_parser,
-            )
-        else:
-            # For GPT-OSS.
-            should_include_tools = tool_dicts is not None
-            conversation, engine_prompts = self._make_request_with_harmony(
-                request, should_include_tools
-            )
-
-        return conversation, engine_prompts
+        return await self.openai_serving_render.render_chat(request)
 
     async def create_chat_completion(
         self,
@@ -1875,50 +1806,3 @@
                 )
             ]
         )
-
-    def _make_request_with_harmony(
-        self,
-        request: ChatCompletionRequest,
-        should_include_tools: bool = True,
-    ):
-        messages: list[OpenAIMessage] = []
-
-        # because of issues with pydantic we need to potentially
-        # re-serialize the tool_calls field of the request
-        # for more info: see comment in `maybe_serialize_tool_calls`
-        _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]
-
-        # Add system message.
-        # NOTE: In Chat Completion API, browsing is enabled by default
-        # if the model supports it. TODO: Support browsing.
-        assert not self.supports_browsing
-        assert not self.supports_code_interpreter
-        if (reasoning_effort := request.reasoning_effort) == "none":
-            raise ValueError(f"Harmony does not support {reasoning_effort=}")
-        sys_msg = get_system_message(
-            reasoning_effort=reasoning_effort,
-            browser_description=None,
-            python_description=None,
-            with_custom_tools=should_include_tools,
-        )
-        messages.append(sys_msg)
-
-        # Add developer message.
-        if request.tools:
-            dev_msg = get_developer_message(
-                tools=request.tools if should_include_tools else None  # type: ignore[arg-type]
-            )
-            messages.append(dev_msg)
-
-        # Add user message.
-        messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
-
-        # Render prompt token ids.
-        prompt_token_ids = render_for_completion(messages)
-        engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
-
-        # Add cache_salt if provided in the request
-        if request.cache_salt is not None:
-            engine_prompt["cache_salt"] = request.cache_salt
-
-        return messages, [engine_prompt]
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index dc5ef563959d..96cd7797c14d 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -5,7 +5,7 @@
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from fastapi import Request
 
@@ -42,6 +42,9 @@
 from vllm.utils.async_utils import merge_async_iterators
 from vllm.utils.collection_utils import as_list
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+
 logger = init_logger(__name__)
@@ -51,6 +54,7 @@ def __init__(
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
+        openai_serving_render: "OpenAIServingRender",
         request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         enable_prompt_tokens_details: bool = False,
@@ -63,6 +67,7 @@
             return_tokens_as_token_ids=return_tokens_as_token_ids,
         )
 
+        self.openai_serving_render = openai_serving_render
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
@@ -79,7 +84,10 @@ async def render_completion_request(
         request: CompletionRequest,
     ) -> list[ProcessorInputs] | ErrorResponse:
         """
-        render completion request by validating and preprocessing inputs.
+        Validate the model and preprocess a completion request.
+
+        Delegates preprocessing logic to OpenAIServingRender, adding the
+        engine-aware checks (LoRA model validation, engine health).
 
         Returns:
             A list of engine_prompts on success,
@@ -95,25 +103,7 @@
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        # Return error for unsupported features.
-        if request.suffix is not None:
-            return self.create_error_response("suffix is not currently supported")
-
-        if request.echo and request.prompt_embeds is not None:
-            return self.create_error_response("Echo is unsupported with prompt embeds.")
-
-        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
-            return self.create_error_response(
-                "prompt_logprobs is not compatible with prompt embeds."
-            )
-
-        engine_prompts = await self._preprocess_completion(
-            request,
-            prompt_input=request.prompt,
-            prompt_embeds=request.prompt_embeds,
-        )
-
-        return engine_prompts
+        return await self.openai_serving_render.render_completion(request)
 
     async def create_completion(
         self,
diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py
index 2d9e63158f0c..88a059661c55 100644
--- a/vllm/entrypoints/openai/generate/api_router.py
+++ b/vllm/entrypoints/openai/generate/api_router.py
@@ -72,6 +72,29 @@ async def init_generate_state(
     tool_server = None
     resolved_chat_template = load_chat_template(args.chat_template)
 
+    # Render endpoints are always backed by OpenAIServingRender so that
+    # /v1/chat/completions/render and /v1/completions/render work on both
+    # generate-mode and render-only servers.
+    # It is created first so that OpenAIServingChat and OpenAIServingCompletion
+    # can delegate their preprocessing logic to it.
+    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+
+    state.openai_serving_render = OpenAIServingRender(
+        model_config=engine_client.model_config,
+        renderer=engine_client.renderer,
+        io_processor=engine_client.io_processor,
+        model_registry=state.openai_serving_models.registry,
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+        trust_request_chat_template=args.trust_request_chat_template,
+        enable_auto_tools=args.enable_auto_tool_choice,
+        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
+        tool_parser=args.tool_call_parser,
+        default_chat_template_kwargs=args.default_chat_template_kwargs,
+        log_error_stack=args.log_error_stack,
+    )
+
     state.openai_serving_responses = (
         OpenAIServingResponses(
             engine_client,
@@ -96,6 +119,7 @@
             engine_client,
             state.openai_serving_models,
             args.response_role,
+            openai_serving_render=state.openai_serving_render,
             request_logger=request_logger,
             chat_template=resolved_chat_template,
             chat_template_content_format=args.chat_template_content_format,
@@ -120,6 +144,7 @@
         OpenAIServingCompletion(
             engine_client,
             state.openai_serving_models,
+            openai_serving_render=state.openai_serving_render,
             request_logger=request_logger,
             return_tokens_as_token_ids=args.return_tokens_as_token_ids,
             enable_prompt_tokens_details=args.enable_prompt_tokens_details,
@@ -133,6 +158,7 @@
             engine_client,
             state.openai_serving_models,
             args.response_role,
+            openai_serving_render=state.openai_serving_render,
             request_logger=request_logger,
             chat_template=resolved_chat_template,
             chat_template_content_format=args.chat_template_content_format,
@@ -159,24 +185,3 @@
         if "generate" in supported_tasks
         else None
     )
-
-    # Render endpoints are always backed by OpenAIServingRender so that
-    # /v1/chat/completions/render and /v1/completions/render work on both
-    # generate-mode and render-only servers.
-    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
-
-    state.openai_serving_render = OpenAIServingRender(
-        model_config=engine_client.model_config,
-        renderer=engine_client.renderer,
-        io_processor=engine_client.io_processor,
-        model_registry=state.openai_serving_models.registry,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        trust_request_chat_template=args.trust_request_chat_template,
-        enable_auto_tools=args.enable_auto_tool_choice,
-        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
-        tool_parser=args.tool_call_parser,
-        default_chat_template_kwargs=args.default_chat_template_kwargs,
-        log_error_stack=args.log_error_stack,
-    )
diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py
index c5a79191e4df..0ff737824596 100644
--- a/vllm/entrypoints/serve/render/serving.py
+++ b/vllm/entrypoints/serve/render/serving.py
@@ -87,15 +87,26 @@ async def render_chat_request(
         self,
         request: ChatCompletionRequest,
     ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
-        """Copied from OpenAIServingChat.render_chat_request.
+        """Validate the model and preprocess a chat completion request.
 
-        Differences: engine_client.errored check removed (no engine client).
+        This is the authoritative implementation used directly by the
+        GPU-less render server and delegated to by OpenAIServingChat.
         """
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             logger.error("Error with model %s", error_check_ret)
             return error_check_ret
+        return await self.render_chat(request)
 
+    async def render_chat(
+        self,
+        request: ChatCompletionRequest,
+    ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
+        """Core preprocessing logic for chat requests (no model/engine check).
+
+        Called directly by render_chat_request and delegated to by
+        OpenAIServingChat.render_chat_request after its engine-aware checks.
+        """
         tokenizer = self.renderer.tokenizer
 
         tool_parser = self.tool_parser
@@ -173,14 +184,25 @@ async def render_completion_request(
         self,
         request: CompletionRequest,
     ) -> list[ProcessorInputs] | ErrorResponse:
-        """Copied from OpenAIServingCompletion.render_completion_request.
+        """Validate the model and preprocess a completion request.
 
-        Differences: engine_client.errored check removed (no engine client).
+        This is the authoritative implementation used directly by the
+        GPU-less render server and delegated to by OpenAIServingCompletion.
         """
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret
+        return await self.render_completion(request)
 
+    async def render_completion(
+        self,
+        request: CompletionRequest,
+    ) -> list[ProcessorInputs] | ErrorResponse:
+        """Core preprocessing logic for completion requests (no model/engine check).
+
+        Called directly by render_completion_request and delegated to by
+        OpenAIServingCompletion.render_completion_request after its engine-aware checks.
+        """
         # Return error for unsupported features.
         if request.suffix is not None:
             return self.create_error_response("suffix is not currently supported")
@@ -206,7 +228,7 @@ def _make_request_with_harmony(
         request: ChatCompletionRequest,
         should_include_tools: bool = True,
     ):
-        """Copied from OpenAIServingChat._make_request_with_harmony."""
+        """Build Harmony (GPT-OSS) messages and engine prompt from a chat request."""
         messages: list[OpenAIMessage] = []
 
         # because of issues with pydantic we need to potentially
@@ -219,11 +241,10 @@ def _make_request_with_harmony(
         # if the model supports it. TODO: Support browsing.
         assert not self.supports_browsing
         assert not self.supports_code_interpreter
-        assert request.reasoning_effort != "none", (
-            "Harmony does not support reasoning_effort='none'"
-        )
+        if (reasoning_effort := request.reasoning_effort) == "none":
+            raise ValueError(f"Harmony does not support {reasoning_effort=}")
         sys_msg = get_system_message(
-            reasoning_effort=request.reasoning_effort,
+            reasoning_effort=reasoning_effort,
             browser_description=None,
             python_description=None,
             with_custom_tools=should_include_tools,