diff --git a/tests/entrypoints/openai/chat_completion/test_chat_error.py b/tests/entrypoints/openai/chat_completion/test_chat_error.py
index 0739765639e9..5fd7bc09c273 100644
--- a/tests/entrypoints/openai/chat_completion/test_chat_error.py
+++ b/tests/entrypoints/openai/chat_completion/test_chat_error.py
@@ -111,7 +111,7 @@ async def _fake_preprocess_chat(*args, **kwargs):
             [{"prompt_token_ids": [1, 2, 3]}],
         )
 
-    serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
+    serving_chat.openai_serving_render.preprocess_chat = AsyncMock(
         side_effect=_fake_preprocess_chat
     )
     return serving_chat
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 126e2b4024e8..39e9076a7cc6 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -46,6 +46,7 @@
 from vllm.entrypoints.serve.elastic_ep.middleware import (
     ScalingMiddleware,
 )
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
 from vllm.entrypoints.utils import (
     cli_env_setup,
@@ -365,9 +366,27 @@ async def init_app_state(
         lora_modules=lora_modules,
     )
     await state.openai_serving_models.init_static_loras()
+
+    state.openai_serving_render = OpenAIServingRender(
+        model_config=engine_client.model_config,
+        renderer=engine_client.renderer,
+        io_processor=engine_client.io_processor,
+        model_registry=state.openai_serving_models.registry,
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+        trust_request_chat_template=args.trust_request_chat_template,
+        enable_auto_tools=args.enable_auto_tool_choice,
+        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
+        tool_parser=args.tool_call_parser,
+        default_chat_template_kwargs=args.default_chat_template_kwargs,
+        log_error_stack=args.log_error_stack,
+    )
+
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         state.openai_serving_models,
+        state.openai_serving_render,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py
index 88a059661c55..bda83fbe0f66 100644
--- a/vllm/entrypoints/openai/generate/api_router.py
+++ b/vllm/entrypoints/openai/generate/api_router.py
@@ -74,26 +74,7 @@ async def init_generate_state(
 
     # Render endpoints are always backed by OpenAIServingRender so that
     # /v1/chat/completions/render and /v1/completions/render work on both
-    # generate-mode and render-only servers.
-    # It is created first so that OpenAIServingChat and OpenAIServingCompletion
-    # can delegate their preprocessing logic to it.
-    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
-
-    state.openai_serving_render = OpenAIServingRender(
-        model_config=engine_client.model_config,
-        renderer=engine_client.renderer,
-        io_processor=engine_client.io_processor,
-        model_registry=state.openai_serving_models.registry,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        trust_request_chat_template=args.trust_request_chat_template,
-        enable_auto_tools=args.enable_auto_tool_choice,
-        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
-        tool_parser=args.tool_call_parser,
-        default_chat_template_kwargs=args.default_chat_template_kwargs,
-        log_error_stack=args.log_error_stack,
-    )
+    # generate-mode and render-only servers. Created in init_app_state.
 
     state.openai_serving_responses = (
         OpenAIServingResponses(
diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py
index 9dc410c9e34c..c54852fca8a4 100644
--- a/vllm/entrypoints/serve/render/serving.py
+++ b/vllm/entrypoints/serve/render/serving.py
@@ -226,7 +226,7 @@ async def render_chat(
 
         if not self.use_harmony:
             # Common case.
-            error_check_ret = self._validate_chat_template(
+            error_check_ret = self.validate_chat_template(
                 request_chat_template=request.chat_template,
                 chat_template_kwargs=request.chat_template_kwargs,
                 trust_request_chat_template=self.trust_request_chat_template,
@@ -234,7 +234,7 @@ async def render_chat(
             )
             if error_check_ret is not None:
                 return error_check_ret
-            conversation, engine_prompts = await self._preprocess_chat(
+            conversation, engine_prompts = await self.preprocess_chat(
                 request,
                 request.messages,
                 default_template=self.chat_template,
@@ -328,7 +328,7 @@ async def render_completion(
                     "prompt_logprobs is not compatible with prompt embeds."
                 )
 
-        engine_prompts = await self._preprocess_completion(
+        engine_prompts = await self.preprocess_completion(
             request,
             prompt_input=request.prompt,
             prompt_embeds=request.prompt_embeds,
@@ -426,7 +426,7 @@ async def _check_model(
     ) -> ErrorResponse | None:
         return await self.model_registry.check_model(request.model)
 
-    def _validate_chat_template(
+    def validate_chat_template(
         self,
         request_chat_template: str | None,
         chat_template_kwargs: dict[str, Any] | None,
@@ -447,7 +447,7 @@ def _validate_chat_template(
             )
         return None
 
-    async def _preprocess_completion(
+    async def preprocess_completion(
         self,
         request: Any,
         prompt_input: str | list[str] | list[int] | list[list[int]] | None,
@@ -490,7 +490,7 @@ async def _preprocess_cmpl(
                 },
             )
 
-    async def _preprocess_chat(
+    async def preprocess_chat(
         self,
         request: Any,
         messages: list[Any],
diff --git a/vllm/entrypoints/serve/tokenize/serving.py b/vllm/entrypoints/serve/tokenize/serving.py
index 233674aff6cd..d68651da828d 100644
--- a/vllm/entrypoints/serve/tokenize/serving.py
+++ b/vllm/entrypoints/serve/tokenize/serving.py
@@ -11,6 +11,7 @@
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.entrypoints.serve.tokenize.protocol import (
     DetokenizeRequest,
     DetokenizeResponse,
@@ -31,6 +32,7 @@ def __init__(
         self,
         engine_client: EngineClient,
         models: OpenAIServingModels,
+        openai_serving_render: OpenAIServingRender,
         *,
         request_logger: RequestLogger | None,
         chat_template: str | None,
@@ -44,6 +46,7 @@ def __init__(
             request_logger=request_logger,
         )
 
+        self.openai_serving_render = openai_serving_render
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
         self.default_chat_template_kwargs = default_chat_template_kwargs or {}
@@ -68,7 +71,7 @@ async def create_tokenize(
                 if request.tools is None
                 else [tool.model_dump() for tool in request.tools]
             )
-            error_check_ret = self._validate_chat_template(
+            error_check_ret = self.openai_serving_render.validate_chat_template(
                 request_chat_template=request.chat_template,
                 chat_template_kwargs=request.chat_template_kwargs,
                 trust_request_chat_template=self.trust_request_chat_template,
@@ -76,7 +79,7 @@ async def create_tokenize(
             )
             if error_check_ret is not None:
                 return error_check_ret
-            _, engine_prompts = await self._preprocess_chat(
+            _, engine_prompts = await self.openai_serving_render.preprocess_chat(
                 request,
                 request.messages,
                 default_template=self.chat_template,
@@ -85,7 +88,7 @@ async def create_tokenize(
                 tool_dicts=tool_dicts,
             )
         else:
-            engine_prompts = await self._preprocess_completion(
+            engine_prompts = await self.openai_serving_render.preprocess_completion(
                 request,
                 prompt_input=request.prompt,
                 prompt_embeds=None,
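
Reviewer note: the wiring above boils down to constructing one shared render component in init_app_state and injecting it into the tokenization handler, so both use identical preprocessing. Below is a minimal, self-contained sketch of that pattern; StubRender and StubTokenization are hypothetical stand-ins, not the real vLLM classes. It also shows why making preprocess_chat public lets tests patch it with a plain AsyncMock, mirroring the test_chat_error.py change above.

import asyncio
from types import SimpleNamespace
from unittest.mock import AsyncMock


class StubRender:
    """Stand-in for OpenAIServingRender (the real class takes the
    constructor arguments shown in the diff above)."""

    async def preprocess_chat(self, request, messages, **kwargs):
        # The real method renders the chat template into engine prompts.
        return [], [{"prompt_token_ids": [1, 2, 3]}]


class StubTokenization:
    """Stand-in for OpenAIServingTokenization after this change: it no
    longer calls its own _preprocess_chat, it delegates to the injected
    render component's public method."""

    def __init__(self, openai_serving_render):
        self.openai_serving_render = openai_serving_render

    async def create_tokenize(self, request):
        _, engine_prompts = await self.openai_serving_render.preprocess_chat(
            request, request.messages
        )
        return engine_prompts


async def main():
    render = StubRender()
    # Because preprocess_chat is now public, tests can patch it directly:
    render.preprocess_chat = AsyncMock(
        return_value=([], [{"prompt_token_ids": [4, 5, 6]}])
    )
    tokenization = StubTokenization(render)
    request = SimpleNamespace(messages=[{"role": "user", "content": "hi"}])
    # Prints [{'prompt_token_ids': [4, 5, 6]}], i.e. the patched result.
    print(await tokenization.create_tokenize(request))


asyncio.run(main())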