15 changes: 14 additions & 1 deletion tests/entrypoints/openai/test_chat_error.py
@@ -13,6 +13,7 @@
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -84,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
serving_chat = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=serving_render,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
@@ -100,7 +111,9 @@ async def _fake_preprocess_chat(*args, **kwargs):
[{"prompt_token_ids": [1, 2, 3]}],
)

serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
side_effect=_fake_preprocess_chat
)
return serving_chat


11 changes: 11 additions & 0 deletions tests/entrypoints/openai/test_completion_error.py
@@ -13,6 +13,7 @@
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -74,9 +75,19 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
return OpenAIServingCompletion(
engine,
models,
openai_serving_render=serving_render,
request_logger=None,
)

12 changes: 11 additions & 1 deletion tests/entrypoints/openai/test_lora_resolvers.py
@@ -14,6 +14,7 @@
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.renderers.hf import HfRenderer
@@ -145,8 +146,17 @@ async def mock_generate(*args, **kwargs):
base_model_paths=BASE_MODEL_PATHS,
)

serving_render = OpenAIServingRender(
model_config=mock_engine.model_config,
renderer=mock_engine.renderer,
io_processor=mock_engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
serving_completion = OpenAIServingCompletion(
mock_engine, models, request_logger=None
mock_engine, models, openai_serving_render=serving_render, request_logger=None
)

return mock_engine, serving_completion
82 changes: 68 additions & 14 deletions tests/entrypoints/openai/test_serving_chat.py
@@ -21,8 +21,13 @@
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.models.serving import (
BaseModelPath,
OpenAIModelRegistry,
OpenAIServingModels,
)
from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import VLLMValidationError
from vllm.inputs import TokensPrompt
from vllm.outputs import CompletionOutput, RequestOutput
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
)


def _build_serving_render(
engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
return OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=model_registry,
request_logger=None,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
)


def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels(
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
openai_serving_render = _build_serving_render(engine, models.registry)

serving_chat = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
engine = MockEngine()

models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
openai_serving_render = _build_serving_render(engine, models.registry)

serving_completion = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
@@ -1182,7 +1207,9 @@ async def test_simple_chat(self, serving_chat, stream):

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1209,7 +1236,9 @@ async def test_simple_chat(self, serving_chat, stream):

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
Expand All @@ -1230,7 +1259,9 @@ async def test_tool_call_response_with_content(

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1274,7 +1305,9 @@ async def test_tool_call_response_with_content(

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1311,7 +1344,9 @@ async def test_tools_and_reasoning(

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1355,7 +1390,9 @@ async def test_tools_and_reasoning(

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1392,7 +1429,9 @@ async def test_multi_turn_tools_and_reasoning(

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1436,7 +1475,9 @@ async def test_multi_turn_tools_and_reasoning(

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1486,7 +1527,9 @@ async def test_multi_turn_tools_and_reasoning(

# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
input_messages_3, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_3)
)
verify_harmony_messages(
input_messages_3,
[
@@ -1549,7 +1592,9 @@ async def test_multi_turn_tools_and_reasoning(

# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
input_messages_4, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_4)
)
verify_harmony_messages(
input_messages_4,
[
@@ -1598,7 +1643,9 @@ async def test_non_tool_reasoning(self, serving_chat):
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)

verify_harmony_messages(
input_messages,
@@ -1629,7 +1676,9 @@ async def test_non_tool_reasoning_empty_content(self, serving_chat):
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)

verify_harmony_messages(
input_messages,
@@ -1658,7 +1707,9 @@ async def test_non_tool_reasoning_empty_content_list(self, serving_chat):
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)

verify_harmony_messages(
input_messages,
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
)
openai_serving_render = _build_serving_render(mock_engine, models.registry)

# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat = OpenAIServingChat(
mock_engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
14 changes: 14 additions & 0 deletions tests/v1/engine/test_async_llm.py
@@ -508,11 +508,25 @@ async def test_header_dp_rank_argument():
base_model_paths=BASE_MODEL_PATHS,
)

# Create render serving instance (required by OpenAIServingChat)
from vllm.entrypoints.serve.render.serving import OpenAIServingRender

serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)

# Create serving chat instance
serving_chat = OpenAIServingChat(
engine_client=engine,
models=models,
response_role="assistant",
openai_serving_render=serving_render,
chat_template=None,
chat_template_content_format="auto",
request_logger=None,
7 changes: 6 additions & 1 deletion vllm/entrypoints/anthropic/serving.py
@@ -10,7 +10,7 @@
import time
import uuid
from collections.abc import AsyncGenerator
from typing import Any
from typing import TYPE_CHECKING, Any

from fastapi import Request

@@ -43,6 +43,9 @@
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels

if TYPE_CHECKING:
from vllm.entrypoints.serve.render.serving import OpenAIServingRender

logger = logging.getLogger(__name__)


@@ -59,6 +62,7 @@ def __init__(
models: OpenAIServingModels,
response_role: str,
*,
openai_serving_render: "OpenAIServingRender",
request_logger: RequestLogger | None,
chat_template: str | None,
chat_template_content_format: ChatTemplateContentFormatOption,
@@ -73,6 +77,7 @@
engine_client=engine_client,
models=models,
response_role=response_role,
openai_serving_render=openai_serving_render,
request_logger=request_logger,
chat_template=chat_template,
chat_template_content_format=chat_template_content_format,
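Note on the wiring this PR applies (not part of the diff itself): each caller now builds an OpenAIServingRender and passes it to the chat/completion handlers through the new openai_serving_render keyword, and chat preprocessing (_preprocess_chat, _make_request_with_harmony) is reached through that object. The sketch below is a minimal illustration that mirrors the _build_serving_chat / _build_serving_completion test helpers above; `engine` and `BASE_MODEL_PATHS` are assumed test fixtures (an AsyncLLM and a list of BaseModelPath), and the import path for OpenAIServingChat is assumed since it is not shown in this diff.

```python
# Sketch only: mirrors the test helpers added in this PR. `engine` and
# BASE_MODEL_PATHS are assumed fixtures; OpenAIServingChat's import is assumed.
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender

models = OpenAIServingModels(
    engine_client=engine,
    base_model_paths=BASE_MODEL_PATHS,
)

# Chat-template / Harmony request construction is delegated to this object.
serving_render = OpenAIServingRender(
    model_config=engine.model_config,
    renderer=engine.renderer,
    io_processor=engine.io_processor,
    model_registry=models.registry,
    request_logger=None,
    chat_template=None,
    chat_template_content_format="auto",
)

# The chat handler (and, analogously, OpenAIServingCompletion) now takes it
# via the new keyword argument.
serving_chat = OpenAIServingChat(
    engine,
    models,
    response_role="assistant",
    openai_serving_render=serving_render,
    chat_template=None,
    chat_template_content_format="auto",
    request_logger=None,
)
```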