14 changes: 13 additions & 1 deletion docs/features/reasoning_outputs.md
@@ -240,9 +240,21 @@ response = client.chat.completions.create(
)
```

The same `chat_template_kwargs` override is also supported on the `/v1/responses`
endpoint:

```python
response = client.responses.create(
    model=model,
    input="Compute 23 * 17 and explain briefly.",
    reasoning={"effort": "low"},
    extra_body={"chat_template_kwargs": {"enable_thinking": True}},
)
```
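
The per-request value takes precedence over any server-wide default configured with `--default-chat-template-kwargs` (for example, a server launched with `'{"enable_thinking": false}'`, as in the tests added by this PR). Below is a minimal sketch of checking that the override took effect; it reuses the `response` object from the example above, and the `"reasoning"` item type mirrors what the new tests assert:

```python
# Sketch only: assumes the server default disables thinking and the request
# above re-enables it via chat_template_kwargs, so reasoning output is expected.
reasoning_items = [item for item in response.output if item.type == "reasoning"]
assert reasoning_items, "expected reasoning items when enable_thinking is overridden"
print(response.output_text)
```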

## Limitations

- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
- The reasoning content is only available for online serving's chat completion and responses endpoints (`/v1/chat/completions` and `/v1/responses`).

## How to support a new reasoning model

80 changes: 80 additions & 0 deletions tests/entrypoints/openai/responses/test_chat_template_kwargs.py
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import pytest_asyncio
from openai import OpenAI

from tests.utils import RemoteOpenAIServer

from .conftest import BASE_TEST_ENV

MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def server():
    args = [
        "--reasoning-parser",
        "qwen3",
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        "4096",
        "--default-chat-template-kwargs",
        '{"enable_thinking": false}',
    ]
    env_dict = {
        **BASE_TEST_ENV,
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
    }
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_responses_honors_default_chat_template_kwargs(
    client: OpenAI, model_name: str
):
    response = await client.responses.create(
        model=model_name,
        input="Compute 17 * 19 and explain briefly.",
        reasoning={"effort": "low"},
        temperature=0.0,
    )

    reasoning_items = [item for item in response.output if item.type == "reasoning"]

    assert response.status == "completed"
    assert response.output_text
    assert not reasoning_items


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_responses_request_chat_template_kwargs_override_server_default(
    client: OpenAI, model_name: str
):
    response = await client.responses.create(
        model=model_name,
        input="Compute 23 * 17 and explain briefly.",
        reasoning={"effort": "low"},
        temperature=0.0,
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )

    reasoning_items = [item for item in response.output if item.type == "reasoning"]

    assert response.status == "completed"
    assert response.usage is not None
    assert response.usage.output_tokens_details.reasoning_tokens > 0
    assert reasoning_items
    assert reasoning_items[0].content
30 changes: 30 additions & 0 deletions tests/entrypoints/openai/responses/test_protocol.py
@@ -5,6 +5,7 @@
)

from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
    serialize_message,
    serialize_messages,
)
@@ -37,3 +38,32 @@ def test_serialize_messages() -> None:
    }
    msg = Message.from_dict(msg_value)
    assert serialize_messages([msg, dict_value]) == [msg_value, dict_value]


def test_responses_request_accepts_chat_template_kwargs() -> None:
    request = ResponsesRequest(
        input="Hello",
        chat_template_kwargs={"enable_thinking": False},
    )

    assert request.chat_template_kwargs == {"enable_thinking": False}


def test_build_chat_params_merges_responses_chat_template_kwargs() -> None:
    request = ResponsesRequest(
        input="Hello",
        chat_template_kwargs={"enable_thinking": False},
        reasoning={"effort": "low"},
    )

    chat_params = request.build_chat_params(
        default_template=None,
        default_template_content_format="auto",
    )

    assert chat_params.chat_template_kwargs == {
        "enable_thinking": False,
        "add_generation_prompt": True,
        "continue_final_message": False,
        "reasoning_effort": "low",
    }
134 changes: 133 additions & 1 deletion tests/entrypoints/openai/responses/test_serving_responses.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from contextlib import AsyncExitStack
from unittest.mock import MagicMock
from unittest.mock import AsyncMock, MagicMock

import pytest
import pytest_asyncio
@@ -619,6 +619,138 @@ def _make_serving_instance_with_reasoning():
    return serving


@pytest.mark.asyncio
async def test_make_request_passes_default_chat_template_kwargs():
    engine_client = MagicMock()
    model_config = MagicMock()
    model_config.max_model_len = 100
    model_config.hf_config.model_type = "test"
    model_config.get_diff_sampling_param.return_value = {}
    engine_client.model_config = model_config
    engine_client.input_processor = MagicMock()
    engine_client.io_processor = MagicMock()
    engine_client.renderer = MagicMock()

    openai_serving_render = MagicMock()
    openai_serving_render.preprocess_chat = AsyncMock(return_value=([], [object()]))

    serving = OpenAIServingResponses(
        engine_client=engine_client,
        models=MagicMock(),
        openai_serving_render=openai_serving_render,
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
        default_chat_template_kwargs={"enable_thinking": False},
    )

    request = ResponsesRequest(
        input="hi",
        tools=[],
        chat_template_kwargs={"enable_thinking": True},
    )

    await serving._make_request(request, None)

    assert openai_serving_render.preprocess_chat.await_count == 1
    assert openai_serving_render.preprocess_chat.await_args.kwargs[
        "default_template_kwargs"
    ] == {"enable_thinking": False}


@pytest.mark.asyncio
async def test_reasoning_parser_receives_merged_chat_template_kwargs():
    serving = _make_serving_instance_with_reasoning()
    serving.default_chat_template_kwargs = {"enable_thinking": False}

    mock_parser = MagicMock()
    mock_parser.count_reasoning_tokens.return_value = 0
    serving.parser = MagicMock()
    serving.parser.reasoning_parser_cls = MagicMock(return_value=mock_parser)
    serving.parser.tool_parser_cls = None

    tokenizer = MagicMock()
    context = SimpleContext()
    completion = CompletionOutput(
        index=0,
        text="final",
        token_ids=[20],
        cumulative_logprob=0.0,
        logprobs=None,
        finish_reason="stop",
        stop_reason=None,
    )
    req_output = RequestOutput(
        request_id="req",
        prompt="hi",
        prompt_token_ids=[7, 8],
        prompt_logprobs=None,
        outputs=[completion],
        finished=True,
        num_cached_tokens=0,
    )
    context.append_output(req_output)

    async def dummy_result_generator():
        yield None

    request = ResponsesRequest(
        input="hi",
        tools=[],
        stream=False,
        chat_template_kwargs={"enable_thinking": True},
    )
    sampling_params = SamplingParams(max_tokens=16)
    metadata = RequestResponseMetadata(request_id="req")

    await serving.responses_full_generator(
        request=request,
        sampling_params=sampling_params,
        result_generator=dummy_result_generator(),
        context=context,
        model_name="test-model",
        tokenizer=tokenizer,
        request_metadata=metadata,
    )

    serving.parser.reasoning_parser_cls.assert_called_once_with(
        tokenizer,
        chat_template_kwargs={"enable_thinking": True},
    )


def test_make_response_output_items_passes_merged_chat_template_kwargs():
    serving = _make_serving_instance_with_reasoning()
    serving.default_chat_template_kwargs = {"enable_thinking": False}

    mock_parser = MagicMock()
    mock_parser.extract_response_outputs.return_value = []
    serving.parser = MagicMock(return_value=mock_parser)

    request = ResponsesRequest(
        input="hi",
        tools=[],
        chat_template_kwargs={"enable_thinking": True},
    )
    final_output = CompletionOutput(
        index=0,
        text="final",
        token_ids=[20],
        cumulative_logprob=0.0,
        logprobs=None,
        finish_reason="stop",
        stop_reason=None,
    )
    tokenizer = MagicMock()

    serving._make_response_output_items(request, final_output, tokenizer)

    serving.parser.assert_called_once_with(
        tokenizer,
        chat_template_kwargs={"enable_thinking": True},
    )


def _identity_increment(event):
"""Simple identity callable for _increment_sequence_number_and_return."""
seq = getattr(_identity_increment, "_counter", 0)
1 change: 1 addition & 0 deletions vllm/entrypoints/openai/generate/api_router.py
@@ -92,6 +92,7 @@ async def init_generate_state(
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            enable_log_outputs=args.enable_log_outputs,
            default_chat_template_kwargs=args.default_chat_template_kwargs,
        )
        if "generate" in supported_tasks
        else None
12 changes: 9 additions & 3 deletions vllm/entrypoints/openai/parser/responses_parser.py
@@ -36,9 +36,10 @@
        self,
        *,
        tokenizer: TokenizerLike,
        reasoning_parser_cls: Callable[[TokenizerLike], ReasoningParser],
        reasoning_parser_cls: Callable[..., ReasoningParser],
        response_messages: list[ResponseInputOutputItem],
        request: ResponsesRequest,
        chat_template_kwargs: dict | None,
        tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
    ):
        self.response_messages: list[ResponseInputOutputItem] = (
@@ -49,7 +50,10 @@ def __init__(
        self.tokenizer = tokenizer
        self.request = request

        self.reasoning_parser_instance = reasoning_parser_cls(tokenizer)
        self.reasoning_parser_instance = reasoning_parser_cls(
            tokenizer,
            chat_template_kwargs=chat_template_kwargs,
        )
        self.tool_parser_instance = None
        if tool_parser_cls is not None:
            self.tool_parser_instance = tool_parser_cls(tokenizer)
@@ -159,9 +163,10 @@ def make_response_output_items_from_parsable_context(
def get_responses_parser_for_simple_context(
    *,
    tokenizer: TokenizerLike,
    reasoning_parser_cls: Callable[[TokenizerLike], ReasoningParser],
    reasoning_parser_cls: Callable[..., ReasoningParser],
    response_messages: list[ResponseInputOutputItem],
    request: ResponsesRequest,
    chat_template_kwargs: dict | None,
    tool_parser_cls,
) -> ResponsesParser:
    """Factory function to create a ResponsesParser with
@@ -175,5 +180,6 @@ def get_responses_parser_for_simple_context(
        reasoning_parser_cls=reasoning_parser_cls,
        response_messages=response_messages,
        request=request,
        chat_template_kwargs=chat_template_kwargs,
        tool_parser_cls=tool_parser_cls,
    )
4 changes: 3 additions & 1 deletion vllm/entrypoints/openai/responses/context.py
@@ -273,8 +273,9 @@ def __init__(
        *,
        response_messages: list[ResponseInputOutputItem],
        tokenizer: TokenizerLike,
        reasoning_parser_cls: Callable[[TokenizerLike], ReasoningParser] | None,
        reasoning_parser_cls: Callable[..., ReasoningParser] | None,
        request: ResponsesRequest,
        chat_template_kwargs: dict[str, Any] | None,
        available_tools: list[str] | None,
        tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
        chat_template: str | None,
@@ -295,6 +296,7 @@ def __init__(
            reasoning_parser_cls=reasoning_parser_cls,
            response_messages=response_messages,
            request=request,
            chat_template_kwargs=chat_template_kwargs,
            tool_parser_cls=tool_parser_cls,
        )
        self.tool_parser_cls = tool_parser_cls
9 changes: 8 additions & 1 deletion vllm/entrypoints/openai/responses/protocol.py
@@ -181,6 +181,13 @@ class ResponsesRequest(OpenAIBaseModel):
"and vLLM will ignore it."
),
)
chat_template_kwargs: dict[str, Any] | None = Field(

Collaborator:
Thanks~ @sidsaha-ai

This is a known issue. The reason we haven’t implemented it so far is that we wanted to wait and see whether OpenAI would introduce a similar field.

Otherwise, introducing these fields would cause the Responses API to overlap with chat completions.

Author:
Cool. Should we then wait and close this PR? Or should I go ahead with rebasing and getting approval?

        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )

    # --8<-- [start:responses-extra-params]
    request_id: str = Field(
@@ -276,7 +283,7 @@ def build_chat_params(
            chat_template=default_template,
            chat_template_content_format=default_template_content_format,
            chat_template_kwargs=merge_kwargs(  # To remove unset values
                {},
                self.chat_template_kwargs,
                dict(
                    add_generation_prompt=not continue_final,
                    continue_final_message=continue_final,