36 changes: 36 additions & 0 deletions docs/features/reasoning_outputs.md
@@ -204,6 +204,42 @@ The reasoning content is also available when both tool calling and the reasoning

For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).

## Server-Level Default Chat Template Kwargs

You can set default `chat_template_kwargs` at the server level using the `--default-chat-template-kwargs` CLI argument. This is useful for configuring reasoning behavior server-wide without requiring clients to set it on every request.

### Disabling Thinking Mode by Default

For models like Qwen3 where thinking is enabled by default, you can disable it server-wide:

```bash
vllm serve Qwen/Qwen3-8B \
--reasoning-parser qwen3 \
--default-chat-template-kwargs '{"enable_thinking": false}'
```

### Enabling Thinking Mode by Default

For models like IBM Granite 3.2 or DeepSeek-V3.1 where thinking is disabled by default, you can enable it server-wide:

```bash
vllm serve ibm-granite/granite-3.2-2b-instruct \
--reasoning-parser granite \
--default-chat-template-kwargs '{"thinking": true}'
```

### Request-Level Override

Request-level `chat_template_kwargs` always take priority over server defaults. For example, if the server is started with `enable_thinking=false`, a client can still enable it for a specific request:

```python
response = client.chat.completions.create(
model=model,
messages=messages,
extra_body={"chat_template_kwargs": {"enable_thinking": True}} # Overrides server default
)
```
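
The example above assumes a standard OpenAI-compatible client already pointed at the vLLM server. A minimal setup sketch (the base URL, API key, model, and message content are illustrative, not prescriptive):

```python
# Minimal client setup assumed by the override example above.
# Adjust base_url/model to match your deployment; vLLM ignores the
# API key by default, so any placeholder works.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
)
model = "Qwen/Qwen3-8B"
messages = [{"role": "user", "content": "What is 2 + 2?"}]
```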

## Limitations

- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
33 changes: 33 additions & 0 deletions tests/entrypoints/openai/test_cli_args.py
@@ -208,3 +208,36 @@ def test_middleware(serve_parser, cli_args, expected_middleware):
"""Ensure multiple middleware args are parsed properly"""
args = serve_parser.parse_args(args=cli_args)
assert args.middleware == expected_middleware


def test_default_chat_template_kwargs_parsing(serve_parser):
"""Ensure default_chat_template_kwargs JSON is parsed correctly"""
args = serve_parser.parse_args(
args=["--default-chat-template-kwargs", '{"enable_thinking": false}']
)
assert args.default_chat_template_kwargs == {"enable_thinking": False}


def test_default_chat_template_kwargs_complex(serve_parser):
"""Ensure complex default_chat_template_kwargs JSON is parsed correctly"""
kwargs_json = '{"enable_thinking": false, "custom_param": "value", "num": 42}'
args = serve_parser.parse_args(args=["--default-chat-template-kwargs", kwargs_json])
assert args.default_chat_template_kwargs == {
"enable_thinking": False,
"custom_param": "value",
"num": 42,
}


def test_default_chat_template_kwargs_default_none(serve_parser):
"""Ensure default_chat_template_kwargs defaults to None"""
args = serve_parser.parse_args(args=[])
assert args.default_chat_template_kwargs is None


def test_default_chat_template_kwargs_invalid_json(serve_parser):
"""Ensure invalid JSON raises an error"""
with pytest.raises(SystemExit):
serve_parser.parse_args(
args=["--default-chat-template-kwargs", "not valid json"]
)
1 change: 1 addition & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -1079,6 +1079,7 @@ async def init_app_state(
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
default_chat_template_kwargs=args.default_chat_template_kwargs,
trust_request_chat_template=args.trust_request_chat_template,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
11 changes: 10 additions & 1 deletion vllm/entrypoints/openai/cli_args.py
@@ -11,7 +11,7 @@
import ssl
from collections.abc import Sequence
from dataclasses import field
-from typing import Literal
+from typing import Any, Literal

from pydantic.dataclasses import dataclass

@@ -114,6 +114,12 @@ class FrontendArgs:
"""Whether to trust the chat template provided in the request. If False,
the server will always use the chat template specified by `--chat-template`
or the ones from tokenizer."""
default_chat_template_kwargs: dict[str, Any] | None = None
"""Default keyword arguments to pass to the chat template renderer.
These will be merged with request-level chat_template_kwargs,
with request values taking precedence. Useful for setting default
behavior for reasoning models. Example: '{"enable_thinking": false}'
to disable thinking mode by default for Qwen3/DeepSeek models."""
response_role: str = "assistant"
"""The role name to return if `request.add_generation_prompt=true`."""
ssl_keyfile: str | None = None
@@ -211,6 +217,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
del frontend_kwargs["allowed_methods"]["nargs"]
del frontend_kwargs["allowed_headers"]["nargs"]

# Special case: default_chat_template_kwargs needs json.loads type
frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads

# Special case: LoRA modules need custom parser action and
# optional_type(str)
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
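
As an aside, because `default_chat_template_kwargs` gets `type=json.loads`, the flag's value is decoded by `argparse` itself. A standalone sketch of that behavior, using a bare `argparse.ArgumentParser` for illustration rather than vLLM's actual `FlexibleArgumentParser`:

```python
import argparse
import json

# Illustrative parser only; vLLM builds its flags from FrontendArgs.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--default-chat-template-kwargs", type=json.loads, default=None
)

args = parser.parse_args(
    ["--default-chat-template-kwargs", '{"enable_thinking": false}']
)
assert args.default_chat_template_kwargs == {"enable_thinking": False}

# Invalid JSON raises json.JSONDecodeError (a ValueError subclass), which
# argparse turns into an argument error and a SystemExit -- the behavior
# exercised by test_default_chat_template_kwargs_invalid_json above.
```
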
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/run_batch.py
@@ -468,6 +468,9 @@ async def run_batch(
reasoning_parser=args.structured_outputs_config.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
default_chat_template_kwargs=getattr(
args, "default_chat_template_kwargs", None
),
)
if "generate" in supported_tasks
else None
6 changes: 5 additions & 1 deletion vllm/entrypoints/openai/serving_chat.py
@@ -6,7 +6,7 @@
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
-from typing import Final
+from typing import Any, Final

import jinja2
import partial_json_parser
@@ -102,6 +102,7 @@ def __init__(
enable_force_include_usage: bool = False,
enable_log_outputs: bool = False,
log_error_stack: bool = False,
default_chat_template_kwargs: dict[str, Any] | None = None,
) -> None:
super().__init__(
engine_client=engine_client,
@@ -115,6 +116,7 @@ def __init__(
self.chat_template = chat_template
self.chat_template_content_format: Final = chat_template_content_format
self.trust_request_chat_template = trust_request_chat_template
self.default_chat_template_kwargs = default_chat_template_kwargs or {}
self.enable_log_outputs = enable_log_outputs

# set up logits processors
Expand Down Expand Up @@ -203,6 +205,7 @@ async def warmup(self) -> None:
tool_dicts=None,
documents=None,
chat_template_kwargs=None,
default_chat_template_kwargs=self.default_chat_template_kwargs,
tool_parser=None,
add_special_tokens=False,
)
Expand Down Expand Up @@ -310,6 +313,7 @@ async def create_chat_completion(
tool_dicts=tool_dicts,
documents=request.documents,
chat_template_kwargs=request.chat_template_kwargs,
default_chat_template_kwargs=self.default_chat_template_kwargs,
tool_parser=tool_parser,
add_special_tokens=request.add_special_tokens,
)
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/serving_engine.py
@@ -1158,6 +1158,7 @@ async def _preprocess_chat(
tool_dicts: list[dict[str, Any]] | None = None,
documents: list[dict[str, str]] | None = None,
chat_template_kwargs: dict[str, Any] | None = None,
default_chat_template_kwargs: dict[str, Any] | None = None,
tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
add_special_tokens: bool = False,
) -> tuple[list[ConversationMessage], list[TokensPrompt]]:
@@ -1183,6 +1184,8 @@
tools=tool_dicts,
documents=documents,
)
if default_chat_template_kwargs:
_chat_template_kwargs.update(default_chat_template_kwargs)
_chat_template_kwargs.update(chat_template_kwargs or {})

request_prompt: str | list[int]
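
Taken together, the two `update()` calls above give a three-level precedence: kwargs built by the server (chat template, `add_generation_prompt`, tools, documents), then server-level defaults from `--default-chat-template-kwargs`, then request-level `chat_template_kwargs`. A minimal sketch of that merge order (the base keys shown are illustrative):

```python
# Later update() calls win, so request kwargs override server defaults,
# which in turn override the server-built base kwargs.
base_kwargs = {"add_generation_prompt": True}       # built by the server
server_defaults = {"enable_thinking": False}        # --default-chat-template-kwargs
request_kwargs = {"enable_thinking": True}          # request chat_template_kwargs

merged = dict(base_kwargs)
if server_defaults:
    merged.update(server_defaults)
merged.update(request_kwargs or {})

assert merged == {"add_generation_prompt": True, "enable_thinking": True}
```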