From 58cd56e5eb80abfa282d8f3f56f2711e956e5b6c Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Thu, 25 Dec 2025 19:04:43 +0900 Subject: [PATCH 1/4] feat(frontend): add --default-chat-template-kwargs CLI argument Add server-level default chat_template_kwargs to control reasoning model behavior at deployment time. Request-level kwargs override these defaults. Fixes #28070 Signed-off-by: effortprogrammer --- tests/entrypoints/openai/test_cli_args.py | 35 +++++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 11 ++++++- vllm/entrypoints/openai/run_batch.py | 3 ++ vllm/entrypoints/openai/serving_chat.py | 6 +++- vllm/entrypoints/openai/serving_engine.py | 3 ++ 6 files changed, 57 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index b5d71c20bb4e..9637e8a441ef 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -208,3 +208,38 @@ def test_middleware(serve_parser, cli_args, expected_middleware): """Ensure multiple middleware args are parsed properly""" args = serve_parser.parse_args(args=cli_args) assert args.middleware == expected_middleware + + +def test_default_chat_template_kwargs_parsing(serve_parser): + """Ensure default_chat_template_kwargs JSON is parsed correctly""" + args = serve_parser.parse_args( + args=["--default-chat-template-kwargs", '{"enable_thinking": false}'] + ) + assert args.default_chat_template_kwargs == {"enable_thinking": False} + + +def test_default_chat_template_kwargs_complex(serve_parser): + """Ensure complex default_chat_template_kwargs JSON is parsed correctly""" + kwargs_json = '{"enable_thinking": false, "custom_param": "value", "num": 42}' + args = serve_parser.parse_args( + args=["--default-chat-template-kwargs", kwargs_json] + ) + assert args.default_chat_template_kwargs == { + "enable_thinking": False, + "custom_param": "value", + "num": 42, + } + 
+ +def test_default_chat_template_kwargs_default_none(serve_parser): + """Ensure default_chat_template_kwargs defaults to None""" + args = serve_parser.parse_args(args=[]) + assert args.default_chat_template_kwargs is None + + +def test_default_chat_template_kwargs_invalid_json(serve_parser): + """Ensure invalid JSON raises an error""" + with pytest.raises(SystemExit): + serve_parser.parse_args( + args=["--default-chat-template-kwargs", "not valid json"] + ) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bc8855a76e2a..bb3d3c818eb2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1089,6 +1089,7 @@ async def init_app_state( enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, log_error_stack=args.log_error_stack, + default_chat_template_kwargs=args.default_chat_template_kwargs, ) if "generate" in supported_tasks else None diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a8eef76cd8ae..7b6cbcb3d081 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -11,7 +11,7 @@ import ssl from collections.abc import Sequence from dataclasses import field -from typing import Literal +from typing import Any, Literal from pydantic.dataclasses import dataclass @@ -114,6 +114,12 @@ class FrontendArgs: """Whether to trust the chat template provided in the request. If False, the server will always use the chat template specified by `--chat-template` or the ones from tokenizer.""" + default_chat_template_kwargs: dict[str, Any] | None = None + """Default keyword arguments to pass to the chat template renderer. + These will be merged with request-level chat_template_kwargs, + with request values taking precedence. Useful for setting default + behavior for reasoning models. 
Example: '{"enable_thinking": false}' + to disable thinking mode by default for Qwen3 models.""" response_role: str = "assistant" """The role name to return if `request.add_generation_prompt=true`.""" ssl_keyfile: str | None = None @@ -211,6 +217,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: del frontend_kwargs["allowed_methods"]["nargs"] del frontend_kwargs["allowed_headers"]["nargs"] + # Special case: default_chat_template_kwargs needs json.loads type + frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads + # Special case: LoRA modules need custom parser action and # optional_type(str) frontend_kwargs["lora_modules"]["type"] = optional_type(str) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 2cdb6a6f8eea..6bb6d0f3f97b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -468,6 +468,9 @@ async def run_batch( reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + default_chat_template_kwargs=getattr( + args, "default_chat_template_kwargs", None + ), ) if "generate" in supported_tasks else None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index da7df1e5bf4a..32a3cf04951e 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -6,7 +6,7 @@ import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence -from typing import Final +from typing import Any, Final import jinja2 import partial_json_parser @@ -102,6 +102,7 @@ def __init__( enable_force_include_usage: bool = False, enable_log_outputs: bool = False, log_error_stack: bool = False, + default_chat_template_kwargs: dict[str, Any] | None = None, ) -> None: super().__init__( 
engine_client=engine_client, @@ -115,6 +116,7 @@ def __init__( self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format self.trust_request_chat_template = trust_request_chat_template + self.default_chat_template_kwargs = default_chat_template_kwargs or {} self.enable_log_outputs = enable_log_outputs # set up logits processors @@ -203,6 +205,7 @@ async def warmup(self) -> None: tool_dicts=None, documents=None, chat_template_kwargs=None, + default_chat_template_kwargs=self.default_chat_template_kwargs, tool_parser=None, add_special_tokens=False, ) @@ -310,6 +313,7 @@ async def create_chat_completion( tool_dicts=tool_dicts, documents=request.documents, chat_template_kwargs=request.chat_template_kwargs, + default_chat_template_kwargs=self.default_chat_template_kwargs, tool_parser=tool_parser, add_special_tokens=request.add_special_tokens, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 5ea2a7a57265..f1c4ab63f05b 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1158,6 +1158,7 @@ async def _preprocess_chat( tool_dicts: list[dict[str, Any]] | None = None, documents: list[dict[str, str]] | None = None, chat_template_kwargs: dict[str, Any] | None = None, + default_chat_template_kwargs: dict[str, Any] | None = None, tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, add_special_tokens: bool = False, ) -> tuple[list[ConversationMessage], list[TokensPrompt]]: @@ -1183,6 +1184,8 @@ async def _preprocess_chat( tools=tool_dicts, documents=documents, ) + if default_chat_template_kwargs: + _chat_template_kwargs.update(default_chat_template_kwargs) _chat_template_kwargs.update(chat_template_kwargs or {}) request_prompt: str | list[int] From 483f916d0b65dbe3b87aa1abaa3a0b08d9c497e4 Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Thu, 25 Dec 2025 20:38:05 +0900 Subject: [PATCH 2/4] chore: check 
pre-commit Signed-off-by: effortprogrammer --- tests/entrypoints/openai/test_cli_args.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 9637e8a441ef..0d7e6ae37d1e 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -221,9 +221,7 @@ def test_default_chat_template_kwargs_parsing(serve_parser): def test_default_chat_template_kwargs_complex(serve_parser): """Ensure complex default_chat_template_kwargs JSON is parsed correctly""" kwargs_json = '{"enable_thinking": false, "custom_param": "value", "num": 42}' - args = serve_parser.parse_args( - args=["--default-chat-template-kwargs", kwargs_json] - ) + args = serve_parser.parse_args(args=["--default-chat-template-kwargs", kwargs_json]) assert args.default_chat_template_kwargs == { "enable_thinking": False, "custom_param": "value", From f260529b34491f7d1664b4104ec40c96fd3a582b Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Fri, 26 Dec 2025 22:37:28 +0900 Subject: [PATCH 3/4] refactor: move default_chat_template_kwargs to group with chat template args Signed-off-by: effortprogrammer --- vllm/entrypoints/openai/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bb3d3c818eb2..c165fee4c627 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1079,6 +1079,7 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + default_chat_template_kwargs=args.default_chat_template_kwargs, trust_request_chat_template=args.trust_request_chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, @@ -1089,7 +1090,6 @@ async def init_app_state( 
enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, log_error_stack=args.log_error_stack, - default_chat_template_kwargs=args.default_chat_template_kwargs, ) if "generate" in supported_tasks else None From dda72c86effea308dc7f83665b34aec43d7f52a3 Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Mon, 29 Dec 2025 22:49:25 +0900 Subject: [PATCH 4/4] add: use case for reasoning_outputs.md Signed-off-by: effortprogrammer --- docs/features/reasoning_outputs.md | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 93cca23856a9..107d1d2b5bce 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -204,6 +204,42 @@ The reasoning content is also available when both tool calling and the reasoning For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py). +## Server-Level Default Chat Template Kwargs + +You can set default `chat_template_kwargs` at the server level using the `--default-chat-template-kwargs` CLI argument. This is useful for configuring reasoning behavior across all requests without requiring clients to specify it in each request. 
+ +### Disabling Thinking Mode by Default + +For models like Qwen3 where thinking is enabled by default, you can disable it server-wide: + +```bash +vllm serve Qwen/Qwen3-8B \ + --reasoning-parser qwen3 \ + --default-chat-template-kwargs '{"enable_thinking": false}' +``` + +### Enabling Thinking Mode by Default + +For models like IBM Granite 3.2 or DeepSeek-V3.1 where thinking is disabled by default, you can enable it server-wide: + +```bash +vllm serve ibm-granite/granite-3.2-2b-instruct \ + --reasoning-parser granite \ + --default-chat-template-kwargs '{"thinking": true}' +``` + +### Request-Level Override + +Request-level `chat_template_kwargs` always take priority over server defaults. For example, if the server is started with `enable_thinking=false`, a client can still enable it for a specific request: + +```python +response = client.chat.completions.create( + model=model, + messages=messages, + extra_body={"chat_template_kwargs": {"enable_thinking": True}} # Overrides server default +) +``` + ## Limitations - The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).