From 58cd56e5eb80abfa282d8f3f56f2711e956e5b6c Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Thu, 25 Dec 2025 19:04:43 +0900 Subject: [PATCH 1/4] feat(frontend): add --default-chat-template-kwargs CLI argument Add server-level default chat_template_kwargs to control reasoning model behavior at deployment time. Request-level kwargs override these defaults. Fixes #28070 Signed-off-by: effortprogrammer --- tests/entrypoints/openai/test_cli_args.py | 35 +++++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 11 ++++++- vllm/entrypoints/openai/run_batch.py | 3 ++ vllm/entrypoints/openai/serving_chat.py | 6 +++- vllm/entrypoints/openai/serving_engine.py | 3 ++ 6 files changed, 57 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index b5d71c20bb4e..9637e8a441ef 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -208,3 +208,38 @@ def test_middleware(serve_parser, cli_args, expected_middleware): """Ensure multiple middleware args are parsed properly""" args = serve_parser.parse_args(args=cli_args) assert args.middleware == expected_middleware + + +def test_default_chat_template_kwargs_parsing(serve_parser): + """Ensure default_chat_template_kwargs JSON is parsed correctly""" + args = serve_parser.parse_args( + args=["--default-chat-template-kwargs", '{"enable_thinking": false}'] + ) + assert args.default_chat_template_kwargs == {"enable_thinking": False} + + +def test_default_chat_template_kwargs_complex(serve_parser): + """Ensure complex default_chat_template_kwargs JSON is parsed correctly""" + kwargs_json = '{"enable_thinking": false, "custom_param": "value", "num": 42}' + args = serve_parser.parse_args( + args=["--default-chat-template-kwargs", kwargs_json] + ) + assert args.default_chat_template_kwargs == { + "enable_thinking": False, + "custom_param": "value", + "num": 42, + } + 
+ +def test_default_chat_template_kwargs_default_none(serve_parser): + """Ensure default_chat_template_kwargs defaults to None""" + args = serve_parser.parse_args(args=[]) + assert args.default_chat_template_kwargs is None + + +def test_default_chat_template_kwargs_invalid_json(serve_parser): + """Ensure invalid JSON raises an error""" + with pytest.raises(SystemExit): + serve_parser.parse_args( + args=["--default-chat-template-kwargs", "not valid json"] + ) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bc8855a76e2a..bb3d3c818eb2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1089,6 +1089,7 @@ async def init_app_state( enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, log_error_stack=args.log_error_stack, + default_chat_template_kwargs=args.default_chat_template_kwargs, ) if "generate" in supported_tasks else None diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a8eef76cd8ae..7b6cbcb3d081 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -11,7 +11,7 @@ import ssl from collections.abc import Sequence from dataclasses import field -from typing import Literal +from typing import Any, Literal from pydantic.dataclasses import dataclass @@ -114,6 +114,12 @@ class FrontendArgs: """Whether to trust the chat template provided in the request. If False, the server will always use the chat template specified by `--chat-template` or the ones from tokenizer.""" + default_chat_template_kwargs: dict[str, Any] | None = None + """Default keyword arguments to pass to the chat template renderer. + These will be merged with request-level chat_template_kwargs, + with request values taking precedence. Useful for setting default + behavior for reasoning models. 
Example: '{"enable_thinking": false}' + to disable thinking mode by default for Qwen3 models.""" response_role: str = "assistant" """The role name to return if `request.add_generation_prompt=true`.""" ssl_keyfile: str | None = None @@ -211,6 +217,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: del frontend_kwargs["allowed_methods"]["nargs"] del frontend_kwargs["allowed_headers"]["nargs"] + # Special case: default_chat_template_kwargs needs json.loads type + frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads + # Special case: LoRA modules need custom parser action and # optional_type(str) frontend_kwargs["lora_modules"]["type"] = optional_type(str) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 2cdb6a6f8eea..6bb6d0f3f97b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -468,6 +468,9 @@ async def run_batch( reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + default_chat_template_kwargs=getattr( + args, "default_chat_template_kwargs", None + ), ) if "generate" in supported_tasks else None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index da7df1e5bf4a..32a3cf04951e 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -6,7 +6,7 @@ import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence -from typing import Final +from typing import Any, Final import jinja2 import partial_json_parser @@ -102,6 +102,7 @@ def __init__( enable_force_include_usage: bool = False, enable_log_outputs: bool = False, log_error_stack: bool = False, + default_chat_template_kwargs: dict[str, Any] | None = None, ) -> None: super().__init__( 
engine_client=engine_client, @@ -115,6 +116,7 @@ def __init__( self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format self.trust_request_chat_template = trust_request_chat_template + self.default_chat_template_kwargs = default_chat_template_kwargs or {} self.enable_log_outputs = enable_log_outputs # set up logits processors @@ -203,6 +205,7 @@ async def warmup(self) -> None: tool_dicts=None, documents=None, chat_template_kwargs=None, + default_chat_template_kwargs=self.default_chat_template_kwargs, tool_parser=None, add_special_tokens=False, ) @@ -310,6 +313,7 @@ async def create_chat_completion( tool_dicts=tool_dicts, documents=request.documents, chat_template_kwargs=request.chat_template_kwargs, + default_chat_template_kwargs=self.default_chat_template_kwargs, tool_parser=tool_parser, add_special_tokens=request.add_special_tokens, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 5ea2a7a57265..f1c4ab63f05b 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1158,6 +1158,7 @@ async def _preprocess_chat( tool_dicts: list[dict[str, Any]] | None = None, documents: list[dict[str, str]] | None = None, chat_template_kwargs: dict[str, Any] | None = None, + default_chat_template_kwargs: dict[str, Any] | None = None, tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, add_special_tokens: bool = False, ) -> tuple[list[ConversationMessage], list[TokensPrompt]]: @@ -1183,6 +1184,8 @@ async def _preprocess_chat( tools=tool_dicts, documents=documents, ) + if default_chat_template_kwargs: + _chat_template_kwargs.update(default_chat_template_kwargs) _chat_template_kwargs.update(chat_template_kwargs or {}) request_prompt: str | list[int] From 483f916d0b65dbe3b87aa1abaa3a0b08d9c497e4 Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Thu, 25 Dec 2025 20:38:05 +0900 Subject: [PATCH 2/4] chore: check 
pre-commit Signed-off-by: effortprogrammer --- tests/entrypoints/openai/test_cli_args.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 9637e8a441ef..0d7e6ae37d1e 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -221,9 +221,7 @@ def test_default_chat_template_kwargs_parsing(serve_parser): def test_default_chat_template_kwargs_complex(serve_parser): """Ensure complex default_chat_template_kwargs JSON is parsed correctly""" kwargs_json = '{"enable_thinking": false, "custom_param": "value", "num": 42}' - args = serve_parser.parse_args( - args=["--default-chat-template-kwargs", kwargs_json] - ) + args = serve_parser.parse_args(args=["--default-chat-template-kwargs", kwargs_json]) assert args.default_chat_template_kwargs == { "enable_thinking": False, "custom_param": "value", From f260529b34491f7d1664b4104ec40c96fd3a582b Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Fri, 26 Dec 2025 22:37:28 +0900 Subject: [PATCH 3/4] refactor: move default_chat_template_kwargs to group with chat template args Signed-off-by: effortprogrammer --- vllm/entrypoints/openai/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bb3d3c818eb2..c165fee4c627 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1079,6 +1079,7 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + default_chat_template_kwargs=args.default_chat_template_kwargs, trust_request_chat_template=args.trust_request_chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, @@ -1089,7 +1090,6 @@ async def init_app_state( 
enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, log_error_stack=args.log_error_stack, - default_chat_template_kwargs=args.default_chat_template_kwargs, ) if "generate" in supported_tasks else None From dda72c86effea308dc7f83665b34aec43d7f52a3 Mon Sep 17 00:00:00 2001 From: effortprogrammer Date: Mon, 29 Dec 2025 22:49:25 +0900 Subject: [PATCH 4/4] add: use case for reasoning_outputs.md Signed-off-by: effortprogrammer --- docs/features/reasoning_outputs.md | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 93cca23856a9..107d1d2b5bce 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -204,6 +204,42 @@ The reasoning content is also available when both tool calling and the reasoning For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py). +## Server-Level Default Chat Template Kwargs + +You can set default `chat_template_kwargs` at the server level using the `--default-chat-template-kwargs` CLI argument. This is useful for configuring reasoning behavior across all requests without requiring clients to specify it in each request. 
+ +### Disabling Thinking Mode by Default + +For models like Qwen3 where thinking is enabled by default, you can disable it server-wide: + +```bash +vllm serve Qwen/Qwen3-8B \ + --reasoning-parser qwen3 \ + --default-chat-template-kwargs '{"enable_thinking": false}' +``` + +### Enabling Thinking Mode by Default + +For models like IBM Granite 3.2 or DeepSeek-V3.1 where thinking is disabled by default, you can enable it server-wide: + +```bash +vllm serve ibm-granite/granite-3.2-2b-instruct \ + --reasoning-parser granite \ + --default-chat-template-kwargs '{"thinking": true}' +``` + +### Request-Level Override + +Request-level `chat_template_kwargs` always take priority over server defaults. For example, if the server is started with `enable_thinking=false`, a client can still enable it for a specific request: + +```python +response = client.chat.completions.create( + model=model, + messages=messages, + extra_body={"chat_template_kwargs": {"enable_thinking": True}} # Overrides server default +) +``` + ## Limitations - The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).