From 0a9665d4a9300618e5f490b19c342fbc2fefa97f Mon Sep 17 00:00:00 2001
From: vzed <207368749+vincentzed@users.noreply.github.com>
Date: Wed, 23 Apr 2025 11:33:31 -0400
Subject: [PATCH 1/6] Support chat_template_kwargs like vLLM

---
 python/sglang/srt/openai_api/adapter.py  | 11 ++++++++++-
 python/sglang/srt/openai_api/protocol.py |  3 ++-
 python/sglang/srt/server_args.py         | 25 +++++++++++++++++++++++-
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 040548ad085..d1e78783756 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -994,12 +994,21 @@ def v1_chat_generate_request(
         else:
             assistant_prefix = None
 
+        request_kwargs = request.chat_template_kwargs  # Get from request
+        if request_kwargs is not None:
+            kwargs_to_pass = request_kwargs  # Use request kwargs if they exist (overrides global)
+        else:
+            kwargs_to_pass = (
+                tokenizer_manager.server_args.chat_template_kwargs or {}
+            )
+
         try:
             prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
-                openai_compatible_messages,
+                conversation=openai_compatible_messages,
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                **kwargs_to_pass,
             )
         except:
             # This except branch will be triggered when the chosen model
diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py
index 33644dd11be..2ecdbf80e01 100644
--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -14,7 +14,7 @@
 """Pydantic models for OpenAI API protocol"""
 
 import time
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field, root_validator
 from typing_extensions import Literal
@@ -335,6 +335,7 @@ class ChatCompletionRequest(BaseModel):
     tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
         default="auto", examples=["none"]
     )  # noqa
+    chat_template_kwargs: Optional[Dict[str, Any]] = None
 
     @root_validator(pre=True)
    def set_tool_choice_default(cls, values):
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 787cbad0e5f..105c0319893 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -20,7 +20,7 @@
 import os
 import random
 import tempfile
-from typing import List, Literal, Optional
+from typing import List, Literal, Optional, Dict, Any
 
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
@@ -57,6 +57,8 @@ class ServerArgs:
     device: Optional[str] = None
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
+    chat_template_kwargs_json: Optional[str] = None
+    chat_template_kwargs: Optional[Dict[str, Any]] = None  # Any because the value is parsed from user-provided JSON; vLLM uses the same typing for the equivalent server argument.
     completion_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
@@ -198,6 +200,19 @@ class ServerArgs:
     disaggregation_ib_device: Optional[str] = None
 
     def __post_init__(self):
+        # Parse chat_template_kwargs_json
+        if self.chat_template_kwargs_json:
+            try:
+                self.chat_template_kwargs = json.loads(self.chat_template_kwargs_json)
+            except json.JSONDecodeError as e:
+                logger.error(
+                    f"Error parsing --chat-template-kwargs JSON: {e}. "
" + f"Input string: '{self.chat_template_kwargs_json}'" + ) + raise ValueError( + "Invalid JSON string provided for --chat-template-kwargs" + ) from e + # Expert parallelism if self.enable_ep_moe: self.ep_size = self.tp_size @@ -237,6 +252,7 @@ def __post_init__(self): # FIXME: more fine grained auto-selection polices self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem + # Set chunked prefill size, which depends on the gpu memory capacity if self.chunked_prefill_size is None: if gpu_mem is not None and gpu_mem < 25_000: @@ -542,6 +558,13 @@ def add_cli_args(parser: argparse.ArgumentParser): default=ServerArgs.chat_template, help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.", ) + parser.add_argument( + "--chat-template-kwargs", + type=str, + dest="chat_template_kwargs_json", # Store raw JSON string here + default=ServerArgs.chat_template_kwargs_json, + help="Additional kwargs to pass to the template renderer (Jinja only), specified as a JSON string. Will be accessible by the chat template.", + ) parser.add_argument( "--completion-template", type=str, From e706580a4887adc2bc43e84205f11e7acfbb8bd3 Mon Sep 17 00:00:00 2001 From: vzed <207368749+vincentzed@users.noreply.github.com> Date: Wed, 23 Apr 2025 11:33:49 -0400 Subject: [PATCH 2/6] support the chat_template_kwargs for documentation --- docs/backend/custom_chat_template.md | 43 ++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/docs/backend/custom_chat_template.md b/docs/backend/custom_chat_template.md index 557af5bf5f7..ce403ac96fd 100644 --- a/docs/backend/custom_chat_template.md +++ b/docs/backend/custom_chat_template.md @@ -40,3 +40,46 @@ You can also use the [Jinja template format](https://huggingface.co/docs/transfo ```bash python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.jinja ``` + +### Passing Arguments to Jinja Templates + +When using a Jinja-based chat template (either the default template embedded in the tokenizer or a custom template loaded via `--chat-template path/to/template.jinja`), you can pass additional keyword arguments to the template renderer context. + +This is done using the `chat_template_kwargs` parameter in the `/v1/chat/completions` request body. This parameter accepts a JSON object (dictionary) where keys are the argument names and values are the corresponding values you want to make available within your Jinja template. + +**Example Request:** + +```json +{ + "model": "meta-llama/Llama-3-8B-Instruct", + "messages": [ + {"role": "user", "content": "Hello!"} + ], + "chat_template_kwargs": { + "my_custom_arg": "some_value", + "another_arg": 123 + } +} +``` + +**Example Jinja Template (`my_template.jinja`):** + +```jinja +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'] + ' [/INST]' }} + {% elif message['role'] == 'assistant' %} + {{ ' ' + message['content'] + eos_token }} + {% endif %} +{% endfor %} + +{# Accessing custom arguments #} +Custom Arg: {{ my_custom_arg }} +Another Arg: {{ another_arg }} +``` + +**Important Notes:** + +* The `chat_template_kwargs` parameter in the request **only** works with Jinja-based templates. It has no effect when using legacy JSON-based templates (loaded via `--chat-template template_name` or `--chat-template path/to/template.json`). 

From 1202628e44607d187364358e1acc23adcba40554 Mon Sep 17 00:00:00 2001
From: vzed <207368749+vincentzed@users.noreply.github.com>
Date: Wed, 23 Apr 2025 15:41:38 -0400
Subject: [PATCH 3/6] Run black on server_args.py to resolve failing CI

---
 python/sglang/srt/server_args.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 105c0319893..2e2a429b5da 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -58,7 +58,9 @@ class ServerArgs:
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
     chat_template_kwargs_json: Optional[str] = None
-    chat_template_kwargs: Optional[Dict[str, Any]] = None  # Any because the value is parsed from user-provided JSON; vLLM uses the same typing for the equivalent server argument.
+    chat_template_kwargs: Optional[Dict[str, Any]] = (
+        None  # Any because the value is parsed from user-provided JSON; vLLM uses the same typing for the equivalent server argument.
+    )
     completion_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
@@ -252,7 +254,6 @@ def __post_init__(self):
         # FIXME: more fine-grained auto-selection policies
         self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
-
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
             if gpu_mem is not None and gpu_mem < 25_000:

From 57192b13ebe729e2c2b0f2688c909a6c67886d34 Mon Sep 17 00:00:00 2001
From: vzed <207368749+vincentzed@users.noreply.github.com>
Date: Thu, 24 Apr 2025 21:03:43 -0400
Subject: [PATCH 4/6] Sort imports with isort

---
 python/sglang/srt/server_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 2e2a429b5da..bddf8949a9b 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -20,7 +20,7 @@
 import os
 import random
 import tempfile
-from typing import List, Literal, Optional, Dict, Any
+from typing import Any, Dict, List, Literal, Optional
 
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser

From 8b6e9888b4812fc74f836fa543fa53367f34fbc4 Mon Sep 17 00:00:00 2001
From: vzed <207368749+vincentzed@users.noreply.github.com>
Date: Wed, 30 Apr 2025 22:33:23 -0400
Subject: [PATCH 5/6] Remove the server-side implementation

---
 docs/backend/custom_chat_template.md     | 43 ------------------------
 python/sglang/srt/openai_api/adapter.py  | 10 +-----
 python/sglang/srt/openai_api/protocol.py |  3 +-
 python/sglang/srt/server_args.py         | 26 +--------------
 4 files changed, 3 insertions(+), 79 deletions(-)

diff --git a/docs/backend/custom_chat_template.md b/docs/backend/custom_chat_template.md
index ce403ac96fd..557af5bf5f7 100644
--- a/docs/backend/custom_chat_template.md
+++ b/docs/backend/custom_chat_template.md
@@ -40,46 +40,3 @@ You can also use the [Jinja template format](https://huggingface.co/docs/transfo
 ```bash
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.jinja
 ```
-
-### Passing Arguments to Jinja Templates
-
-When using a Jinja-based chat template (either the default template embedded in the tokenizer or a custom template loaded via `--chat-template path/to/template.jinja`), you can pass additional keyword arguments into the template rendering context.
-
-This is done with the `chat_template_kwargs` parameter in the `/v1/chat/completions` request body. It accepts a JSON object (dictionary) whose keys are argument names and whose values become available under those names inside your Jinja template.
-
-**Example Request:**
-
-```json
-{
-  "model": "meta-llama/Llama-3-8B-Instruct",
-  "messages": [
-    {"role": "user", "content": "Hello!"}
-  ],
-  "chat_template_kwargs": {
-    "my_custom_arg": "some_value",
-    "another_arg": 123
-  }
-}
-```
-
-**Example Jinja Template (`my_template.jinja`):**
-
-```jinja
-{% for message in messages %}
-    {% if message['role'] == 'user' %}
-        {{ bos_token + '[INST] ' + message['content'] + ' [/INST]' }}
-    {% elif message['role'] == 'assistant' %}
-        {{ ' ' + message['content'] + eos_token }}
-    {% endif %}
-{% endfor %}
-
-{# Accessing custom arguments #}
-Custom Arg: {{ my_custom_arg }}
-Another Arg: {{ another_arg }}
-```
-
-**Important Notes:**
-
-* The `chat_template_kwargs` parameter in the request **only** works with Jinja-based templates. It has no effect when using legacy JSON-based templates (loaded via `--chat-template template_name` or `--chat-template path/to/template.json`).
-* You can also set *global* default arguments using the `--chat-template-kwargs` server launch flag, which accepts a JSON string (e.g., `--chat-template-kwargs '{"global_arg": true}'`).
-* If `chat_template_kwargs` is provided in a specific request, it **completely overrides** any global arguments set via the server flag for that request. If `chat_template_kwargs` is *not* provided in the request, the global arguments (if set) are used. An end-to-end example is shown below.
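-
-**Example: Global Default with Per-Request Override**
-
-As an end-to-end sketch (assuming a server listening on the default port 30000), launch with a global default and then override it for a single request:
-
-```bash
-# Launch with a global default that is visible to every template render
-python -m sglang.launch_server --model-path meta-llama/Llama-3-8B-Instruct \
-  --port 30000 --chat-template-kwargs '{"my_custom_arg": "default_value"}'
-
-# Override the global default for this request only
-curl http://localhost:30000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "meta-llama/Llama-3-8B-Instruct",
-    "messages": [{"role": "user", "content": "Hello!"}],
-    "chat_template_kwargs": {"my_custom_arg": "request_value"}
-  }'
-```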
diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 47492ad1502..adf2cebc876 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1002,17 +1002,9 @@ def v1_chat_generate_request(
         else:
             assistant_prefix = None
 
-        request_kwargs = request.chat_template_kwargs  # Get from request
-        if request_kwargs is not None:
-            kwargs_to_pass = request_kwargs  # Use request kwargs if they exist (overrides global)
-        else:
-            kwargs_to_pass = (
-                tokenizer_manager.server_args.chat_template_kwargs or {}
-            )
-
         try:
             prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
-                conversation=openai_compatible_messages,
+                openai_compatible_messages,
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
-                **kwargs_to_pass,
             )
diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py
index 770c9480b5b..88d8873d189 100644
--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -14,7 +14,7 @@
 """Pydantic models for OpenAI API protocol"""
 
 import time
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field, root_validator
 from typing_extensions import Literal
@@ -344,7 +344,6 @@ class ChatCompletionRequest(BaseModel):
     tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
         default="auto", examples=["none"]
     )  # noqa
-    chat_template_kwargs: Optional[Dict[str, Any]] = None
 
     @root_validator(pre=True)
     def set_tool_choice_default(cls, values):
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index ccbe293f82a..1d7c2aa1ab0 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -20,7 +20,7 @@
 import os
 import random
 import tempfile
-from typing import Any, Dict, List, Literal, Optional
+from typing import List, Literal, Optional
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
@@ -57,10 +57,6 @@ class ServerArgs:
     device: Optional[str] = None
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
-    chat_template_kwargs_json: Optional[str] = None
-    chat_template_kwargs: Optional[Dict[str, Any]] = (
-        None  # Any because the value is parsed from user-provided JSON; vLLM uses the same typing for the equivalent server argument.
-    )
     completion_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
@@ -204,19 +200,6 @@ def __post_init__(self):
-        # Parse chat_template_kwargs_json
-        if self.chat_template_kwargs_json:
-            try:
-                self.chat_template_kwargs = json.loads(self.chat_template_kwargs_json)
-            except json.JSONDecodeError as e:
-                logger.error(
-                    f"Error parsing --chat-template-kwargs JSON: {e}. "
-                    f"Input string: '{self.chat_template_kwargs_json}'"
-                )
-                raise ValueError(
-                    "Invalid JSON string provided for --chat-template-kwargs"
-                ) from e
-
         # Expert parallelism
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
@@ -582,13 +565,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
             default=ServerArgs.chat_template,
             help="The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
         )
-        parser.add_argument(
-            "--chat-template-kwargs",
-            type=str,
-            dest="chat_template_kwargs_json",  # Store raw JSON string here
-            default=ServerArgs.chat_template_kwargs_json,
-            help="Additional kwargs to pass to the template renderer (Jinja only), specified as a JSON string. Will be accessible to the chat template.",
-        )
         parser.add_argument(
             "--completion-template",
             type=str,

From 4bfa02ecd2fcdad8950bb94db0bc720a0aafd3ea Mon Sep 17 00:00:00 2001
From: vzed <207368749+vincentzed@users.noreply.github.com>
Date: Wed, 30 Apr 2025 22:37:07 -0400
Subject: [PATCH 6/6] Turn this into a documentation-only PR

---
 docs/backend/openai_api_completions.ipynb | 58 ++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index 2fc74a7be13..5424e45a03b 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -94,7 +94,63 @@
     "\n",
     "The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
     "\n",
-    "Here is an example of a detailed chat completion request:"
+    "SGLang extends the standard API with the `extra_body` parameter, allowing for additional customization. One key option within `extra_body` is `chat_template_kwargs`, which can be used to pass arguments to the chat template processor.\n",
+    "\n",
+    "#### Enabling Model Thinking/Reasoning\n",
+    "\n",
+    "You can use `chat_template_kwargs` to enable or disable the model's internal thinking or reasoning output. Set `\"enable_thinking\": True` within `chat_template_kwargs` to include the reasoning steps in the response. This requires launching the server with a compatible reasoning parser (e.g., `--reasoning-parser qwen3` for Qwen3 models).\n",
+    "\n",
+    "Here's an example demonstrating how to enable thinking and retrieve the reasoning content separately (using `separate_reasoning: True`):\n",
+    "\n",
+    "```python\n",
+    "# Ensure the server is launched with a compatible reasoning parser, e.g.:\n",
+    "# python3 -m sglang.launch_server --model-path QwQ/Qwen3-32B-250415 --reasoning-parser qwen3 ...\n",
+    "\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "# Modify OpenAI's API key and API base to use SGLang's API server.\n",
+    "openai_api_key = \"EMPTY\"\n",
+    "openai_api_base = f\"http://127.0.0.1:{port}/v1\"  # Use the correct port\n",
+    "\n",
+    "client = OpenAI(\n",
+    "    api_key=openai_api_key,\n",
+    "    base_url=openai_api_base,\n",
+    ")\n",
+    "\n",
+    "model = \"QwQ/Qwen3-32B-250415\"  # Use the model loaded by the server\n",
+    "messages = [{\"role\": \"user\", \"content\": \"9.11 and 9.8, which is greater?\"}]\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=model,\n",
+    "    messages=messages,\n",
+    "    extra_body={\n",
+    "        \"chat_template_kwargs\": {\"enable_thinking\": True},\n",
+    "        \"separate_reasoning\": True\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "print(\"response.choices[0].message.reasoning_content: \\n\", response.choices[0].message.reasoning_content)\n",
+    "print(\"response.choices[0].message.content: \\n\", response.choices[0].message.content)\n",
+    "```\n",
+    "\n",
+    "**Example Output:**\n",
+    "\n",
+    "```\n",
+    "response.choices[0].message.reasoning_content: \n",
+    " Okay, so I need to figure out which number is greater between 9.11 and 9.8. Hmm, let me think. Both numbers start with 9, right? So the whole number part is the same. That means I need to look at the decimal parts to determine which one is bigger.\n",
+    "...\n",
+    "Therefore, after checking multiple methods—aligning decimals, subtracting, converting to fractions, and using a real-world analogy—it's clear that 9.8 is greater than 9.11.\n",
+    "\n",
+    "response.choices[0].message.content: \n",
+    " To determine which number is greater between **9.11** and **9.8**, follow these steps:\n",
+    "...\n",
+    "**Answer**: \n",
+    "9.8 is greater than 9.11.\n",
+    "```\n",
+    "\n",
+    "Setting `\"enable_thinking\": False` (or omitting it) will result in `reasoning_content` being `None`, as sketched below.\n",
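+    "\n",
+    "For instance, a minimal sketch (reusing the `client`, `model`, and `messages` objects from the example above) that disables thinking and checks that no reasoning is returned:\n",
+    "\n",
+    "```python\n",
+    "response = client.chat.completions.create(\n",
+    "    model=model,\n",
+    "    messages=messages,\n",
+    "    extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n",
+    ")\n",
+    "\n",
+    "# With thinking disabled, only the final answer is returned.\n",
+    "assert response.choices[0].message.reasoning_content is None\n",
+    "print(response.choices[0].message.content)\n",
+    "```\n",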
+    "\n",
+    "Here is an example of a detailed chat completion request using standard OpenAI parameters:"
    ]
   },
  {