Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
64f4927
Enables force reasoning based on chat template
JustinTong0323 Jul 25, 2025
1e62a55
update
JustinTong0323 Jul 25, 2025
b6b087a
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Jul 25, 2025
ba7c8f1
Unifies Qwen3 reasoning parser
JustinTong0323 Jul 25, 2025
548651c
fix none chat-template
JustinTong0323 Jul 26, 2025
2630d3a
fix: tokenizer is none
JustinTong0323 Jul 26, 2025
e11e124
Improves reasoning pattern detection
JustinTong0323 Jul 26, 2025
445ea2e
Adds qwen3-thinking reasoning parser option
JustinTong0323 Jul 26, 2025
c940b77
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Jul 26, 2025
371b5fc
Update docs/backend/openai_api_completions.ipynb
JustinTong0323 Jul 27, 2025
037b469
Update docs/backend/separate_reasoning.ipynb
JustinTong0323 Jul 27, 2025
994947e
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Jul 27, 2025
af32c23
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Jul 27, 2025
7683956
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Jul 28, 2025
5005b07
feat: add thinking prompt for request in OpenAIServingChat
JustinTong0323 Jul 28, 2025
273d252
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Jul 28, 2025
be24e17
fix: update thinking token comment and method signature in OpenAIServ…
JustinTong0323 Jul 30, 2025
75eb213
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Jul 30, 2025
68d9567
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Aug 1, 2025
ba8da26
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Aug 3, 2025
92b35e3
fix: disables thinking by default
JustinTong0323 Aug 3, 2025
d1b4f00
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Aug 4, 2025
f557d10
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Aug 4, 2025
f1f0c58
Merge branch 'main' into feat-auto-detect-force-reasoning-qwen3
JustinTong0323 Aug 4, 2025
b05b795
fix: tokenizer is none when skip init
JustinTong0323 Aug 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 5 additions & 8 deletions docs/backend/openai_api_completions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -101,19 +101,16 @@
"\n",
"**Reasoning Parser Options:**\n",
"- `--reasoning-parser deepseek-r1`: For DeepSeek-R1 family models (R1, R1-0528, R1-Distill)\n",
"- `--reasoning-parser qwen3`: For standard Qwen3 models that support `enable_thinking` parameter\n",
"- `--reasoning-parser qwen3-thinking`: For Qwen3-Thinking models (e.g., Qwen/Qwen3-235B-A22B-Thinking-2507) that always generate thinking content\n",
"- `--reasoning-parser qwen3`: For both standard Qwen3 models that support `enable_thinking` parameter and Qwen3-Thinking models\n",
"- `--reasoning-parser qwen3-thinking`: For Qwen3-Thinking models, force reasoning version of qwen3 parser\n",
"- `--reasoning-parser kimi`: For Kimi thinking models\n",
"\n",
"Here's an example demonstrating how to enable thinking and retrieve the reasoning content separately (using `separate_reasoning: True`):\n",
"\n",
"```python\n",
"# For standard Qwen3 models with enable_thinking support:\n",
"# For Qwen3 models with enable_thinking support:\n",
"# python3 -m sglang.launch_server --model-path QwQ/Qwen3-32B-250415 --reasoning-parser qwen3 ...\n",
"\n",
"# For Qwen3-Thinking models that always think:\n",
"# python3 -m sglang.launch_server --model-path Qwen/Qwen3-235B-A22B-Thinking-2507 --reasoning-parser qwen3-thinking ...\n",
"\n",
"from openai import OpenAI\n",
"\n",
"# Modify OpenAI's API key and API base to use SGLang's API server.\n",
Expand All @@ -132,7 +129,7 @@
" model=model,\n",
" messages=messages,\n",
" extra_body={\n",
" \"chat_template_kwargs\": {\"enable_thinking\": True}, # Only for standard Qwen3 models\n",
" \"chat_template_kwargs\": {\"enable_thinking\": True},\n",
" \"separate_reasoning\": True\n",
" }\n",
")\n",
Expand All @@ -158,7 +155,7 @@
"\n",
"Setting `\"enable_thinking\": False` (or omitting it) will result in `reasoning_content` being `None`.\n",
"\n",
"**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. When using `--reasoning-parser qwen3-thinking`, the model will always produce reasoning content regardless of the `enable_thinking` setting.\n",
"**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. Use `--reasoning-parser qwen3-thinking` or `--reasoning-parser qwen3` to parse the thinking content.\n",
"\n",
"Here is an example of a detailed chat completion request using standard OpenAI parameters:"
]
Expand Down
7 changes: 5 additions & 2 deletions docs/backend/separate_reasoning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"|---------|-----------------------------|------------------|-------|\n",
"| [DeepSeek‑R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `<think>` … `</think>` | `deepseek-r1` | Supports all variants (R1, R1-0528, R1-Distill) |\n",
"| [Standard Qwen3 models](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `<think>` … `</think>` | `qwen3` | Supports `enable_thinking` parameter |\n",
"| [Qwen3-Thinking models](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) | `<think>` … `</think>` | `qwen3-thinking` | Always generates thinking content |\n",
"| [Qwen3-Thinking models](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) | `<think>` … `</think>` | `qwen3` or `qwen3-thinking` | Always generates thinking content |\n",
"| [Kimi models](https://huggingface.co/collections/MoonshotAI/kimi-675e30c072b7ba7e79833be7) | `◁think▷` … `◁/think▷` | `kimi` | Uses special thinking delimiters |\n",
"\n",
"### Model-Specific Behaviors\n",
Expand All @@ -26,7 +26,10 @@
"\n",
"**Qwen3 Family:**\n",
"- Standard Qwen3 (e.g., Qwen3-2507): Use `qwen3` parser, supports `enable_thinking` in chat templates\n",
"- Qwen3-Thinking (e.g., Qwen3-235B-A22B-Thinking-2507): Use `qwen3-thinking` parser, always thinks"
"- Qwen3-Thinking (e.g., Qwen3-235B-A22B-Thinking-2507): Use `qwen3` or `qwen3-thinking` parser, always thinks\n",
"\n",
"**Kimi:**\n",
"- Kimi: Uses special `◁think▷` and `◁/think▷` tags"
]
},
{
Expand Down
13 changes: 9 additions & 4 deletions python/sglang/srt/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,8 @@ def _apply_conversation_template(
prompt = prompt[: -len(conv.sep2)]
else:
prompt = conv.get_prompt()
if self._get_enable_thinking_from_request(request):
prompt += "<think>" # Note(Xinyuan): hard code thinking token

image_data = conv.image_data if conv.image_data else None
video_data = conv.video_data if conv.video_data else None
Expand Down Expand Up @@ -647,7 +649,9 @@ def _build_chat_response(
if reasoning_parser and request.separate_reasoning:
try:
parser = ReasoningParser(
model_type=reasoning_parser, stream_reasoning=False
model_type=reasoning_parser,
stream_reasoning=False,
force_reasoning=self.template_manager.force_reasoning,
)
reasoning_text, text = parser.parse_non_stream(text)
except Exception as e:
Expand Down Expand Up @@ -813,11 +817,12 @@ def _process_reasoning_stream(
reasoning_parser_dict[index] = ReasoningParser(
self.tokenizer_manager.server_args.reasoning_parser,
request.stream_reasoning,
self.template_manager.force_reasoning,
)
reasoning_parser = reasoning_parser_dict[index]
return reasoning_parser.parse_stream_chunk(delta)

def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.

NOTE: This parameter is only useful for models that support enable_thinking
Expand All @@ -826,15 +831,15 @@ def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
Args:
request_obj: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found and not True, otherwise True.
The boolean value of 'enable_thinking' if found, otherwise False.
"""
if (
hasattr(request, "chat_template_kwargs")
and request.chat_template_kwargs
and request.chat_template_kwargs.get("enable_thinking") is not None
):
return request.chat_template_kwargs.get("enable_thinking")
return True
return False

async def _process_tool_call_stream(
self,
Expand Down
36 changes: 35 additions & 1 deletion python/sglang/srt/managers/template_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import json
import logging
import os
import re
from typing import Optional

from sglang.srt.code_completion_parser import (
Expand Down Expand Up @@ -54,6 +55,7 @@ def __init__(self):
self._chat_template_name: Optional[str] = None
self._completion_template_name: Optional[str] = None
self._jinja_template_content_format: Optional[str] = "openai"
self._force_reasoning: bool = False

@property
def chat_template_name(self) -> Optional[str]:
Expand All @@ -70,6 +72,31 @@ def jinja_template_content_format(self) -> Optional[str]:
"""Get the detected template content format ('string' or 'openai' or None)."""
return self._jinja_template_content_format

@property
def force_reasoning(self) -> bool:
    """
    Whether the loaded chat template forces reasoning/thinking output.

    Returns:
        True when the detected template always opens an assistant turn
        with a reasoning tag such as ``<think>``; False otherwise.
    """
    return self._force_reasoning

def _detect_reasoning_pattern(self, template: str) -> bool:
"""
Detect if the chat template contains reasoning/thinking patterns.
"""
if template is None:
return False

force_reasoning_pattern = r"<\|im_start\|>assistant\\n<think>\\n"
has_reasoning = re.search(force_reasoning_pattern, template) is not None

if has_reasoning:
logger.info("Detected the force reasoning pattern in chat template.")

return has_reasoning

def load_chat_template(
self, tokenizer_manager, chat_template_arg: Optional[str], model_path: str
) -> None:
Expand All @@ -93,7 +120,8 @@ def load_chat_template(
hf_template = self._resolve_hf_chat_template(tokenizer_manager)
if hf_template:
# override the chat template
tokenizer_manager.tokenizer.chat_template = hf_template
if tokenizer_manager.tokenizer:
tokenizer_manager.tokenizer.chat_template = hf_template
self._jinja_template_content_format = (
detect_jinja_template_content_format(hf_template)
)
Expand All @@ -106,6 +134,12 @@ def load_chat_template(
self._jinja_template_content_format = "string"
logger.info("No chat template found, defaulting to 'string' content format")

# Detect reasoning pattern from chat template
if tokenizer_manager.tokenizer:
self._force_reasoning = self._detect_reasoning_pattern(
tokenizer_manager.tokenizer.chat_template
)

def _load_explicit_chat_template(
self, tokenizer_manager, chat_template_arg: str
) -> None:
Expand Down
57 changes: 18 additions & 39 deletions python/sglang/srt/reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
If True, streams reasoning content as it arrives.
"""

def __init__(self, stream_reasoning: bool = True):
def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
# DeepSeek-R1 is assumed to be reasoning until `</think>` token
super().__init__(
"<think>",
Expand All @@ -144,7 +144,7 @@ def __init__(self, stream_reasoning: bool = True):

class Qwen3Detector(BaseReasoningFormatDetector):
"""
Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
Assumes reasoning format:
(<think>)*(.*)</think>

Expand All @@ -153,47 +153,16 @@ class Qwen3Detector(BaseReasoningFormatDetector):
- enable_thinking=True: "<think>reasoning content</think>The answer is 42."
- enable_thinking=False: "The answer is 42." (no thinking tokens)

This detector handles both cases.

NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
Those models always generate thinking content without <think> start tags.
Use "qwen3-thinking" parser type for those models instead.

Args:
stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
If True, streams reasoning content as it arrives.
"""

def __init__(self, stream_reasoning: bool = True):
super().__init__(
"<think>",
"</think>",
force_reasoning=False,
stream_reasoning=stream_reasoning,
)


class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
"""
Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
Assumes reasoning format:
*(.*)</think>

These models always generate thinking content without <think> start tag.
They do not support the enable_thinking parameter and always think.

Format: "I need to think about this...</think>The answer is 42."

Args:
stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
If True, streams reasoning content as it arrives.
"""

def __init__(self, stream_reasoning: bool = True):
def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
super().__init__(
"<think>",
"</think>",
force_reasoning=True,
force_reasoning=force_reasoning,
stream_reasoning=stream_reasoning,
)

Expand All @@ -207,7 +176,7 @@ class KimiDetector(BaseReasoningFormatDetector):
and the rest of the text as `normal_text`.
"""

def __init__(self, stream_reasoning: bool = True):
def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
super().__init__(
"◁think▷",
"◁/think▷",
Expand All @@ -230,21 +199,31 @@ class ReasoningParser:
DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
"deepseek-r1": DeepSeekR1Detector,
"qwen3": Qwen3Detector,
"qwen3-thinking": Qwen3ThinkingDetector,
"qwen3-thinking": Qwen3Detector,
"glm45": Qwen3Detector,
"kimi": KimiDetector,
"step3": DeepSeekR1Detector,
}

def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
def __init__(
self,
model_type: Optional[str] = None,
stream_reasoning: bool = True,
force_reasoning: bool = False,
):
if not model_type:
raise ValueError("Model type must be specified")

detector_class = self.DetectorMap.get(model_type.lower())
if not detector_class:
raise ValueError(f"Unsupported model type: {model_type}")

self.detector = detector_class(stream_reasoning=stream_reasoning)
if model_type.lower() == "qwen3-thinking":
force_reasoning = True

self.detector = detector_class(
stream_reasoning=stream_reasoning, force_reasoning=force_reasoning
)

def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
"""Non-streaming call: one-time parsing"""
Expand Down
Loading
Loading