Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1793de4
make reasoning compatible with structured output
Muqi1029 Nov 3, 2025
faad6ef
Add default is_in_reasoning
Muqi1029 Nov 11, 2025
15aca9b
Merge branch 'main' into feat/req_reason
Muqi1029 Nov 11, 2025
85517b3
Merge branch 'main' into feat/req_reason
Muqi1029 Nov 17, 2025
7761d92
remove _get_enable_thinking_from_request
Muqi1029 Nov 17, 2025
825634a
fix typo
Muqi1029 Nov 17, 2025
894da05
Merge branch 'main' into feat/req_reason
CatherineSue Nov 18, 2025
5fb323d
Merge branch 'main' into feat/req_reason
Muqi1029 Nov 20, 2025
a2cbbe2
Merge remote-tracking branch 'origin/feat/req_reason' into feat/req_r…
Muqi1029 Nov 20, 2025
90e05ef
Add more model
Muqi1029 Nov 20, 2025
c978ced
Merge branch 'main' into feat/req_reason
acelyc111 Nov 22, 2025
a5a91ef
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 6, 2025
c34e4d5
fix logic
JustinTong0323 Dec 6, 2025
56eda38
fix ci
JustinTong0323 Dec 6, 2025
dfb0ec7
Merge branch 'sgl-project:main' into feat/req_reason
Muqi1029 Dec 6, 2025
992c2b3
prevent returning None to reasoning field
Muqi1029 Dec 6, 2025
b6365e0
refactor
Muqi1029 Dec 6, 2025
31951ea
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 6, 2025
dfbfadb
Revert "refactor"
JustinTong0323 Dec 6, 2025
34526e7
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 7, 2025
fc3fc30
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 7, 2025
e4f0ef5
fix gpt-oss
JustinTong0323 Dec 7, 2025
1c58c5f
fix
JustinTong0323 Dec 7, 2025
48e7919
fix
JustinTong0323 Dec 8, 2025
56ec295
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,13 @@
"## Native API and SGLang Runtime (SRT)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> Note: For the native API, as a workaround, you need to set the `reasoning` argument to `True` so that the model thinks before generating the structured output. This is not required for the chat-completion API."
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -407,6 +414,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": text,\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 2048,\n",
Expand Down Expand Up @@ -454,6 +462,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": text,\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 2048,\n",
Expand Down Expand Up @@ -482,6 +491,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Give me the information of the capital of France.\",\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"max_new_tokens\": 2048,\n",
" \"temperature\": 0,\n",
Expand Down Expand Up @@ -519,6 +529,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Paris is the capital of\",\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 2048,\n",
Expand Down Expand Up @@ -547,6 +558,7 @@
")\n",
"payload = {\n",
" \"text\": text,\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"max_new_tokens\": 2048,\n",
" \"structural_tag\": json.dumps(\n",
Expand Down
15 changes: 11 additions & 4 deletions python/sglang/srt/constrained/base_grammar_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ def __init__(self):
self.grammar_stats = None
self.current_token = None

def maybe_init_reasoning(self, reasoning: bool):
    """Hook for reasoning-aware grammar objects; a no-op in the base class.

    Subclasses that wrap a grammar with a thinking phase (see
    ReasonerGrammarObject) override this to initialize their per-request
    reasoning state from the request's `reasoning` flag.
    """
    pass

def accept_token(self, token: int) -> None:
"""
Accept a token in the grammar.
Expand Down Expand Up @@ -151,7 +154,9 @@ def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
return self._not_supported("structural_tag", key_string)

def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
def _init_value_dispatch(
self, key: Tuple[str, str], reasoning: bool
) -> Optional[BaseGrammarObject]:
s = time.perf_counter()
key_type, key_string = key
if key_type == "json":
Expand All @@ -174,12 +179,14 @@ def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObje
return grammar

def get_cached_or_future_value(
self, key: Tuple[str, str]
self, key: Tuple[str, str], reasoning: bool
) -> Optional[BaseGrammarObject]:
value = self.cache.get(key)
if value:
return value.copy(), True
value = self.executor.submit(self._init_value_dispatch, key)
copied_value = value.copy()
copied_value.maybe_init_reasoning(reasoning)
return copied_value, True
value = self.executor.submit(self._init_value_dispatch, key, reasoning)
return value, False

def set_cache(self, key: Tuple[str, str], value: BaseGrammarObject):
Expand Down
15 changes: 11 additions & 4 deletions python/sglang/srt/constrained/reasoner_grammar_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@


class ReasonerGrammarObject(BaseGrammarObject):
def __init__(self, grammar: BaseGrammarObject, think_end_id):
def __init__(self, grammar: BaseGrammarObject, think_end_id: int):
super().__init__()
self.grammar = grammar
self.think_end_id = think_end_id
Expand All @@ -34,6 +34,9 @@ def __init__(self, grammar: BaseGrammarObject, think_end_id):
# + means number of tokens after thinking ended
self.tokens_after_think_end = -1

def maybe_init_reasoning(self, reasoning: bool):
    """Reset the thinking-token counter for a new request.

    A value of -1 means the thinking phase has not ended yet (reasoning
    requested); 0 means thinking is already considered finished, so the
    wrapped grammar constrains output immediately.
    """
    if reasoning:
        self.tokens_after_think_end = -1
    else:
        self.tokens_after_think_end = 0

def transfer_state(self, token: int) -> int:
if self.tokens_after_think_end == -1 and token == self.think_end_id:
self.tokens_after_think_end = 0
Expand Down Expand Up @@ -109,9 +112,13 @@ def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
self.grammar_backend = grammar_backend
self.think_end_id = think_end_id

def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
ret = self.grammar_backend._init_value_dispatch(key)
def _init_value_dispatch(
self, key: Tuple[str, str], reasoning: bool
) -> Optional[BaseGrammarObject]:
ret = self.grammar_backend._init_value_dispatch(key, reasoning)
# avoid wrapping invalid grammar, so that the scheduler can detect it
if ret is None or ret is INVALID_GRAMMAR_OBJ:
return ret
return ReasonerGrammarObject(ret, self.think_end_id)
obj = ReasonerGrammarObject(ret, self.think_end_id)
obj.maybe_init_reasoning(reasoning)
return obj
47 changes: 23 additions & 24 deletions python/sglang/srt/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ def _convert_to_internal_request(
return_hidden_states=request.return_hidden_states,
rid=request.rid,
extra_key=self._compute_extra_key(request),
reasoning=self._get_reasoning_from_request(request),
priority=request.priority,
custom_labels=custom_labels,
custom_logit_processor=request.custom_logit_processor,
Expand Down Expand Up @@ -443,7 +444,10 @@ def _apply_conversation_template(
prompt = prompt[: -len(conv.sep2)]
else:
prompt = conv.get_prompt()
if self._get_enable_thinking_from_request(request):
if self._get_reasoning_from_request(
request
) and self.reasoning_parser not in ["qwen3", "qwen3-thinking", "glm4"]:
# qwen3 and glm4 think internally without a leading <think> token
prompt += "<think>" # Note(Xinyuan): hard code thinking token

image_data = conv.image_data if conv.image_data else None
Expand Down Expand Up @@ -775,7 +779,7 @@ def _build_chat_response(
if reasoning_parser and request.separate_reasoning:
is_force_reasoning = (
self.template_manager.force_reasoning
or self._get_enable_thinking_from_request(request)
or self._get_reasoning_from_request(request)
)
try:
parser = ReasoningParser(
Expand Down Expand Up @@ -1022,7 +1026,7 @@ def _process_reasoning_stream(
if index not in reasoning_parser_dict:
is_force_reasoning = (
self.template_manager.force_reasoning
or self._get_enable_thinking_from_request(request)
or self._get_reasoning_from_request(request)
)
reasoning_parser_dict[index] = ReasoningParser(
self.reasoning_parser,
Expand Down Expand Up @@ -1052,27 +1056,22 @@ def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa
return idx

def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.

NOTE: This parameter is only useful for models that support enable_thinking
flag, such as Qwen3.

Args:
request_obj: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found, otherwise False.
"""
if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
# For Qwen3 models, `enable_thinking` is supported.
if self.reasoning_parser in ["qwen3", "glm45"]:
return request.chat_template_kwargs.get("enable_thinking", False)
# For DeepSeek-V3.1 models, `thinking` is supported.
elif self.reasoning_parser in ["deepseek-v3"]:
return request.chat_template_kwargs.get("thinking", False)
else:
return False
return False
def _get_reasoning_from_request(self, request: ChatCompletionRequest) -> bool:
    """Decide whether this chat request should run with reasoning enabled.

    Returns False when no reasoning parser is configured. For parser
    families with an explicit toggle in chat_template_kwargs, honors that
    toggle; any other configured reasoning parser defaults to True.
    """
    parser = self.reasoning_parser
    if not parser:
        # No reasoning parser configured for this server: never reason.
        return False

    kwargs = request.chat_template_kwargs
    if parser in ("deepseek-v3",):
        # DeepSeek-V3.1 opts *in* via chat_template_kwargs["thinking"].
        return kwargs is not None and kwargs.get("thinking") is True
    if parser in ("qwen3", "glm45"):
        # qwen3 / glm45 reason by default; enable_thinking=False opts out.
        return not kwargs or kwargs.get("enable_thinking", True) is True

    # Every other reasoning parser: reasoning is always on.
    return True

async def _process_tool_call_stream(
self,
Expand Down
6 changes: 6 additions & 0 deletions python/sglang/srt/managers/io_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ class GenerateReqInput(BaseReq):
bootstrap_room: Optional[Union[List[int], int]] = None
bootstrap_pair_key: Optional[Union[List[str], str]] = None

# For reasoning
reasoning: bool = False

# Validation step duration
validation_time: Optional[float] = None

Expand Down Expand Up @@ -675,6 +678,9 @@ class TokenizedGenerateReqInput(BaseReq):
bootstrap_room: Optional[int] = None
bootstrap_pair_key: Optional[str] = None

# For reasoning
reasoning: bool = False

# For data parallel rank routing
data_parallel_rank: Optional[int] = None

Expand Down
4 changes: 4 additions & 0 deletions python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ def __init__(
token_type_ids: List[int] = None,
session_id: Optional[str] = None,
custom_logit_processor: Optional[str] = None,
reasoning: bool = False,
return_hidden_states: bool = False,
eos_token_ids: Optional[Set[int]] = None,
bootstrap_host: Optional[str] = None,
Expand Down Expand Up @@ -517,6 +518,9 @@ def __init__(
# For multi-http worker
self.http_worker_ipc = http_worker_ipc

# For reasoning
self.reasoning = reasoning

# Sampling info
if isinstance(sampling_params.custom_params, dict):
sampling_params = copy.copy(sampling_params)
Expand Down
5 changes: 4 additions & 1 deletion python/sglang/srt/managers/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1304,6 +1304,7 @@ def handle_generate_request(
lora_id=recv_req.lora_id,
input_embeds=recv_req.input_embeds,
custom_logit_processor=recv_req.custom_logit_processor,
reasoning=recv_req.reasoning,
return_hidden_states=recv_req.return_hidden_states,
eos_token_ids=self.model_config.hf_eos_token_id,
bootstrap_host=recv_req.bootstrap_host,
Expand Down Expand Up @@ -1431,7 +1432,9 @@ def handle_generate_request(
elif req.sampling_params.structural_tag:
key = ("structural_tag", req.sampling_params.structural_tag)

value, cache_hit = self.grammar_backend.get_cached_or_future_value(key)
value, cache_hit = self.grammar_backend.get_cached_or_future_value(
key, req.reasoning
)
req.grammar = value

if not cache_hit:
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/managers/tokenizer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,7 @@ def _create_tokenized_object(
input_embeds=input_embeds,
session_params=session_params,
custom_logit_processor=obj.custom_logit_processor,
reasoning=obj.reasoning,
return_hidden_states=obj.return_hidden_states,
data_parallel_rank=obj.data_parallel_rank,
priority=obj.priority,
Expand Down
Loading