Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1793de4
make reasoning compatible with structured output
Muqi1029 Nov 3, 2025
faad6ef
Add default is_in_reasoning
Muqi1029 Nov 11, 2025
15aca9b
Merge branch 'main' into feat/req_reason
Muqi1029 Nov 11, 2025
85517b3
Merge branch 'main' into feat/req_reason
Muqi1029 Nov 17, 2025
7761d92
remove _get_enable_thinking_from_request
Muqi1029 Nov 17, 2025
825634a
fix typo
Muqi1029 Nov 17, 2025
894da05
Merge branch 'main' into feat/req_reason
CatherineSue Nov 18, 2025
5fb323d
Merge branch 'main' into feat/req_reason
Muqi1029 Nov 20, 2025
a2cbbe2
Merge remote-tracking branch 'origin/feat/req_reason' into feat/req_r…
Muqi1029 Nov 20, 2025
90e05ef
Add more model
Muqi1029 Nov 20, 2025
c978ced
Merge branch 'main' into feat/req_reason
acelyc111 Nov 22, 2025
a5a91ef
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 6, 2025
c34e4d5
fix logic
JustinTong0323 Dec 6, 2025
56eda38
fix ci
JustinTong0323 Dec 6, 2025
dfb0ec7
Merge branch 'sgl-project:main' into feat/req_reason
Muqi1029 Dec 6, 2025
992c2b3
prevent returning None to reasoning field
Muqi1029 Dec 6, 2025
b6365e0
refactor
Muqi1029 Dec 6, 2025
31951ea
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 6, 2025
dfbfadb
Revert "refactor"
JustinTong0323 Dec 6, 2025
34526e7
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 7, 2025
fc3fc30
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 7, 2025
e4f0ef5
fix gpt-oss
JustinTong0323 Dec 7, 2025
1c58c5f
fix
JustinTong0323 Dec 7, 2025
48e7919
fix
JustinTong0323 Dec 8, 2025
56ec295
Merge branch 'main' into feat/req_reason
JustinTong0323 Dec 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,13 @@
"## Native API and SGLang Runtime (SRT)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> Note: For the native API, as a workaround, you need to set the `reasoning` argument to `True` so that the model thinks before generating the structured output. This is not required for the chat-completion API."
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -407,6 +414,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": text,\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 2048,\n",
Expand Down Expand Up @@ -454,6 +462,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": text,\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 2048,\n",
Expand Down Expand Up @@ -482,6 +491,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Give me the information of the capital of France.\",\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"max_new_tokens\": 2048,\n",
" \"temperature\": 0,\n",
Expand Down Expand Up @@ -519,6 +529,7 @@
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Paris is the capital of\",\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 2048,\n",
Expand Down Expand Up @@ -547,6 +558,7 @@
")\n",
"payload = {\n",
" \"text\": text,\n",
" \"reasoning\": True,\n",
" \"sampling_params\": {\n",
" \"max_new_tokens\": 2048,\n",
" \"structural_tag\": json.dumps(\n",
Expand Down
15 changes: 11 additions & 4 deletions python/sglang/srt/constrained/base_grammar_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ def __init__(self):
self.grammar_stats = None
self.current_token = None

def maybe_init_reasoning(self, reasoning: bool):
    """Hook for reasoning-aware grammar objects; a no-op in the base class.

    Subclasses that wrap a grammar with a thinking phase (see
    ReasonerGrammarObject) override this to initialize their per-request
    reasoning state from the request's `reasoning` flag.
    """
    pass

def accept_token(self, token: int) -> None:
"""
Accept a token in the grammar.
Expand Down Expand Up @@ -151,7 +154,9 @@ def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
return self._not_supported("structural_tag", key_string)

def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
def _init_value_dispatch(
self, key: Tuple[str, str], reasoning: bool
) -> Optional[BaseGrammarObject]:
s = time.perf_counter()
key_type, key_string = key
if key_type == "json":
Expand All @@ -174,12 +179,14 @@ def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObje
return grammar

def get_cached_or_future_value(
self, key: Tuple[str, str]
self, key: Tuple[str, str], reasoning: bool
) -> Optional[BaseGrammarObject]:
value = self.cache.get(key)
if value:
return value.copy(), True
value = self.executor.submit(self._init_value_dispatch, key)
copied_value = value.copy()
copied_value.maybe_init_reasoning(reasoning)
return copied_value, True
value = self.executor.submit(self._init_value_dispatch, key, reasoning)
return value, False

def set_cache(self, key: Tuple[str, str], value: BaseGrammarObject):
Expand Down
15 changes: 11 additions & 4 deletions python/sglang/srt/constrained/reasoner_grammar_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@


class ReasonerGrammarObject(BaseGrammarObject):
def __init__(self, grammar: BaseGrammarObject, think_end_id):
def __init__(self, grammar: BaseGrammarObject, think_end_id: int):
super().__init__()
self.grammar = grammar
self.think_end_id = think_end_id
Expand All @@ -34,6 +34,9 @@ def __init__(self, grammar: BaseGrammarObject, think_end_id):
# + means number of tokens after thinking ended
self.tokens_after_think_end = -1

def maybe_init_reasoning(self, reasoning: bool):
    """Reset the thinking-token counter for a new request.

    A value of -1 means the thinking phase has not ended yet (reasoning
    requested); 0 means thinking is already considered finished, so the
    wrapped grammar constrains output immediately.
    """
    if reasoning:
        self.tokens_after_think_end = -1
    else:
        self.tokens_after_think_end = 0

def transfer_state(self, token: int) -> int:
if self.tokens_after_think_end == -1 and token == self.think_end_id:
self.tokens_after_think_end = 0
Expand Down Expand Up @@ -109,9 +112,13 @@ def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
self.grammar_backend = grammar_backend
self.think_end_id = think_end_id

def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
ret = self.grammar_backend._init_value_dispatch(key)
def _init_value_dispatch(
self, key: Tuple[str, str], reasoning: bool
) -> Optional[BaseGrammarObject]:
ret = self.grammar_backend._init_value_dispatch(key, reasoning)
# avoid wrapping invalid grammar, so that the scheduler can detect it
if ret is None or ret is INVALID_GRAMMAR_OBJ:
return ret
return ReasonerGrammarObject(ret, self.think_end_id)
obj = ReasonerGrammarObject(ret, self.think_end_id)
obj.maybe_init_reasoning(reasoning)
return obj
47 changes: 23 additions & 24 deletions python/sglang/srt/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ def _convert_to_internal_request(
return_hidden_states=request.return_hidden_states,
rid=request.rid,
extra_key=self._compute_extra_key(request),
reasoning=self._get_reasoning_from_request(request),
priority=request.priority,
custom_labels=custom_labels,
custom_logit_processor=request.custom_logit_processor,
Expand Down Expand Up @@ -443,7 +444,10 @@ def _apply_conversation_template(
prompt = prompt[: -len(conv.sep2)]
else:
prompt = conv.get_prompt()
if self._get_enable_thinking_from_request(request):
if self._get_reasoning_from_request(
request
) and self.reasoning_parser not in ["qwen3", "qwen3-thinking", "glm4"]:
# qwen3 and glm4 think internally without a leading <think> token
prompt += "<think>" # Note(Xinyuan): hard code thinking token

image_data = conv.image_data if conv.image_data else None
Expand Down Expand Up @@ -775,7 +779,7 @@ def _build_chat_response(
if reasoning_parser and request.separate_reasoning:
is_force_reasoning = (
self.template_manager.force_reasoning
or self._get_enable_thinking_from_request(request)
or self._get_reasoning_from_request(request)
)
try:
parser = ReasoningParser(
Expand Down Expand Up @@ -1022,7 +1026,7 @@ def _process_reasoning_stream(
if index not in reasoning_parser_dict:
is_force_reasoning = (
self.template_manager.force_reasoning
or self._get_enable_thinking_from_request(request)
or self._get_reasoning_from_request(request)
)
reasoning_parser_dict[index] = ReasoningParser(
self.reasoning_parser,
Expand Down Expand Up @@ -1052,27 +1056,22 @@ def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa
return idx

def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.

NOTE: This parameter is only useful for models that support enable_thinking
flag, such as Qwen3.

Args:
request_obj: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found, otherwise False.
"""
if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
# For Qwen3 models, `enable_thinking` is supported.
if self.reasoning_parser in ["qwen3", "glm45"]:
return request.chat_template_kwargs.get("enable_thinking", False)
# For DeepSeek-V3.1 models, `thinking` is supported.
elif self.reasoning_parser in ["deepseek-v3"]:
return request.chat_template_kwargs.get("thinking", False)
else:
return False
return False
def _get_reasoning_from_request(self, request: ChatCompletionRequest) -> bool:
    """Decide whether this chat request should run with reasoning enabled.

    Returns False when no reasoning parser is configured. For parser
    families with an explicit toggle in chat_template_kwargs, honors that
    toggle; any other configured reasoning parser defaults to True.
    """
    parser = self.reasoning_parser
    if not parser:
        # No reasoning parser configured for this server: never reason.
        return False

    kwargs = request.chat_template_kwargs
    if parser in ("deepseek-v3",):
        # DeepSeek-V3.1 opts *in* via chat_template_kwargs["thinking"].
        return kwargs is not None and kwargs.get("thinking") is True
    if parser in ("qwen3", "glm45"):
        # qwen3 / glm45 reason by default; enable_thinking=False opts out.
        return not kwargs or kwargs.get("enable_thinking", True) is True

    # Every other reasoning parser: reasoning is always on.
    return True

async def _process_tool_call_stream(
self,
Expand Down
6 changes: 6 additions & 0 deletions python/sglang/srt/managers/io_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ class GenerateReqInput(BaseReq):
bootstrap_room: Optional[Union[List[int], int]] = None
bootstrap_pair_key: Optional[Union[List[str], str]] = None

# For reasoning
reasoning: bool = False

# Validation step duration
validation_time: Optional[float] = None

Expand Down Expand Up @@ -675,6 +678,9 @@ class TokenizedGenerateReqInput(BaseReq):
bootstrap_room: Optional[int] = None
bootstrap_pair_key: Optional[str] = None

# For reasoning
reasoning: bool = False

# For data parallel rank routing
data_parallel_rank: Optional[int] = None

Expand Down
4 changes: 4 additions & 0 deletions python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ def __init__(
token_type_ids: List[int] = None,
session_id: Optional[str] = None,
custom_logit_processor: Optional[str] = None,
reasoning: bool = False,
return_hidden_states: bool = False,
eos_token_ids: Optional[Set[int]] = None,
bootstrap_host: Optional[str] = None,
Expand Down Expand Up @@ -517,6 +518,9 @@ def __init__(
# For multi-http worker
self.http_worker_ipc = http_worker_ipc

# For reasoning
self.reasoning = reasoning

# Sampling info
if isinstance(sampling_params.custom_params, dict):
sampling_params = copy.copy(sampling_params)
Expand Down
5 changes: 4 additions & 1 deletion python/sglang/srt/managers/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1304,6 +1304,7 @@ def handle_generate_request(
lora_id=recv_req.lora_id,
input_embeds=recv_req.input_embeds,
custom_logit_processor=recv_req.custom_logit_processor,
reasoning=recv_req.reasoning,
return_hidden_states=recv_req.return_hidden_states,
eos_token_ids=self.model_config.hf_eos_token_id,
bootstrap_host=recv_req.bootstrap_host,
Expand Down Expand Up @@ -1431,7 +1432,9 @@ def handle_generate_request(
elif req.sampling_params.structural_tag:
key = ("structural_tag", req.sampling_params.structural_tag)

value, cache_hit = self.grammar_backend.get_cached_or_future_value(key)
value, cache_hit = self.grammar_backend.get_cached_or_future_value(
key, req.reasoning
)
req.grammar = value

if not cache_hit:
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/managers/tokenizer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,7 @@ def _create_tokenized_object(
input_embeds=input_embeds,
session_params=session_params,
custom_logit_processor=obj.custom_logit_processor,
reasoning=obj.reasoning,
return_hidden_states=obj.return_hidden_states,
data_parallel_rank=obj.data_parallel_rank,
priority=obj.priority,
Expand Down
Loading