diff --git a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb
index 9cdcc29e152a..e7f390a1a80b 100644
--- a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb
+++ b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb
@@ -360,6 +360,13 @@
     "## Native API and SGLang Runtime (SRT)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> Note: For the native API, as a workaround, you need to set the `reasoning` argument to `True` to ensure the model thinks before generating the structured output. This is not required for the chat completions API."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -407,6 +414,7 @@
     "    f\"http://localhost:{port}/generate\",\n",
     "    json={\n",
     "        \"text\": text,\n",
+    "        \"reasoning\": True,\n",
     "        \"sampling_params\": {\n",
     "            \"temperature\": 0,\n",
     "            \"max_new_tokens\": 2048,\n",
@@ -454,6 +462,7 @@
     "    f\"http://localhost:{port}/generate\",\n",
     "    json={\n",
     "        \"text\": text,\n",
+    "        \"reasoning\": True,\n",
     "        \"sampling_params\": {\n",
     "            \"temperature\": 0,\n",
     "            \"max_new_tokens\": 2048,\n",
@@ -482,6 +491,7 @@
     "    f\"http://localhost:{port}/generate\",\n",
     "    json={\n",
     "        \"text\": \"Give me the information of the capital of France.\",\n",
+    "        \"reasoning\": True,\n",
     "        \"sampling_params\": {\n",
     "            \"max_new_tokens\": 2048,\n",
     "            \"temperature\": 0,\n",
@@ -519,6 +529,7 @@
     "    f\"http://localhost:{port}/generate\",\n",
     "    json={\n",
     "        \"text\": \"Paris is the capital of\",\n",
+    "        \"reasoning\": True,\n",
     "        \"sampling_params\": {\n",
     "            \"temperature\": 0,\n",
     "            \"max_new_tokens\": 2048,\n",
@@ -547,6 +558,7 @@
     ")\n",
     "payload = {\n",
     "    \"text\": text,\n",
+    "    \"reasoning\": True,\n",
     "    \"sampling_params\": {\n",
     "        \"max_new_tokens\": 2048,\n",
     "        \"structural_tag\": json.dumps(\n",
diff --git a/python/sglang/srt/constrained/base_grammar_backend.py b/python/sglang/srt/constrained/base_grammar_backend.py
index 1144bb17cbfe..6369cea74b4b 100644
--- a/python/sglang/srt/constrained/base_grammar_backend.py
+++ b/python/sglang/srt/constrained/base_grammar_backend.py
@@ -46,6 +46,9 @@ def __init__(self):
         self.grammar_stats = None
         self.current_token = None
 
+    def maybe_init_reasoning(self, reasoning: bool):
+        pass
+
     def accept_token(self, token: int) -> None:
         """
        Accept a token in the grammar.
@@ -151,7 +154,9 @@ def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
     def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("structural_tag", key_string)
 
-    def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
+    def _init_value_dispatch(
+        self, key: Tuple[str, str], reasoning: bool
+    ) -> Optional[BaseGrammarObject]:
         s = time.perf_counter()
         key_type, key_string = key
         if key_type == "json":
@@ -174,12 +179,14 @@ def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObje
         return grammar
 
     def get_cached_or_future_value(
-        self, key: Tuple[str, str]
+        self, key: Tuple[str, str], reasoning: bool
     ) -> Optional[BaseGrammarObject]:
         value = self.cache.get(key)
         if value:
-            return value.copy(), True
-        value = self.executor.submit(self._init_value_dispatch, key)
+            copied_value = value.copy()
+            copied_value.maybe_init_reasoning(reasoning)
+            return copied_value, True
+        value = self.executor.submit(self._init_value_dispatch, key, reasoning)
         return value, False
 
     def set_cache(self, key: Tuple[str, str], value: BaseGrammarObject):
diff --git a/python/sglang/srt/constrained/reasoner_grammar_backend.py b/python/sglang/srt/constrained/reasoner_grammar_backend.py
index 0179d3426001..e2ae8405e315 100644
--- a/python/sglang/srt/constrained/reasoner_grammar_backend.py
+++ b/python/sglang/srt/constrained/reasoner_grammar_backend.py
@@ -25,7 +25,7 @@
 
 
 class ReasonerGrammarObject(BaseGrammarObject):
-    def __init__(self, grammar: BaseGrammarObject, think_end_id):
+    def __init__(self, grammar: BaseGrammarObject, think_end_id: int):
         super().__init__()
         self.grammar = grammar
         self.think_end_id = think_end_id
@@ -34,6 +34,9 @@ def __init__(self, grammar: BaseGrammarObject, think_end_id):
         # + means number of tokens after thinking ended
         self.tokens_after_think_end = -1
 
+    def maybe_init_reasoning(self, reasoning: bool):
+        self.tokens_after_think_end = -1 if reasoning else 0
+
     def transfer_state(self, token: int) -> int:
         if self.tokens_after_think_end == -1 and token == self.think_end_id:
             self.tokens_after_think_end = 0
@@ -109,9 +112,13 @@ def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
         self.grammar_backend = grammar_backend
         self.think_end_id = think_end_id
 
-    def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
-        ret = self.grammar_backend._init_value_dispatch(key)
+    def _init_value_dispatch(
+        self, key: Tuple[str, str], reasoning: bool
+    ) -> Optional[BaseGrammarObject]:
+        ret = self.grammar_backend._init_value_dispatch(key, reasoning)
         # avoid wrapping invalid grammar, so that the scheduler can detect it
         if ret is None or ret is INVALID_GRAMMAR_OBJ:
             return ret
-        return ReasonerGrammarObject(ret, self.think_end_id)
+        obj = ReasonerGrammarObject(ret, self.think_end_id)
+        obj.maybe_init_reasoning(reasoning)
+        return obj
diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py
index 48197bd78645..b51912d57f64 100644
--- a/python/sglang/srt/entrypoints/openai/serving_chat.py
+++ b/python/sglang/srt/entrypoints/openai/serving_chat.py
@@ -213,6 +213,7 @@ def _convert_to_internal_request(
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
             extra_key=self._compute_extra_key(request),
+            reasoning=self._get_reasoning_from_request(request),
             priority=request.priority,
             custom_labels=custom_labels,
             custom_logit_processor=request.custom_logit_processor,
@@ -443,7 +444,10 @@ def _apply_conversation_template(
                 prompt = prompt[: -len(conv.sep2)]
         else:
             prompt = conv.get_prompt()
-        if self._get_enable_thinking_from_request(request):
+        if self._get_reasoning_from_request(
+            request
+        ) and self.reasoning_parser not in ["qwen3", "qwen3-thinking", "glm4"]:
+            # qwen3 and glm4 think internally without a leading token
             prompt += "<think>"  # Note(Xinyuan): hard code thinking token
 
         image_data = conv.image_data if conv.image_data else None
@@ -775,7 +779,7 @@ def _build_chat_response(
         if reasoning_parser and request.separate_reasoning:
             is_force_reasoning = (
                 self.template_manager.force_reasoning
-                or self._get_enable_thinking_from_request(request)
+                or self._get_reasoning_from_request(request)
             )
             try:
                 parser = ReasoningParser(
@@ -1022,7 +1026,7 @@ def _process_reasoning_stream(
         if index not in reasoning_parser_dict:
             is_force_reasoning = (
                 self.template_manager.force_reasoning
-                or self._get_enable_thinking_from_request(request)
+                or self._get_reasoning_from_request(request)
             )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.reasoning_parser,
@@ -1052,27 +1056,22 @@ def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
             idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
         return idx
 
-    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
-        """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
-
-        NOTE: This parameter is only useful for models that support enable_thinking
-        flag, such as Qwen3.
-
-        Args:
-            request_obj: The request object (or an item from a list of requests).
-        Returns:
-            The boolean value of 'enable_thinking' if found, otherwise False.
-        """
-        if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
-            # For Qwen3 models, `enable_thinking` is supported.
-            if self.reasoning_parser in ["qwen3", "glm45"]:
-                return request.chat_template_kwargs.get("enable_thinking", False)
-            # For DeepSeek-V3.1 models, `thinking` is supported.
-            elif self.reasoning_parser in ["deepseek-v3"]:
-                return request.chat_template_kwargs.get("thinking", False)
-            else:
-                return False
-        return False
+    def _get_reasoning_from_request(self, request: ChatCompletionRequest) -> bool:
+        """Determine whether the request requires reasoning."""
+        if not self.reasoning_parser:
+            return False
+        if self.reasoning_parser in ["deepseek-v3"]:
+            return (
+                request.chat_template_kwargs is not None
+                and request.chat_template_kwargs.get("thinking") is True
+            )
+        if self.reasoning_parser in ["qwen3", "glm45"]:
+            # qwen3 and glm45 enable reasoning by default
+            return (
+                not request.chat_template_kwargs
+                or request.chat_template_kwargs.get("enable_thinking", True) is True
+            )
+        return True  # default for other reasoning parsers
 
     async def _process_tool_call_stream(
         self,
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
index e34736cc409c..39bbfaf3c50b 100644
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -197,6 +197,9 @@ class GenerateReqInput(BaseReq):
     bootstrap_room: Optional[Union[List[int], int]] = None
     bootstrap_pair_key: Optional[Union[List[str], str]] = None
 
+    # For reasoning
+    reasoning: bool = False
+
     # Validation step duration
     validation_time: Optional[float] = None
 
@@ -675,6 +678,9 @@ class TokenizedGenerateReqInput(BaseReq):
     bootstrap_room: Optional[int] = None
     bootstrap_pair_key: Optional[str] = None
 
+    # For reasoning
+    reasoning: bool = False
+
     # For data parallel rank routing
     data_parallel_rank: Optional[int] = None
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index bf1f13d5831b..31a4846eb989 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -472,6 +472,7 @@ def __init__(
         token_type_ids: List[int] = None,
         session_id: Optional[str] = None,
         custom_logit_processor: Optional[str] = None,
+        reasoning: bool = False,
         return_hidden_states: bool = False,
         eos_token_ids: Optional[Set[int]] = None,
         bootstrap_host: Optional[str] = None,
@@ -517,6 +518,9 @@
         # For multi-http worker
         self.http_worker_ipc = http_worker_ipc
 
+        # For reasoning
+        self.reasoning = reasoning
+
         # Sampling info
         if isinstance(sampling_params.custom_params, dict):
             sampling_params = copy.copy(sampling_params)
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index b801fd8f8e63..c47d5ab65eef 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1304,6 +1304,7 @@ def handle_generate_request(
             lora_id=recv_req.lora_id,
             input_embeds=recv_req.input_embeds,
             custom_logit_processor=recv_req.custom_logit_processor,
+            reasoning=recv_req.reasoning,
             return_hidden_states=recv_req.return_hidden_states,
             eos_token_ids=self.model_config.hf_eos_token_id,
             bootstrap_host=recv_req.bootstrap_host,
@@ -1431,7 +1432,9 @@
         elif req.sampling_params.structural_tag:
             key = ("structural_tag", req.sampling_params.structural_tag)
 
-        value, cache_hit = self.grammar_backend.get_cached_or_future_value(key)
+        value, cache_hit = self.grammar_backend.get_cached_or_future_value(
+            key, req.reasoning
+        )
         req.grammar = value
         if not cache_hit:
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index b90cf0616cba..1396a8248cca 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -887,6 +887,7 @@ def _create_tokenized_object(
             input_embeds=input_embeds,
             session_params=session_params,
             custom_logit_processor=obj.custom_logit_processor,
+            reasoning=obj.reasoning,
             return_hidden_states=obj.return_hidden_states,
             data_parallel_rank=obj.data_parallel_rank,
             priority=obj.priority,
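
To try the change end to end, here is a minimal sketch that exercises the new flag against the native `/generate` endpoint, mirroring the notebook cells above. The port, prompt, and schema are illustrative assumptions; the server must be launched with a reasoning-capable model and a reasoning parser for the flag to take effect.

```python
import json

import requests

port = 30000  # assumed port of a locally running SGLang server

# Illustrative JSON schema; any valid schema is handled the same way.
json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "population": {"type": "integer"},
        },
        "required": ["name", "population"],
    }
)

response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": "Give me the information of the capital of France.",
        # The workaround this PR adds: let the model finish thinking
        # before the grammar starts constraining the output.
        "reasoning": True,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 2048,
            "json_schema": json_schema,
        },
    },
)
print(response.json())
```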