diff --git a/python/sglang/srt/entrypoints/openai/protocol.py b/python/sglang/srt/entrypoints/openai/protocol.py index 1550c707a311..1f44b08fe07d 100644 --- a/python/sglang/srt/entrypoints/openai/protocol.py +++ b/python/sglang/srt/entrypoints/openai/protocol.py @@ -588,12 +588,13 @@ class ChatCompletionRequest(BaseModel): return_hidden_states: bool = False return_routed_experts: bool = False return_cached_tokens_details: bool = False - reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field( + reasoning_effort: Optional[Literal["none", "low", "medium", "high"]] = Field( default="medium", description="Constrains effort on reasoning for reasoning models. " - "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can " - "result in faster responses and fewer tokens used on reasoning in a response. " - "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.", + "'none' disables reasoning entirely, 'low' is the least effort, 'high' is the most effort. " + "Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning " + "in a response. 'none' defaults thinking and enable_thinking to false in " + "chat_template_kwargs (unless explicitly overridden). Not supported in the harmony path.", ) # Extra parameters for SRT backend only and will be ignored by OpenAI models. @@ -672,12 +673,10 @@ def set_tool_choice_default(cls, values): @classmethod def normalize_reasoning_inputs(cls, values: Dict): r = values.get("reasoning") - if r is None: - return values - if isinstance(r, dict): + if r is not None and isinstance(r, dict): effort = r.get("effort") or r.get("reasoning_effort") - if effort in {"low", "medium", "high"}: + if effort in {"none", "low", "medium", "high"}: values["reasoning_effort"] = effort enabled = ( @@ -694,6 +693,17 @@ def normalize_reasoning_inputs(cls, values: Dict): ctk.setdefault("thinking", True) values["chat_template_kwargs"] = ctk + if values.get("reasoning_effort") == "none": + ctk = values.get("chat_template_kwargs") + if not isinstance(ctk, dict): + ctk = {} + # different models check different keys: + # - "thinking" for deepseek-v3, kimi_k2 + # - "enable_thinking" for qwen3, glm45, nemotron_3, interns1 + ctk.setdefault("thinking", False) + ctk.setdefault("enable_thinking", False) + values["chat_template_kwargs"] = ctk + return values @model_validator(mode="before") diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py index 45efacb090e0..9e0f74dd3760 100644 --- a/python/sglang/srt/entrypoints/openai/serving_chat.py +++ b/python/sglang/srt/entrypoints/openai/serving_chat.py @@ -248,6 +248,11 @@ def _convert_to_internal_request( if request.chat_template_kwargs else None ) + if self.is_gpt_oss and reasoning_effort == "none": + raise ValueError( + f"Harmony does not support reasoning effort {reasoning_effort}" + ) + if reasoning_effort is not None: request.reasoning_effort = reasoning_effort diff --git a/test/registered/openai_server/basic/test_protocol.py b/test/registered/openai_server/basic/test_protocol.py index 47bf563816ef..b5a1758ffedc 100644 --- a/test/registered/openai_server/basic/test_protocol.py +++ b/test/registered/openai_server/basic/test_protocol.py @@ -192,6 +192,30 @@ def test_chat_completion_reasoning_effort(self): self.assertEqual(request.reasoning_effort, "high") self.assertEqual(request.chat_template_kwargs, {"thinking": True}) + def test_chat_completion_reasoning_effort_none(self): + """Test reasoning_effort='none' disables thinking""" + messages = [{"role": "user", "content": "Hello"}] + request = ChatCompletionRequest( + model="test-model", + messages=messages, + reasoning_effort="none", + ) + self.assertEqual(request.reasoning_effort, "none") + self.assertFalse(request.chat_template_kwargs.get("thinking")) + self.assertFalse(request.chat_template_kwargs.get("enable_thinking")) + + def test_chat_completion_reasoning_effort_none_from_reasoning_dict(self): + """Test reasoning_effort='none' via nested reasoning dict""" + messages = [{"role": "user", "content": "Hello"}] + request = ChatCompletionRequest( + model="test-model", + messages=messages, + reasoning={"effort": "none"}, + ) + self.assertEqual(request.reasoning_effort, "none") + self.assertFalse(request.chat_template_kwargs.get("thinking")) + self.assertFalse(request.chat_template_kwargs.get("enable_thinking")) + def test_chat_completion_json_format(self): """Test chat completion json format""" transcript = "Good morning! It's 7:00 AM, and I'm just waking up. Today is going to be a busy day, "