diff --git a/python/sglang/srt/entrypoints/harmony_utils.py b/python/sglang/srt/entrypoints/harmony_utils.py
index 86d4356dc8e5..18c09f2e4581 100644
--- a/python/sglang/srt/entrypoints/harmony_utils.py
+++ b/python/sglang/srt/entrypoints/harmony_utils.py
@@ -60,7 +60,7 @@ def get_encoding():
 
 def get_system_message(
     model_identity: Optional[str] = None,
-    reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
+    reasoning_effort: Optional[Literal["none", "high", "medium", "low"]] = None,
     start_date: Optional[str] = None,
     browser_description: Optional[str] = None,
     python_description: Optional[str] = None,
@@ -69,6 +69,8 @@ def get_system_message(
     if model_identity is not None:
         sys_msg_content = sys_msg_content.with_model_identity(model_identity)
     if reasoning_effort is not None:
+        if reasoning_effort == "none":
+            raise ValueError("Harmony does not support reasoning_effort='none'")
         sys_msg_content = sys_msg_content.with_reasoning_effort(
             REASONING_EFFORT[reasoning_effort]
         )
diff --git a/python/sglang/srt/entrypoints/openai/protocol.py b/python/sglang/srt/entrypoints/openai/protocol.py
index 1550c707a311..6f77d5d0a72f 100644
--- a/python/sglang/srt/entrypoints/openai/protocol.py
+++ b/python/sglang/srt/entrypoints/openai/protocol.py
@@ -588,12 +588,12 @@ class ChatCompletionRequest(BaseModel):
     return_hidden_states: bool = False
     return_routed_experts: bool = False
     return_cached_tokens_details: bool = False
-    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+    reasoning_effort: Optional[Literal["none", "low", "medium", "high"]] = Field(
         default="medium",
         description="Constrains effort on reasoning for reasoning models. "
-        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
-        "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
+        "'none' disables reasoning entirely, 'low' is the least effort, 'high' is the most effort. "
+        "Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning "
+        "in a response. 'none' sets thinking=False and enable_thinking=False in chat_template_kwargs.",
     )
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
@@ -677,7 +677,7 @@ def normalize_reasoning_inputs(cls, values: Dict):
 
         if isinstance(r, dict):
             effort = r.get("effort") or r.get("reasoning_effort")
-            if effort in {"low", "medium", "high"}:
+            if effort in {"none", "low", "medium", "high"}:
                 values["reasoning_effort"] = effort
 
             enabled = (
@@ -696,6 +696,31 @@ def normalize_reasoning_inputs(cls, values: Dict):
 
         return values
 
+    @model_validator(mode="before")
+    @classmethod
+    def disable_thinking_for_none_effort(cls, values):
+        """Turn reasoning off when ``reasoning_effort`` is ``"none"``.
+
+        Sets both ``thinking`` and ``enable_thinking`` to ``False`` in
+        ``chat_template_kwargs`` (replacing it with a fresh dict when it is
+        missing or not a dict) and forces ``separate_reasoning`` to ``False``.
+        """
+        # NOTE(review): only the top-level ``reasoning_effort`` is checked here.
+        # Pydantic applies mode="before" model validators in reverse definition
+        # order, so confirm this still fires when "none" arrives via the
+        # nested dict handled by normalize_reasoning_inputs.
+        if values.get("reasoning_effort") == "none":
+            ctk = values.get("chat_template_kwargs")
+            if not isinstance(ctk, dict):
+                ctk = {}
+            # The request field description advertises ``enable_thinking``, so
+            # clear that key as well as ``thinking``.
+            ctk["thinking"] = False
+            ctk["enable_thinking"] = False
+            values["chat_template_kwargs"] = ctk
+            values["separate_reasoning"] = False
+        return values
+
     @model_validator(mode="before")
     @classmethod
     def set_json_schema(cls, values):