Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python/sglang/srt/entrypoints/harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_encoding():

def get_system_message(
model_identity: Optional[str] = None,
reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
reasoning_effort: Optional[Literal["none", "high", "medium", "low"]] = None,
start_date: Optional[str] = None,
browser_description: Optional[str] = None,
python_description: Optional[str] = None,
Expand All @@ -69,6 +69,8 @@ def get_system_message(
if model_identity is not None:
sys_msg_content = sys_msg_content.with_model_identity(model_identity)
if reasoning_effort is not None:
if reasoning_effort == "none":
raise ValueError("Harmony does not support reasoning_effort='none'")
sys_msg_content = sys_msg_content.with_reasoning_effort(
REASONING_EFFORT[reasoning_effort]
)
Expand Down
22 changes: 17 additions & 5 deletions python/sglang/srt/entrypoints/openai/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,12 +588,12 @@ class ChatCompletionRequest(BaseModel):
return_hidden_states: bool = False
return_routed_experts: bool = False
return_cached_tokens_details: bool = False
reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
reasoning_effort: Optional[Literal["none", "low", "medium", "high"]] = Field(
default="medium",
description="Constrains effort on reasoning for reasoning models. "
"'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
"result in faster responses and fewer tokens used on reasoning in a response. "
"Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
"'none' disables reasoning entirely, 'low' is the least effort, 'high' is the most effort. "
"Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning "
"in a response. 'none' sets enable_thinking=false in chat_template_kwargs.",
)

# Extra parameters for SRT backend only and will be ignored by OpenAI models.
Expand Down Expand Up @@ -677,7 +677,7 @@ def normalize_reasoning_inputs(cls, values: Dict):

if isinstance(r, dict):
effort = r.get("effort") or r.get("reasoning_effort")
if effort in {"low", "medium", "high"}:
if effort in {"none", "low", "medium", "high"}:
values["reasoning_effort"] = effort

enabled = (
Expand All @@ -696,6 +696,18 @@ def normalize_reasoning_inputs(cls, values: Dict):

return values

@model_validator(mode="before")
@classmethod
def disable_thinking_for_none_effort(cls, values):
    """Turn reasoning off when the request asks for ``reasoning_effort="none"``.

    Runs on the raw request payload before field validation. When the
    payload's ``reasoning_effort`` is ``"none"``, the thinking flags in
    ``chat_template_kwargs`` are forced to ``False`` and
    ``separate_reasoning`` is disabled. Any other payload is returned
    untouched.

    Args:
        values: The raw input handed to the model, normally a dict.

    Returns:
        ``values``, updated in place when reasoning has to be disabled.
    """
    # A "before" validator receives the unvalidated input, which is not
    # guaranteed to be a dict; leave anything else for pydantic to reject.
    if not isinstance(values, dict):
        return values
    if values.get("reasoning_effort") != "none":
        return values

    supplied_kwargs = values.get("chat_template_kwargs")
    # Work on a copy so the dict object passed in by the caller is not
    # modified behind their back; non-dict values are replaced outright.
    template_kwargs = dict(supplied_kwargs) if isinstance(supplied_kwargs, dict) else {}

    template_kwargs["thinking"] = False
    # The reasoning_effort field description promises enable_thinking=false,
    # so set that key as well as the original "thinking" key.
    # NOTE(review): confirm which of the two keys the served chat templates
    # actually read.
    template_kwargs["enable_thinking"] = False

    values["chat_template_kwargs"] = template_kwargs
    values["separate_reasoning"] = False
    return values

@model_validator(mode="before")
@classmethod
def set_json_schema(cls, values):
Expand Down
Loading