sgl-project · Javtor · Mar 13, 2026
@@ -60,7 +60,7 @@ def get_encoding():
 
 def get_system_message(
     model_identity: Optional[str] = None,
-    reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
+    reasoning_effort: Optional[Literal["none", "high", "medium", "low"]] = None,
     start_date: Optional[str] = None,
     browser_description: Optional[str] = None,
     python_description: Optional[str] = None,
@@ -69,6 +69,8 @@ def get_system_message(
     if model_identity is not None:
         sys_msg_content = sys_msg_content.with_model_identity(model_identity)
     if reasoning_effort is not None:
+        if reasoning_effort == "none":
+            raise ValueError("Harmony does not support reasoning_effort='none'")
         sys_msg_content = sys_msg_content.with_reasoning_effort(
             REASONING_EFFORT[reasoning_effort]
         )

@@ -588,12 +588,12 @@ class ChatCompletionRequest(BaseModel):
     return_hidden_states: bool = False
     return_routed_experts: bool = False
     return_cached_tokens_details: bool = False
-    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+    reasoning_effort: Optional[Literal["none", "low", "medium", "high"]] = Field(
         default="medium",
         description="Constrains effort on reasoning for reasoning models. "
-        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
-        "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
+        "'none' disables reasoning entirely, 'low' is the least effort, 'high' is the most effort. "
+        "Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning "
+        "in a response. 'none' sets enable_thinking=false in chat_template_kwargs.",
     )
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
@@ -677,7 +677,7 @@ def normalize_reasoning_inputs(cls, values: Dict):
 
         if isinstance(r, dict):
             effort = r.get("effort") or r.get("reasoning_effort")
-            if effort in {"low", "medium", "high"}:
+            if effort in {"none", "low", "medium", "high"}:
                 values["reasoning_effort"] = effort
 
             enabled = (
@@ -696,6 +696,18 @@ def normalize_reasoning_inputs(cls, values: Dict):
 
         return values
 
+    @model_validator(mode="before")
+    @classmethod
+    def disable_thinking_for_none_effort(cls, values):
+        """Turn thinking off when the request asks for ``reasoning_effort="none"``.
+
+        Runs on the raw input before field validation. When the top-level
+        ``reasoning_effort`` key equals ``"none"``, ``chat_template_kwargs`` is
+        given ``thinking=False`` and ``enable_thinking=False`` and
+        ``separate_reasoning`` is forced to ``False``. Any other input is
+        returned untouched.
+
+        Args:
+            values: Raw request payload as handed over by pydantic.
+
+        Returns:
+            The same payload object, with the keys above overwritten when the
+            effort is ``"none"``.
+        """
+        # A "before" validator sees whatever the caller passed in; only a
+        # dict payload can be inspected and rewritten here.
+        if not isinstance(values, dict):
+            return values
+        # NOTE(review): only the top-level key is checked. If a nested
+        # `reasoning.effort` of "none" must also be honored, confirm this
+        # validator runs after normalize_reasoning_inputs.
+        if values.get("reasoning_effort") != "none":
+            return values
+
+        # Work on a copy so a dict object shared with the caller is not
+        # modified behind its back.
+        existing = values.get("chat_template_kwargs")
+        template_kwargs = dict(existing) if isinstance(existing, dict) else {}
+        template_kwargs["thinking"] = False
+        # The reasoning_effort field description documents `enable_thinking`
+        # as the switch, so set it alongside `thinking`.
+        # NOTE(review): which key a given chat template reads is not visible
+        # here - confirm against the templates in use.
+        template_kwargs["enable_thinking"] = False
+        values["chat_template_kwargs"] = template_kwargs
+        values["separate_reasoning"] = False
+        return values
+
     @model_validator(mode="before")
     @classmethod
     def set_json_schema(cls, values):