diff --git a/docs/references/deepseek.rst b/docs/references/deepseek.rst
index b45383a4b3b4..f77fe1ba78ac 100644
--- a/docs/references/deepseek.rst
+++ b/docs/references/deepseek.rst
@@ -4,3 +4,4 @@ Multi-Node Deployment
:maxdepth: 1
deepseek.md
+ reasoning_parser.md
diff --git a/docs/references/reasoning_parser.md b/docs/references/reasoning_parser.md
new file mode 100644
index 000000000000..bce419759187
--- /dev/null
+++ b/docs/references/reasoning_parser.md
@@ -0,0 +1,138 @@
+# Reasoning Parser
+
+SGLang supports parsing reasoning content out from "normal" content for reasoning models such as [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1).
+
+The contract follows the [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model) established with the release of DeepSeek-R1:
+
+- `reasoning_content`: The content of the CoT.
+- `content`: The content of the final answer.
+
+## Supported Models
+
+Currently, SGLang supports the following reasoning models:
+- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.
+
+## Usage
+
+There are two ways to enable reasoning parsing:
+
+1) Enable the reasoning parser when starting the SGLang Server by setting the `--enable-reasoning` and `--reasoning-parser` options. The `--reasoning-parser` option specifies the reasoning parser to extract the reasoning content and final answer.
+
+```bash
+python -m sglang.launch_server --host 0.0.0.0 \
+--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
+--enable-reasoning --reasoning-parser deepseek-r1
+```
+
+2) Specify on a per-request basis by setting the `separate_reasoning` body field on a `/chat/completions` request.
+
+```bash
+curl -X POST -H "Content-Type: application/json" \
+-d '{"messages":[{"role":"user","content":"Compute 1+3"}],"max_tokens":100,"model":"deepseek-r1","stream":true,"separate_reasoning":true}' http://0.0.0.0:30000/v1/chat/completions
+```
+
+There is another body param which can be set to buffer the reasoning traces to be sent in one chunk after the closing `</think>` tag, `"stream_reasoning": false`.
+
+### Non-streaming Request
+
+Make a request to the reasoning model, get the reasoning content and final answer.
+
+Using OpenAI python api:
+```python
+import openai
+
+client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")
+
+response = client.chat.completions.create(
+ model="deepseek-r1:14b",
+ messages=[{"role": "user", "content": "Compute 1+3"}],
+ max_tokens=1024,
+ stream=False
+)
+
+response.choices[0].message.reasoning_content
+# 'First, I recognize that the problem requires adding the numbers 1 and 3.\n\nNext, I identify the numbers to be added, which are 1 and 3.\n\nThen, I perform the addition operation: 1 plus 3 equals 4.\n\nFinally, I conclude that the sum of 1 and 3 is 4.\n'
+response.choices[0].message.content
+# \n\nTo compute \\(1 + 3\\), follow these simple steps:\n\n1. **Identify the numbers to add:** \n The numbers are **1** and **3**.\n\n2. **Add the numbers together:** \n \\[\n 1 + 3 = 4\n \\]\n\n3. **Write the final answer:** \n The sum of \\(1 + 3\\) is \\(\\boxed{4}\\).'
+```
+
+### Streaming Request
+
+`reasoning_content` is available in the `delta` field of the streaming response.
+
+Using OpenAI python api:
+
+```python
+# ... Initialize the client as before ...
+
+response = client.chat.completions.create(
+ model="deepseek-r1:14b",
+ messages=[{"role": "user", "content": "Compute 1+3"}],
+ max_tokens=1024,
+ stream=True
+)
+reasoning_content = ""
+content = ""
+for chunk in response:
+ if chunk.choices[0].delta.content:
+ content += chunk.choices[0].delta.content
+ elif chunk.choices[0].delta.reasoning_content:
+ reasoning_content += chunk.choices[0].delta.reasoning_content
+
+reasoning_content
+# 'I need to calculate the sum of 1 and 3. \n\nFirst, I identify the numbers involved in the addition: 1 and 3.\n\nNext, I add these two numbers together to find the total.\n\nFinally, the result of the addition is 4.\n'
+content
+# '\n\n**Solution:**\n\nWe need to compute the sum of 1 and 3.\n\n1. **Identify the numbers to add:**\n - Number 1\n - Number 3\n\n2. **Add the numbers together:**\n \\[\n 1 + 3 = 4\n \\]\n\n3. **Final Answer:**\n \\[\n \\boxed{4}\n \\]'
+```
+
+
+## Supporting New Reasoning Models
+
+For future reasoning models, you can implement the reasoning parser as a subclass of `BaseReasoningParser` in `python/sglang/srt/reasoning_parser.py`.
+
+```python
+class BaseReasoningParser:
+ """Base class for reasoning parser."""
+
+ def __init__(self):
+ self._buffer = ""
+
+ def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
+ """Detect and parse the text, return reasoning_content and content."""
+ raise NotImplementedError
+
+ def parse_streaming_increment(
+ self, new_text: str
+ ) -> Tuple[Optional[str], Optional[str]]:
+ """Parse the new text incrementally, return reasoning_content and content."""
+ raise NotImplementedError
+```
+
+And specify the reasoning parser for new reasoning models accordingly.
+
+```python
+class ReasoningParser:
+ """Reasoning parser for different reasoning models."""
+
+ # Specify the reasoning parser for each reasoning model here
+ ReasoningParserDict: Dict[str, Type[BaseReasoningParser]] = {
+ "deepseek-r1": DeepSeekR1ReasoningParser
+ }
+
+ def __init__(self, reasoning_parser: str):
+ self.parser = self.ReasoningParserDict[reasoning_parser]()
+
+ def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
+ """
+ Non-streaming parsing for reasoning models.
+ Return: reasoning_content, content
+ """
+ return self.parser.detect_and_parse(full_text)
+
+ def parse_stream_chunk(self, chunk_text: str):
+ """
+ Streaming parsing for reasoning models.
+ Return: reasoning_content, content
+ """
+ return self.parser.parse_streaming_increment(chunk_text)
+```
diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 7c385d40b623..a3f93a3ecfcf 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -74,6 +74,7 @@
TopLogprob,
UsageInfo,
)
+from sglang.srt.reasoning_parser import ReasoningParser
from sglang.utils import get_exception_traceback
logger = logging.getLogger(__name__)
@@ -1045,7 +1046,12 @@ def v1_chat_generate_request(
def v1_chat_generate_response(
- request, ret, to_file=False, cache_report=False, tool_call_parser=None
+ request,
+ ret,
+ to_file=False,
+ cache_report=False,
+ tool_call_parser=None,
+ reasoning_parser=None,
):
choices = []
@@ -1099,9 +1105,26 @@ def v1_chat_generate_response(
if isinstance(request, list):
tool_choice = request[idx].tool_choice
tools = request[idx].tools
+ separate_reasoning = request[idx].separate_reasoning
else:
tool_choice = request.tool_choice
tools = request.tools
+ separate_reasoning = request.separate_reasoning
+
+ if reasoning_parser and separate_reasoning:
+ try:
+ parser = ReasoningParser(reasoning_parser, True)
+ parse_result = parser.parse_non_stream(text)
+ text = parse_result.normal_text #! text can not be None
+ reasoning_text = parse_result.reasoning_text
+ except Exception as e:
+ logger.error(f"Exception: {e}")
+ return create_error_response(
+ HTTPStatus.BAD_REQUEST,
+ "Failed to parse reasoning related info to json format!",
+ )
+ else:
+ reasoning_text = None
if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
if finish_reason == "stop":
@@ -1131,8 +1154,9 @@ def v1_chat_generate_response(
"index": 0,
"message": {
"role": "assistant",
- "content": ret_item["text"] if tool_calls is None else None,
+ "content": text if tool_calls is None else None,
"tool_calls": tool_calls,
+ "reasoning_content": reasoning_text,
},
"logprobs": choice_logprobs,
"finish_reason": (finish_reason["type"] if finish_reason else ""),
@@ -1147,8 +1171,9 @@ def v1_chat_generate_response(
index=idx,
message=ChatMessage(
role="assistant",
- content=ret_item["text"] if tool_calls is None else None,
+ content=text if tool_calls is None else None,
tool_calls=tool_calls,
+ reasoning_content=reasoning_text,
),
logprobs=choice_logprobs,
finish_reason=(finish_reason["type"] if finish_reason else ""),
@@ -1215,6 +1240,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
if adapted_request.stream:
parser_dict = {}
+ reasoning_parser_dict = {}
async def generate_stream_resp():
is_firsts = {}
@@ -1281,15 +1307,27 @@ async def generate_stream_resp():
choice_logprobs = None
finish_reason = content["meta_info"]["finish_reason"]
+ finish_reason_type = (
+ finish_reason["type"] if finish_reason else None
+ )
if is_first:
# First chunk with role
is_first = False
+ if (
+ tokenizer_manager.server_args.reasoning_parser
+ and request.separate_reasoning
+ ):
+ delta = DeltaMessage(role="assistant", reasoning_content="")
+ else:
+ delta = DeltaMessage(role="assistant", content="")
choice_data = ChatCompletionResponseStreamChoice(
index=index,
- delta=DeltaMessage(role="assistant", content=""),
+ delta=delta,
finish_reason=(
- finish_reason["type"] if finish_reason else ""
+ None
+ if finish_reason_type and len(finish_reason_type) == 0
+ else finish_reason_type
),
matched_stop=(
finish_reason["matched"]
@@ -1309,6 +1347,42 @@ async def generate_stream_resp():
delta = text[len(stream_buffer) :]
new_stream_buffer = stream_buffer + delta
+ if (
+ tokenizer_manager.server_args.reasoning_parser
+ and request.separate_reasoning
+ ):
+ if index not in reasoning_parser_dict:
+ reasoning_parser_dict[index] = ReasoningParser(
+ tokenizer_manager.server_args.reasoning_parser,
+ request.stream_reasoning,
+ )
+ reasoning_parser = reasoning_parser_dict[index]
+ parse_result = reasoning_parser.parse_stream_chunk(delta)
+ if parse_result.reasoning_text:
+ choice_data = ChatCompletionResponseStreamChoice(
+ index=index,
+ delta=DeltaMessage(
+ reasoning_content=parse_result.reasoning_text
+ ),
+ finish_reason=(
+ None
+ if finish_reason_type
+ and len(finish_reason_type) == 0
+ else finish_reason_type
+ ),
+ )
+ chunk = ChatCompletionStreamResponse(
+ id=content["meta_info"]["id"],
+ choices=[choice_data],
+ model=request.model,
+ )
+ yield f"data: {chunk.model_dump_json()}\n\n"
+ delta = parse_result.normal_text
+ if (delta and len(delta) == 0) or not delta:
+ stream_buffers[index] = new_stream_buffer
+ is_firsts[index] = is_first
+ continue
+
if request.tool_choice != "none" and request.tools:
if index not in parser_dict:
parser_dict[index] = FunctionCallParser(
@@ -1326,7 +1400,10 @@ async def generate_stream_resp():
index=index,
delta=DeltaMessage(content=normal_text),
finish_reason=(
- finish_reason["type"] if finish_reason else ""
+ None
+ if finish_reason_type
+ and len(finish_reason_type) == 0
+ else finish_reason_type
),
)
chunk = ChatCompletionStreamResponse(
@@ -1395,7 +1472,9 @@ async def generate_stream_resp():
index=index,
delta=DeltaMessage(content=delta),
finish_reason=(
- finish_reason["type"] if finish_reason else ""
+ None
+ if finish_reason_type and len(finish_reason_type) == 0
+ else finish_reason_type
),
matched_stop=(
finish_reason["matched"]
@@ -1463,6 +1542,7 @@ async def generate_stream_resp():
ret,
cache_report=tokenizer_manager.server_args.enable_cache_report,
tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+ reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
)
return response
diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py
index 5f1ba431abd5..0c0aa09619cc 100644
--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -336,6 +336,8 @@ class ChatCompletionRequest(BaseModel):
skip_special_tokens: bool = True
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
session_params: Optional[Dict] = None
+ separate_reasoning: bool = True
+ stream_reasoning: bool = True
class FunctionResponse(BaseModel):
@@ -356,6 +358,7 @@ class ToolCall(BaseModel):
class ChatMessage(BaseModel):
role: Optional[str] = None
content: Optional[str] = None
+ reasoning_content: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
@@ -379,6 +382,7 @@ class ChatCompletionResponse(BaseModel):
class DeltaMessage(BaseModel):
role: Optional[str] = None
content: Optional[str] = None
+ reasoning_content: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
new file mode 100644
index 000000000000..b13ae8392331
--- /dev/null
+++ b/python/sglang/srt/reasoning_parser.py
@@ -0,0 +1,158 @@
+import re
+from typing import Dict
+
+REASONING_MODELS = ["deepseek-r1"]
+
+
+class StreamingParseResult:
+ """Result of streaming incremental parsing."""
+
+ def __init__(self, normal_text: str = "", reasoning_text: str = ""):
+ self.normal_text = normal_text
+ self.reasoning_text = reasoning_text
+
+
+class BaseReasoningFormatDetector:
+ """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+ def __init__(
+ self,
+ think_start_token: str,
+ think_end_token: str,
+ force_reasoning: bool = False,
+ stream_reasoning: bool = False,
+ ):
+ self.think_start_token = think_start_token
+ self.think_end_token = think_end_token
+ self._in_reasoning = force_reasoning
+ self.stream_reasoning = stream_reasoning
+
+ self._buffer = ""
+ self._current_reasoning = ""
+ self.stripped_think_start = False
+
+ def detect_and_parse(self, text: str) -> StreamingParseResult:
+ """
+ One-time parsing: Detects and parses reasoning sections in the provided text.
+ Returns both reasoning content and normal text separately.
+ """
+ text = text.replace(self.think_start_token, "").strip()
+ if self.think_end_token not in text:
+            # Assume reasoning was truncated before the `</think>` token
+ return StreamingParseResult(reasoning_text=text)
+
+ # Extract reasoning content
+ splits = text.split(self.think_end_token, maxsplit=1)
+ reasoning_text = splits[0]
+ text = splits[1].strip()
+
+ return StreamingParseResult(normal_text=text, reasoning_text=reasoning_text)
+
+ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+ """
+ Streaming incremental parsing for reasoning content.
+ Handles partial reasoning tags and content.
+
+ If stream_reasoning is False:
+ Accumulates reasoning content until the end tag is found
+ If stream_reasoning is True:
+ Streams reasoning content as it arrives
+ """
+ self._buffer += new_text
+ current_text = self._buffer
+
+        # Strip `<think>` token if present
+ if not self.stripped_think_start and current_text.find(self.think_start_token):
+ current_text = current_text.replace(self.think_start_token, "")
+ self.stripped_think_start = True
+
+ # Handle end of reasoning block
+ if self._in_reasoning and self.think_end_token in current_text:
+ end_idx = current_text.find(self.think_end_token)
+
+ if self.stream_reasoning:
+ # Just return the final chunk before the end tag
+ reasoning_text = current_text[:end_idx]
+ else:
+ # Return accumulated reasoning plus final chunk
+ reasoning_text = self._current_reasoning + current_text[:end_idx]
+
+ self._in_reasoning = False
+ self._current_reasoning = ""
+ normal_text = current_text[end_idx + len(self.think_end_token) :]
+ self._buffer = ""
+
+ return StreamingParseResult(
+ normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+ )
+
+ # Continue with reasoning content
+ if self._in_reasoning:
+ if self.stream_reasoning:
+ # Stream the content immediately
+ self._buffer = ""
+ return StreamingParseResult(reasoning_text=current_text)
+ else:
+ # Accumulate content but don't return it yet
+ self._current_reasoning += current_text
+ return StreamingParseResult()
+
+ # If we're not in a reasoning block return as normal text
+ if not self._in_reasoning:
+ self._buffer = ""
+ return StreamingParseResult(normal_text=new_text)
+
+ return StreamingParseResult()
+
+
+class DeepSeekR1Detector(BaseReasoningFormatDetector):
+ """
+ Detector for DeepSeek-R1 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the `</think>` tag as `reasoning_text`
+ and the rest of the text as `normal_text`.
+
+ Args:
+ stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+ If True, streams reasoning content as it arrives.
+ """
+
+ def __init__(self, stream_reasoning: bool = False):
+ # DeepSeek-R1 is assumed to be reasoning until `` token
+ super().__init__("", "", True, stream_reasoning=stream_reasoning)
+ # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
+
+
+class ReasoningParser:
+ """
+ Parser that handles both streaming and non-streaming scenarios for extracting
+ reasoning content from model outputs.
+
+ Args:
+ model_type (str): Type of model to parse reasoning from
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
+ If True, streams reasoning content as it arrives.
+ """
+
+ DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
+ "deepseek-r1": DeepSeekR1Detector
+ }
+
+ def __init__(self, model_type: str = None, stream_reasoning: bool = True):
+ if not model_type:
+ raise ValueError("Model type must be specified")
+
+ detector_class = self.DetectorMap.get(model_type.lower())
+ if not detector_class:
+ raise ValueError(f"Unsupported model type: {model_type}")
+
+ self.detector = detector_class(stream_reasoning=stream_reasoning)
+
+ def parse_non_stream(self, full_text: str) -> StreamingParseResult:
+ """Non-streaming call: one-time parsing"""
+ return self.detector.detect_and_parse(full_text)
+
+ def parse_stream_chunk(self, chunk_text: str) -> StreamingParseResult:
+ """Streaming call: incremental parsing"""
+ return self.detector.parse_streaming_increment(chunk_text)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index c62a3dbdac34..76ea63b16cd0 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -23,6 +23,7 @@
import torch
from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.reasoning_parser import REASONING_MODELS
from sglang.srt.utils import (
get_amdgpu_memory_capacity,
get_hpu_memory_capacity,
@@ -96,6 +97,7 @@ class ServerArgs:
api_key: Optional[str] = None
file_storage_pth: str = "sglang_storage"
enable_cache_report: bool = False
+ reasoning_parser: Optional[str] = None
# Data parallelism
dp_size: int = 1
@@ -613,6 +615,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
action="store_true",
help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
)
+ parser.add_argument(
+ "--reasoning-parser",
+ type=str,
+ choices=REASONING_MODELS,
+ default=ServerArgs.reasoning_parser,
+ help=f"Specify the parser for reasoning models, supported parsers are: {REASONING_MODELS}.",
+ )
# Data parallelism
parser.add_argument(
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 3dc1ae347372..b1d88a8c49de 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -34,6 +34,7 @@
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index b02bbec56a46..d5fd6cbc132e 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -57,6 +57,7 @@
"test_w8a8_quantization.py",
"test_fp8_kernel.py",
"test_block_int8.py",
+ "test_reasoning_content.py",
],
"nightly": [
"test_nightly_gsm8k_eval.py",
diff --git a/test/srt/test_reasoning_content.py b/test/srt/test_reasoning_content.py
new file mode 100644
index 000000000000..d2f34a35c10f
--- /dev/null
+++ b/test/srt/test_reasoning_content.py
@@ -0,0 +1,180 @@
+"""
+Usage:
+python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false
+python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true
+python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true_stream_reasoning_false
+python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false
+python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_true
+python3 -m unittest test_reasoning_content.TestReasoningContentStartup.test_nonstreaming
+python3 -m unittest test_reasoning_content.TestReasoningContentStartup.test_streaming
+"""
+
+import json
+import unittest
+
+import openai
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+ DEFAULT_REASONING_MODEL_NAME_FOR_TEST,
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+ DEFAULT_URL_FOR_TEST,
+ popen_launch_server,
+)
+
+
+class TestReasoningContentAPI(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST
+ cls.base_url = DEFAULT_URL_FOR_TEST
+ cls.api_key = "sk-1234"
+ cls.process = popen_launch_server(
+ cls.model,
+ cls.base_url,
+ timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+ api_key=cls.api_key,
+ other_args=[
+ "--reasoning-parser",
+ "deepseek-r1",
+ ],
+ )
+
+ @classmethod
+ def tearDownClass(cls):
+ kill_process_tree(cls.process.pid)
+
+ def test_streaming_separate_reasoning_false(self):
+ # Test streaming with separate_reasoning=False, reasoning_content should be empty
+ client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+ payload = {
+ "model": self.model,
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is 1+3?",
+ }
+ ],
+ "max_tokens": 100,
+ "stream": True,
+ "separate_reasoning": False,
+ }
+ response = client.chat.completions.create(**payload)
+
+ assert response.status_code == 200
+ reasoning_content = ""
+ content = ""
+ for chunk in response:
+ if chunk.choices[0].delta.content:
+ content += chunk.choices[0].delta.content
+ elif chunk.choices[0].delta.reasoning_content:
+ reasoning_content += chunk.choices[0].delta.reasoning_content
+
+ assert len(reasoning_content) == 0
+ assert len(content) > 0
+
+ def test_streaming_separate_reasoning_true(self):
+ # Test streaming with separate_reasoning=True, reasoning_content should not be empty
+ client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+ payload = {
+ "model": self.model,
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is 1+3?",
+ }
+ ],
+ "max_tokens": 100,
+ "stream": True,
+ "separate_reasoning": True,
+ }
+ response = client.chat.completions.create(**payload)
+
+ assert response.status_code == 200
+ reasoning_content = ""
+ content = ""
+ for chunk in response:
+ if chunk.choices[0].delta.content:
+ content += chunk.choices[0].delta.content
+ elif chunk.choices[0].delta.reasoning_content:
+ reasoning_content += chunk.choices[0].delta.reasoning_content
+
+ assert len(reasoning_content) > 0
+ assert len(content) > 0
+
+ def test_streaming_separate_reasoning_true_stream_reasoning_false(self):
+ # Test streaming with separate_reasoning=True, reasoning_content should not be empty
+ client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+ payload = {
+ "model": self.model,
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is 1+3?",
+ }
+ ],
+ "max_tokens": 100,
+ "stream": True,
+ "separate_reasoning": True,
+ "stream_reasoning": False,
+ }
+ response = client.chat.completions.create(**payload)
+
+ assert response.status_code == 200
+ reasoning_content = ""
+ content = ""
+ first_chunk = False
+ for chunk in response:
+ if chunk.choices[0].delta.content:
+ first_chunk = True
+ content += chunk.choices[0].delta.content
+ reasoning_content = chunk.choices[0].delta.reasoning_content
+ if not first_chunk:
+ assert len(chunk.choices[0].delta.reasoning_content) == 0
+ assert len(reasoning_content) > 0
+ assert len(content) > 0
+
+ def test_nonstreaming_separate_reasoning_false(self):
+ # Test non-streaming with separate_reasoning=False, reasoning_content should be empty
+ client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+ payload = {
+ "model": self.model,
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is 1+3?",
+ }
+ ],
+ "max_tokens": 100,
+ "separate_reasoning": False,
+ }
+ response = client.chat.completions.create(**payload)
+
+ assert response.status_code == 200
+ assert len(response.choices[0].message.reasoning_content) == 0
+ assert len(response.choices[0].message.content) > 0
+
+ def test_nonstreaming_separate_reasoning_true(self):
+ # Test non-streaming with separate_reasoning=True, reasoning_content should not be empty
+ client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+ payload = {
+ "model": self.model,
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is 1+3?",
+ }
+ ],
+ "max_tokens": 100,
+ "separate_reasoning": True,
+ }
+ response = client.chat.completions.create(**payload)
+
+ assert response.status_code == 200
+ assert len(response.choices[0].message.reasoning_content) > 0
+ assert len(response.choices[0].message.content) > 0
+
+
+if __name__ == "__main__":
+ unittest.main()