diff --git a/docs/references/deepseek.rst b/docs/references/deepseek.rst index b45383a4b3b4..f77fe1ba78ac 100644 --- a/docs/references/deepseek.rst +++ b/docs/references/deepseek.rst @@ -4,3 +4,4 @@ Multi-Node Deployment :maxdepth: 1 deepseek.md + reasoning_parser.md diff --git a/docs/references/reasoning_parser.md b/docs/references/reasoning_parser.md new file mode 100644 index 000000000000..bce419759187 --- /dev/null +++ b/docs/references/reasoning_parser.md @@ -0,0 +1,138 @@ +# Reasoning Parser + +SGLang supports parsing reasoning content out from "normal" content for reasoning models such as [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1). + +The contract follows the [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model) established with the release of DeepSeek-R1: + +- `reasoning_content`: The content of the CoT. +- `content`: The content of the final answer. + +## Supported Models + +Currently, SGLang supports the following reasoning models: + +- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags. + +## Usage + +There are two ways to enable reasoning parsing: + +1) Enable the reasoning parser when starting the SGLang Server by setting the `--enable-reasoning` and `--reasoning-parser` options. The `--reasoning-parser` option specifies the reasoning parser to extract the reasoning content and final answer. + +```bash +python -m sglang.launch_server --host 0.0.0.0 \ +--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ +--enable-reasoning --reasoning-parser deepseek-r1 +``` + +2) Specify on a per-request basis by setting the `separate_reasoning` body field on a `/chat/completions` request.
+ +```bash +curl -X POST -H "Content-Type: application/json" \ +-d '{"messages":[{"role":"user","content":"Compute 1+3"}],"max_tokens":100,"model":"deepseek-r1","stream":true,"separate_reasoning":true}' http://0.0.0.0:30000/v1/chat/completions +``` + +There is another body param which can be set to buffer the reasoning traces to be sent in one chunk after the closing `` tag, `"stream_reasoning": false`. + +### Non-streaming Request + +Make a request to the reasoning model, get the reasoning content and final answer. + +Using OpenAI python api: +```python +import openai + +client = openai.Client(base_url="http://localhost:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="deepseek-r1:14b", + messages=[{"role": "user", "content": "Compute 1+3"}], + max_tokens=1024, + stream=False +) + +response.choices[0].message.reasoning_content +# 'First, I recognize that the problem requires adding the numbers 1 and 3.\n\nNext, I identify the numbers to be added, which are 1 and 3.\n\nThen, I perform the addition operation: 1 plus 3 equals 4.\n\nFinally, I conclude that the sum of 1 and 3 is 4.\n' +response.choices[0].message.content +# \n\nTo compute \\(1 + 3\\), follow these simple steps:\n\n1. **Identify the numbers to add:** \n The numbers are **1** and **3**.\n\n2. **Add the numbers together:** \n \\[\n 1 + 3 = 4\n \\]\n\n3. **Write the final answer:** \n The sum of \\(1 + 3\\) is \\(\\boxed{4}\\).' +``` + +### Streaming Request + +`reasoning_content` is available in the `delta` field of the streaming response. + +Using OpenAI python api: + +```python +# ... Initialize the client as before ... 
+ +response = client.chat.completions.create( + model="deepseek-r1:14b", + messages=[{"role": "user", "content": "Compute 1+3"}], + max_tokens=1024, + stream=True +) +reasoning_content = "" +content = "" +for chunk in response: + if chunk.choices[0].delta.content: + content += chunk.choices[0].delta.content + elif chunk.choices[0].delta.reasoning_content: + reasoning_content += chunk.choices[0].delta.reasoning_content + +reasoning_content +# 'I need to calculate the sum of 1 and 3. \n\nFirst, I identify the numbers involved in the addition: 1 and 3.\n\nNext, I add these two numbers together to find the total.\n\nFinally, the result of the addition is 4.\n' +content +# '\n\n**Solution:**\n\nWe need to compute the sum of 1 and 3.\n\n1. **Identify the numbers to add:**\n - Number 1\n - Number 3\n\n2. **Add the numbers together:**\n \\[\n 1 + 3 = 4\n \\]\n\n3. **Final Answer:**\n \\[\n \\boxed{4}\n \\]' +``` + + +## Supporting New Reasoning Models + +For future reasoning models, you can implement the reasoning parser as a subclass of `BaseReasoningParser` in `python/sglang/srt/reasoning_parser.py`. + +```python +class BaseReasoningParser: + """Base class for reasoning parser.""" + + def __init__(self): + self._buffer = "" + + def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: + """Detect and parse the text, return reasoning_content and content.""" + raise NotImplementedError + + def parse_streaming_increment( + self, new_text: str + ) -> Tuple[Optional[str], Optional[str]]: + """Parse the new text incrementally, return reasoning_content and content.""" + raise NotImplementedError +``` + +And specify the reasoning parser for new reasoning models accordingly. 
+ +```python +class ReasoningParser: + """Reasoning parser for different reasoning models.""" + + # Specify the reasoning parser for each reasoning model here + ReasoningParserDict: Dict[str, Type[BaseReasoningParser]] = { + "deepseek-r1": DeepSeekR1ReasoningParser + } + + def __init__(self, reasoning_parser: str): + self.parser = self.ReasoningParserDict[reasoning_parser]() + + def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]: + """ + Non-streaming parsing for reasoning models. + Return: reasoning_content, content + """ + return self.parser.detect_and_parse(full_text) + + def parse_stream_chunk(self, chunk_text: str): + """ + Streaming parsing for reasoning models. + Return: reasoning_content, content + """ + return self.parser.parse_streaming_increment(chunk_text) +``` diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 7c385d40b623..a3f93a3ecfcf 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -74,6 +74,7 @@ TopLogprob, UsageInfo, ) +from sglang.srt.reasoning_parser import ReasoningParser from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -1045,7 +1046,12 @@ def v1_chat_generate_request( def v1_chat_generate_response( - request, ret, to_file=False, cache_report=False, tool_call_parser=None + request, + ret, + to_file=False, + cache_report=False, + tool_call_parser=None, + reasoning_parser=None, ): choices = [] @@ -1099,9 +1105,26 @@ def v1_chat_generate_response( if isinstance(request, list): tool_choice = request[idx].tool_choice tools = request[idx].tools + separate_reasoning = request[idx].separate_reasoning else: tool_choice = request.tool_choice tools = request.tools + separate_reasoning = request.separate_reasoning + + if reasoning_parser and separate_reasoning: + try: + parser = ReasoningParser(reasoning_parser, True) + parse_result = parser.parse_non_stream(text) + text = 
parse_result.normal_text #! text can not be None + reasoning_text = parse_result.reasoning_text + except Exception as e: + logger.error(f"Exception: {e}") + return create_error_response( + HTTPStatus.BAD_REQUEST, + "Failed to parse reasoning related info to json format!", + ) + else: + reasoning_text = None if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]): if finish_reason == "stop": @@ -1131,8 +1154,9 @@ def v1_chat_generate_response( "index": 0, "message": { "role": "assistant", - "content": ret_item["text"] if tool_calls is None else None, + "content": text if tool_calls is None else None, "tool_calls": tool_calls, + "reasoning_content": reasoning_text, }, "logprobs": choice_logprobs, "finish_reason": (finish_reason["type"] if finish_reason else ""), @@ -1147,8 +1171,9 @@ def v1_chat_generate_response( index=idx, message=ChatMessage( role="assistant", - content=ret_item["text"] if tool_calls is None else None, + content=text if tool_calls is None else None, tool_calls=tool_calls, + reasoning_content=reasoning_text, ), logprobs=choice_logprobs, finish_reason=(finish_reason["type"] if finish_reason else ""), @@ -1215,6 +1240,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request): if adapted_request.stream: parser_dict = {} + reasoning_parser_dict = {} async def generate_stream_resp(): is_firsts = {} @@ -1281,15 +1307,27 @@ async def generate_stream_resp(): choice_logprobs = None finish_reason = content["meta_info"]["finish_reason"] + finish_reason_type = ( + finish_reason["type"] if finish_reason else None + ) if is_first: # First chunk with role is_first = False + if ( + tokenizer_manager.server_args.reasoning_parser + and request.separate_reasoning + ): + delta = DeltaMessage(role="assistant", reasoning_content="") + else: + delta = DeltaMessage(role="assistant", content="") choice_data = ChatCompletionResponseStreamChoice( index=index, - delta=DeltaMessage(role="assistant", content=""), + delta=delta, finish_reason=( - 
finish_reason["type"] if finish_reason else "" + None + if finish_reason_type and len(finish_reason_type) == 0 + else finish_reason_type ), matched_stop=( finish_reason["matched"] @@ -1309,6 +1347,42 @@ async def generate_stream_resp(): delta = text[len(stream_buffer) :] new_stream_buffer = stream_buffer + delta + if ( + tokenizer_manager.server_args.reasoning_parser + and request.separate_reasoning + ): + if index not in reasoning_parser_dict: + reasoning_parser_dict[index] = ReasoningParser( + tokenizer_manager.server_args.reasoning_parser, + request.stream_reasoning, + ) + reasoning_parser = reasoning_parser_dict[index] + parse_result = reasoning_parser.parse_stream_chunk(delta) + if parse_result.reasoning_text: + choice_data = ChatCompletionResponseStreamChoice( + index=index, + delta=DeltaMessage( + reasoning_content=parse_result.reasoning_text + ), + finish_reason=( + None + if finish_reason_type + and len(finish_reason_type) == 0 + else finish_reason_type + ), + ) + chunk = ChatCompletionStreamResponse( + id=content["meta_info"]["id"], + choices=[choice_data], + model=request.model, + ) + yield f"data: {chunk.model_dump_json()}\n\n" + delta = parse_result.normal_text + if (delta and len(delta) == 0) or not delta: + stream_buffers[index] = new_stream_buffer + is_firsts[index] = is_first + continue + if request.tool_choice != "none" and request.tools: if index not in parser_dict: parser_dict[index] = FunctionCallParser( @@ -1326,7 +1400,10 @@ async def generate_stream_resp(): index=index, delta=DeltaMessage(content=normal_text), finish_reason=( - finish_reason["type"] if finish_reason else "" + None + if finish_reason_type + and len(finish_reason_type) == 0 + else finish_reason_type ), ) chunk = ChatCompletionStreamResponse( @@ -1395,7 +1472,9 @@ async def generate_stream_resp(): index=index, delta=DeltaMessage(content=delta), finish_reason=( - finish_reason["type"] if finish_reason else "" + None + if finish_reason_type and len(finish_reason_type) == 0 + else 
finish_reason_type ), matched_stop=( finish_reason["matched"] @@ -1463,6 +1542,7 @@ async def generate_stream_resp(): ret, cache_report=tokenizer_manager.server_args.enable_cache_report, tool_call_parser=tokenizer_manager.server_args.tool_call_parser, + reasoning_parser=tokenizer_manager.server_args.reasoning_parser, ) return response diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index 5f1ba431abd5..0c0aa09619cc 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -336,6 +336,8 @@ class ChatCompletionRequest(BaseModel): skip_special_tokens: bool = True lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None session_params: Optional[Dict] = None + separate_reasoning: bool = True + stream_reasoning: bool = True class FunctionResponse(BaseModel): @@ -356,6 +358,7 @@ class ToolCall(BaseModel): class ChatMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None + reasoning_content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) @@ -379,6 +382,7 @@ class ChatCompletionResponse(BaseModel): class DeltaMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None + reasoning_content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py new file mode 100644 index 000000000000..b13ae8392331 --- /dev/null +++ b/python/sglang/srt/reasoning_parser.py @@ -0,0 +1,158 @@ +import re +from typing import Dict + +REASONING_MODELS = ["deepseek-r1"] + + +class StreamingParseResult: + """Result of streaming incremental parsing.""" + + def __init__(self, normal_text: str = "", reasoning_text: str = ""): + self.normal_text = normal_text + self.reasoning_text = reasoning_text + + +class BaseReasoningFormatDetector: + """Base class providing two sets of 
interfaces: one-time and streaming incremental.""" + + def __init__( + self, + think_start_token: str, + think_end_token: str, + force_reasoning: bool = False, + stream_reasoning: bool = False, + ): + self.think_start_token = think_start_token + self.think_end_token = think_end_token + self._in_reasoning = force_reasoning + self.stream_reasoning = stream_reasoning + + self._buffer = "" + self._current_reasoning = "" + self.stripped_think_start = False + + def detect_and_parse(self, text: str) -> StreamingParseResult: + """ + One-time parsing: Detects and parses reasoning sections in the provided text. + Returns both reasoning content and normal text separately. + """ + text = text.replace(self.think_start_token, "").strip() + if self.think_end_token not in text: + # Assume reasoning was truncated before `</think>` token + return StreamingParseResult(reasoning_text=text) + + # Extract reasoning content + splits = text.split(self.think_end_token, maxsplit=1) + reasoning_text = splits[0] + text = splits[1].strip() + + return StreamingParseResult(normal_text=text, reasoning_text=reasoning_text) + + def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: + """ + Streaming incremental parsing for reasoning content. + Handles partial reasoning tags and content.
+ + If stream_reasoning is False: + Accumulates reasoning content until the end tag is found + If stream_reasoning is True: + Streams reasoning content as it arrives + """ + self._buffer += new_text + current_text = self._buffer + + # Strip `` token if present + if not self.stripped_think_start and current_text.find(self.think_start_token): + current_text = current_text.replace(self.think_start_token, "") + self.stripped_think_start = True + + # Handle end of reasoning block + if self._in_reasoning and self.think_end_token in current_text: + end_idx = current_text.find(self.think_end_token) + + if self.stream_reasoning: + # Just return the final chunk before the end tag + reasoning_text = current_text[:end_idx] + else: + # Return accumulated reasoning plus final chunk + reasoning_text = self._current_reasoning + current_text[:end_idx] + + self._in_reasoning = False + self._current_reasoning = "" + normal_text = current_text[end_idx + len(self.think_end_token) :] + self._buffer = "" + + return StreamingParseResult( + normal_text=normal_text, reasoning_text=reasoning_text.rstrip() + ) + + # Continue with reasoning content + if self._in_reasoning: + if self.stream_reasoning: + # Stream the content immediately + self._buffer = "" + return StreamingParseResult(reasoning_text=current_text) + else: + # Accumulate content but don't return it yet + self._current_reasoning += current_text + return StreamingParseResult() + + # If we're not in a reasoning block return as normal text + if not self._in_reasoning: + self._buffer = "" + return StreamingParseResult(normal_text=new_text) + + return StreamingParseResult() + + +class DeepSeekR1Detector(BaseReasoningFormatDetector): + """ + Detector for DeepSeek-R1 model. + Assumes reasoning format: + ()*(.*) + Returns all the text before the tag as `reasoning_text` + and the rest of the text as `normal_text`. + + Args: + stream_reasoning (bool): If False, accumulates reasoning content until the end tag. 
+ If True, streams reasoning content as it arrives. + """ + + def __init__(self, stream_reasoning: bool = False): + # DeepSeek-R1 is assumed to be reasoning until `</think>` token + super().__init__("<think>", "</think>", True, stream_reasoning=stream_reasoning) + # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599 + + +class ReasoningParser: + """ + Parser that handles both streaming and non-streaming scenarios for extracting + reasoning content from model outputs. + + Args: + model_type (str): Type of model to parse reasoning from + stream_reasoning (bool): If False, accumulates reasoning content until complete. + If True, streams reasoning content as it arrives. + """ + + DetectorMap: Dict[str, BaseReasoningFormatDetector] = { + "deepseek-r1": DeepSeekR1Detector + } + + def __init__(self, model_type: str = None, stream_reasoning: bool = True): + if not model_type: + raise ValueError("Model type must be specified") + + detector_class = self.DetectorMap.get(model_type.lower()) + if not detector_class: + raise ValueError(f"Unsupported model type: {model_type}") + + self.detector = detector_class(stream_reasoning=stream_reasoning) + + def parse_non_stream(self, full_text: str) -> StreamingParseResult: + """Non-streaming call: one-time parsing""" + return self.detector.detect_and_parse(full_text) + + def parse_stream_chunk(self, chunk_text: str) -> StreamingParseResult: + """Streaming call: incremental parsing""" + return self.detector.parse_streaming_increment(chunk_text) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index c62a3dbdac34..76ea63b16cd0 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,6 +23,7 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file +from sglang.srt.reasoning_parser import REASONING_MODELS from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -96,6 +97,7 @@ class ServerArgs: api_key: Optional[str] = None
file_storage_pth: str = "sglang_storage" enable_cache_report: bool = False + reasoning_parser: Optional[str] = None # Data parallelism dp_size: int = 1 @@ -613,6 +615,13 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.", ) + parser.add_argument( + "--reasoning-parser", + type=str, + choices=REASONING_MODELS, + default=ServerArgs.reasoning_parser, + help=f"Specify the parser for reasoning models, supported parsers are: {REASONING_MODELS}.", + ) # Data parallelism parser.add_argument( diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 3dc1ae347372..b1d88a8c49de 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -34,6 +34,7 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct" DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" +DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b02bbec56a46..d5fd6cbc132e 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -57,6 +57,7 @@ "test_w8a8_quantization.py", "test_fp8_kernel.py", "test_block_int8.py", + "test_reasoning_content.py", ], "nightly": [ "test_nightly_gsm8k_eval.py", diff --git a/test/srt/test_reasoning_content.py b/test/srt/test_reasoning_content.py new file mode 100644 index 000000000000..d2f34a35c10f --- /dev/null 
+++ b/test/srt/test_reasoning_content.py @@ -0,0 +1,180 @@ +""" +Usage: +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true_stream_reasoning_false +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_true +python3 -m unittest test_reasoning_content.TestReasoningContentStartup.test_nonstreaming +python3 -m unittest test_reasoning_content.TestReasoningContentStartup.test_streaming +""" + +import json +import unittest + +import openai +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_REASONING_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestReasoningContentAPI(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-1234" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, + other_args=[ + "--reasoning-parser", + "deepseek-r1", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_streaming_separate_reasoning_false(self): + # Test streaming with separate_reasoning=False, reasoning_content should be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+3?", + } + ], + "max_tokens": 100, + "stream": True, + "separate_reasoning": False, + } + response = 
client.chat.completions.create(**payload) + + assert response.status_code == 200 + reasoning_content = "" + content = "" + for chunk in response: + if chunk.choices[0].delta.content: + content += chunk.choices[0].delta.content + elif chunk.choices[0].delta.reasoning_content: + reasoning_content += chunk.choices[0].delta.reasoning_content + + assert len(reasoning_content) == 0 + assert len(content) > 0 + + def test_streaming_separate_reasoning_true(self): + # Test streaming with separate_reasoning=True, reasoning_content should not be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+3?", + } + ], + "max_tokens": 100, + "stream": True, + "separate_reasoning": True, + } + response = client.chat.completions.create(**payload) + + assert response.status_code == 200 + reasoning_content = "" + content = "" + for chunk in response: + if chunk.choices[0].delta.content: + content += chunk.choices[0].delta.content + elif chunk.choices[0].delta.reasoning_content: + reasoning_content += chunk.choices[0].delta.reasoning_content + + assert len(reasoning_content) > 0 + assert len(content) > 0 + + def test_streaming_separate_reasoning_true_stream_reasoning_false(self): + # Test streaming with separate_reasoning=True, reasoning_content should not be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+3?", + } + ], + "max_tokens": 100, + "stream": True, + "separate_reasoning": True, + "stream_reasoning": False, + } + response = client.chat.completions.create(**payload) + + assert response.status_code == 200 + reasoning_content = "" + content = "" + first_chunk = False + for chunk in response: + if chunk.choices[0].delta.content: + first_chunk = True + content += chunk.choices[0].delta.content + reasoning_content = 
chunk.choices[0].delta.reasoning_content + if not first_chunk: + assert len(chunk.choices[0].delta.reasoning_content) == 0 + assert len(reasoning_content) > 0 + assert len(content) > 0 + + def test_nonstreaming_separate_reasoning_false(self): + # Test non-streaming with separate_reasoning=False, reasoning_content should be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+3?", + } + ], + "max_tokens": 100, + "separate_reasoning": False, + } + response = client.chat.completions.create(**payload) + + assert response.status_code == 200 + assert len(response.choices[0].message.reasoning_content) == 0 + assert len(response.choices[0].message.content) > 0 + + def test_nonstreaming_separate_reasoning_true(self): + # Test non-streaming with separate_reasoning=True, reasoning_content should not be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+3?", + } + ], + "max_tokens": 100, + "separate_reasoning": True, + } + response = client.chat.completions.create(**payload) + + assert response.status_code == 200 + assert len(response.choices[0].message.reasoning_content) > 0 + assert len(response.choices[0].message.content) > 0 + + +if __name__ == "__main__": + unittest.main()