From ad623a05f69a9edda9597e662dc936c9b2127e0d Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Wed, 29 Jan 2025 01:48:48 +0000 Subject: [PATCH 01/27] Support `reasoning_content` in ChatCompletion choices like DeepSeek api. --- docker/Dockerfile | 11 +- python/sglang/srt/openai_api/adapter.py | 49 ++++++ python/sglang/srt/openai_api/protocol.py | 4 + python/sglang/srt/reasoning_parser.py | 187 +++++++++++++++++++++++ 4 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 python/sglang/srt/reasoning_parser.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 264397f851bf..65edc2576e09 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -22,10 +22,13 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN pip3 install datamodel_code_generator WORKDIR /sgl-workspace +ARG SGL_REPO="https://github.com/sgl-project/sglang" +ENV SGL_DEFAULT="main" +ARG SGL_BRANCH=${SGL_DEFAULT} ARG CUDA_VERSION RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ - && git clone --depth=1 https://github.com/sgl-project/sglang.git \ + && git clone ${SGL_REPO} \ && if [ "$CUDA_VERSION" = "12.1.1" ]; then \ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu121; \ elif [ "$CUDA_VERSION" = "12.4.1" ]; then \ @@ -39,6 +42,12 @@ RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \ fi \ && cd sglang \ + && if [ "${SGL_BRANCH}" = ${SGL_DEFAULT} ]; then \ + echo "Using ${SGL_DEFAULT}, default branch."; \ + else \ + echo "Using ${SGL_BRANCH} branch."; \ + git checkout ${SGL_BRANCH}; \ + fi \ && if [ "$BUILD_TYPE" = "srt" ]; then \ if [ "$CUDA_VERSION" = "12.1.1" ]; then \ python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer/; \ diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 6687a4c0133e..d0b9a13cd1a1 100644 
--- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -41,6 +41,7 @@ register_conv_template, ) from sglang.srt.function_call_parser import TOOLS_TAG_LIST, FunctionCallParser +from sglang.srt.reasoning_parser import is_reasoning_model, ReasoningParser from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput from sglang.srt.openai_api.protocol import ( BatchRequest, @@ -1072,9 +1073,30 @@ def v1_chat_generate_response( if isinstance(request, list): tool_choice = request[idx].tool_choice tools = request[idx].tools + model = request[idx].model + stream_reasoning = request[idx].stream_reasoning + reasoning_parser = ReasoningParser() if is_reasoning_model(request[idx].model) else None else: tool_choice = request.tool_choice tools = request.tools + model = request.model + stream_reasoning = request.stream_reasoning + reasoning_parser = ReasoningParser() if is_reasoning_model(request.model) else None + + if reasoning_parser is not None: + try: + parser = ReasoningParser(model, True) + parse_result = parser.parse_non_stream(text) + ret_item["text"] = parse_result.normal_text + reasoning_text = parse_result.reasoning_text + except Exception as e: + logger.error(f"Exception: {e}") + return create_error_response( + HTTPStatus.BAD_REQUEST, + "Failed to parse reasoning related info to json format!", + ) + else: + reasoning_text = None if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]): if finish_reason == "stop": @@ -1115,6 +1137,8 @@ def v1_chat_generate_response( else None ), } + if reasoning_text: + choice_data["message"]["reasoning_content"] = reasoning_text else: choice_data = ChatCompletionResponseChoice( index=idx, @@ -1122,6 +1146,7 @@ def v1_chat_generate_response( role="assistant", content=ret_item["text"] if tool_calls is None else None, tool_calls=tool_calls, + reasoning_content=reasoning_text, ), logprobs=choice_logprobs, finish_reason=(finish_reason["type"] if finish_reason else ""), 
@@ -1188,6 +1213,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request): if adapted_request.stream: parser_dict = {} + reasoning_parser_dict = {} async def generate_stream_resp(): is_firsts = {} @@ -1282,6 +1308,29 @@ async def generate_stream_resp(): delta = text[len(stream_buffer) :] new_stream_buffer = stream_buffer + delta + if request.separate_reasoning and is_reasoning_model(request.model): + if index not in reasoning_parser_dict: + reasoning_parser_dict[index] = ReasoningParser( + request.model, request.stream_reasoning + ) + reasoning_parser = reasoning_parser_dict[index] + parse_result = reasoning_parser.parse_stream_chunk(delta) + if parse_result.reasoning_text: + choice_data = ChatCompletionResponseStreamChoice( + index=index, + delta=DeltaMessage(reasoning_content=parse_result.reasoning_text), + finish_reason=( + finish_reason["type"] if finish_reason else "" + ), + ) + chunk = ChatCompletionStreamResponse( + id=content["meta_info"]["id"], + choices=[choice_data], + model=request.model, + ) + yield f"data: {chunk.model_dump_json()}\n\n" + delta = parse_result.normal_text + if request.tool_choice != "none" and request.tools: if index not in parser_dict: parser_dict[index] = FunctionCallParser( diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index 95b34527edbc..e23886b7856c 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -324,6 +324,8 @@ class ChatCompletionRequest(BaseModel): skip_special_tokens: bool = True lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None session_params: Optional[Dict] = None + separate_reasoning: bool = False + stream_reasoning: bool = True class FunctionResponse(BaseModel): @@ -345,6 +347,7 @@ class ChatMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) + reasoning_content: Optional[str] = 
None class ChatCompletionResponseChoice(BaseModel): @@ -368,6 +371,7 @@ class DeltaMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) + reasoning_content: Optional[str] = None class ChatCompletionResponseStreamChoice(BaseModel): diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py new file mode 100644 index 000000000000..7501b7c8f50e --- /dev/null +++ b/python/sglang/srt/reasoning_parser.py @@ -0,0 +1,187 @@ +import re +from typing import Optional, Dict + +REASONING_MODELS = ["deepseek-r1"] + + +def is_reasoning_model(model_name: str) -> bool: + """Checks if the model is a reasoning model.""" + return model_name.lower() in REASONING_MODELS + + +class StreamingParseResult: + """Result of streaming incremental parsing.""" + def __init__(self, normal_text: str = "", reasoning_text: str = ""): + self.normal_text = normal_text + self.reasoning_text = reasoning_text + +class BaseReasoningFormatDetector: + """Base class providing two sets of interfaces: one-time and streaming incremental.""" + def __init__(self, accumulate_reasoning: bool = False): + self._buffer = "" + self._in_reasoning = False + self._current_reasoning = "" + self.accumulate_reasoning = accumulate_reasoning + + def detect_and_parse(self, text: str) -> StreamingParseResult: + """Parses the text in one go.""" + raise NotImplementedError + + def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: + """Streaming incremental parsing.""" + raise NotImplementedError + +class DeepSeekR1Detector(BaseReasoningFormatDetector): + """ + Detector for DeepSeek-R1 model. + Assumes reasoning format: + ... + Returns all the text within the tags as `reasoning_text` + and the rest of the text as `normal_text`. + + Args: + accumulate_reasoning (bool): If True, accumulates reasoning content until the end tag. + If False, streams reasoning content as it arrives. 
+ """ + def __init__(self, accumulate_reasoning: bool = False): + super().__init__(accumulate_reasoning=accumulate_reasoning) + self.think_start_token = "" + self.think_end_token = "" + self.reasoning_regex = re.compile( + rf"{self.think_start_token}(.*?){self.think_end_token}", + re.DOTALL + ) + + def detect_and_parse(self, text: str) -> StreamingParseResult: + """ + One-time parsing: Detects and parses reasoning sections in the provided text. + Returns both reasoning content and normal text separately. + """ + if self.think_start_token not in text or self.think_end_token not in text: + return StreamingParseResult(normal_text=text) + + # Extract reasoning content + reasoning_matches = self.reasoning_regex.findall(text) + if not reasoning_matches: + return StreamingParseResult(normal_text=text) + + reasoning_text = reasoning_matches[0] + + # Remove the reasoning section from the text to get normal_text + start_idx = text.find(self.think_start_token) + if start_idx != -1: + end_idx = start_idx + len( + f"{self.think_start_token}{reasoning_text}{self.think_end_token}" + ) + normal_text = text[:start_idx] + text[end_idx:] + normal_text = normal_text.strip() + + return StreamingParseResult( + normal_text=normal_text if normal_text else "", + reasoning_text=reasoning_text + ) + + return StreamingParseResult(normal_text=text) + + def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: + """ + Streaming incremental parsing for reasoning content. + Handles partial reasoning tags and content. 
+ + If accumulate_reasoning is True: + Accumulates reasoning content until the end tag is found + If accumulate_reasoning is False: + Streams reasoning content as it arrives + """ + self._buffer += new_text + current_text = self._buffer + + # If we're not in a reasoning block and no think start token, + # return as normal text + if not self._in_reasoning and self.think_start_token not in current_text: + self._buffer = "" + return StreamingParseResult(normal_text=new_text) + + # Handle start of reasoning block + if not self._in_reasoning and self.think_start_token in current_text: + start_idx = current_text.find(self.think_start_token) + self._in_reasoning = True + normal_text = current_text[:start_idx] + + # Get any reasoning content after the start token + reasoning_start = start_idx + len(self.think_start_token) + reasoning_text = current_text[reasoning_start:] + + if self.accumulate_reasoning: + self._current_reasoning = reasoning_text + reasoning_text = "" + else: + self._buffer = "" # Clear buffer since we're streaming + + return StreamingParseResult(normal_text=normal_text, reasoning_text=reasoning_text) + + # Handle end of reasoning block + if self._in_reasoning and self.think_end_token in current_text: + end_idx = current_text.find(self.think_end_token) + + if self.accumulate_reasoning: + # Return accumulated reasoning plus final chunk + reasoning_text = self._current_reasoning + current_text[:end_idx] + else: + # Just return the final chunk before the end tag + reasoning_text = current_text[:end_idx] + + self._in_reasoning = False + self._current_reasoning = "" + normal_text = current_text[end_idx + len(self.think_end_token):] + self._buffer = "" + + return StreamingParseResult( + normal_text=normal_text, + reasoning_text=reasoning_text + ) + + # Continue with reasoning content + if self._in_reasoning: + if self.accumulate_reasoning: + # Accumulate content but don't return it yet + self._current_reasoning += new_text + return StreamingParseResult() + else: 
+ # Stream the content immediately + self._buffer = "" + return StreamingParseResult(reasoning_text=new_text) + + return StreamingParseResult() + +class ReasoningParser: + """ + Parser that handles both streaming and non-streaming scenarios for extracting + reasoning content from model outputs. + + Args: + model_type (str): Type of model to parse reasoning from + accumulate_reasoning (bool): If True, accumulates reasoning content until complete. + If False, streams reasoning content as it arrives. + """ + DetectorMap: Dict[str, BaseReasoningFormatDetector] = { + "deepseek-r1": DeepSeekR1Detector + } + + def __init__(self, model_type: str = None, accumulate_reasoning: bool = False): + if not model_type: + raise ValueError("Model type must be specified") + + detector_class = self.DetectorMap.get(model_type) + if not detector_class: + raise ValueError(f"Unsupported model type: {model_type}") + + self.detector = detector_class(accumulate_reasoning=accumulate_reasoning) + + def parse_non_stream(self, full_text: str) -> StreamingParseResult: + """Non-streaming call: one-time parsing""" + return self.detector.detect_and_parse(full_text) + + def parse_stream_chunk(self, chunk_text: str) -> StreamingParseResult: + """Streaming call: incremental parsing""" + return self.detector.parse_streaming_increment(chunk_text) From 1fe665403e7e26fe9f5b2ff6f8182f19707e8da6 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Wed, 29 Jan 2025 02:06:42 +0000 Subject: [PATCH 02/27] Fix up silly mistakes in non-streaming path --- python/sglang/srt/openai_api/adapter.py | 6 +----- python/sglang/srt/reasoning_parser.py | 6 +++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index d0b9a13cd1a1..97d7e678f677 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1074,16 +1074,12 @@ def v1_chat_generate_response( tool_choice = request[idx].tool_choice 
tools = request[idx].tools model = request[idx].model - stream_reasoning = request[idx].stream_reasoning - reasoning_parser = ReasoningParser() if is_reasoning_model(request[idx].model) else None else: tool_choice = request.tool_choice tools = request.tools model = request.model - stream_reasoning = request.stream_reasoning - reasoning_parser = ReasoningParser() if is_reasoning_model(request.model) else None - if reasoning_parser is not None: + if request.separate_reasoning and is_reasoning_model(model): try: parser = ReasoningParser(model, True) parse_result = parser.parse_non_stream(text) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 7501b7c8f50e..af4753d7c242 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -171,11 +171,11 @@ class ReasoningParser: def __init__(self, model_type: str = None, accumulate_reasoning: bool = False): if not model_type: raise ValueError("Model type must be specified") - - detector_class = self.DetectorMap.get(model_type) + + detector_class = self.DetectorMap.get(model_type.lower()) if not detector_class: raise ValueError(f"Unsupported model type: {model_type}") - + self.detector = detector_class(accumulate_reasoning=accumulate_reasoning) def parse_non_stream(self, full_text: str) -> StreamingParseResult: From 0a0eaad8a1aaaed31188333d99ccb60b02f82099 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Wed, 29 Jan 2025 03:13:17 +0000 Subject: [PATCH 03/27] Flip accumulate_reasoning to stream_reasoning to match the api changes. 
--- python/sglang/srt/reasoning_parser.py | 54 +++++++++++++-------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index af4753d7c242..5e948b469a1a 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -17,11 +17,11 @@ def __init__(self, normal_text: str = "", reasoning_text: str = ""): class BaseReasoningFormatDetector: """Base class providing two sets of interfaces: one-time and streaming incremental.""" - def __init__(self, accumulate_reasoning: bool = False): + def __init__(self, stream_reasoning: bool = False): self._buffer = "" self._in_reasoning = False self._current_reasoning = "" - self.accumulate_reasoning = accumulate_reasoning + self.stream_reasoning = stream_reasoning def detect_and_parse(self, text: str) -> StreamingParseResult: """Parses the text in one go.""" @@ -40,11 +40,11 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector): and the rest of the text as `normal_text`. Args: - accumulate_reasoning (bool): If True, accumulates reasoning content until the end tag. - If False, streams reasoning content as it arrives. + stream_reasoning (bool): If False, accumulates reasoning content until the end tag. + If True, streams reasoning content as it arrives. 
""" - def __init__(self, accumulate_reasoning: bool = False): - super().__init__(accumulate_reasoning=accumulate_reasoning) + def __init__(self, stream_reasoning: bool = False): + super().__init__(stream_reasoning=stream_reasoning) self.think_start_token = "" self.think_end_token = "" self.reasoning_regex = re.compile( @@ -78,7 +78,7 @@ def detect_and_parse(self, text: str) -> StreamingParseResult: return StreamingParseResult( normal_text=normal_text if normal_text else "", - reasoning_text=reasoning_text + reasoning_text=reasoning_text.strip() ) return StreamingParseResult(normal_text=text) @@ -88,9 +88,9 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: Streaming incremental parsing for reasoning content. Handles partial reasoning tags and content. - If accumulate_reasoning is True: + If stream_reasoning is False: Accumulates reasoning content until the end tag is found - If accumulate_reasoning is False: + If stream_reasoning is True: Streams reasoning content as it arrives """ self._buffer += new_text @@ -112,24 +112,24 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: reasoning_start = start_idx + len(self.think_start_token) reasoning_text = current_text[reasoning_start:] - if self.accumulate_reasoning: + if self.stream_reasoning: + self._buffer = "" # Clear buffer since we're streaming + else: self._current_reasoning = reasoning_text reasoning_text = "" - else: - self._buffer = "" # Clear buffer since we're streaming - return StreamingParseResult(normal_text=normal_text, reasoning_text=reasoning_text) + return StreamingParseResult(normal_text=normal_text, reasoning_text=reasoning_text.lstrip()) # Handle end of reasoning block if self._in_reasoning and self.think_end_token in current_text: end_idx = current_text.find(self.think_end_token) - if self.accumulate_reasoning: - # Return accumulated reasoning plus final chunk - reasoning_text = self._current_reasoning + current_text[:end_idx] - else: + if 
self.stream_reasoning: # Just return the final chunk before the end tag reasoning_text = current_text[:end_idx] + else: + # Return accumulated reasoning plus final chunk + reasoning_text = self._current_reasoning + current_text[:end_idx] self._in_reasoning = False self._current_reasoning = "" @@ -138,19 +138,19 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: return StreamingParseResult( normal_text=normal_text, - reasoning_text=reasoning_text + reasoning_text=reasoning_text.rstrip() ) # Continue with reasoning content if self._in_reasoning: - if self.accumulate_reasoning: - # Accumulate content but don't return it yet - self._current_reasoning += new_text - return StreamingParseResult() - else: + if self.stream_reasoning: # Stream the content immediately self._buffer = "" return StreamingParseResult(reasoning_text=new_text) + else: + # Accumulate content but don't return it yet + self._current_reasoning += new_text + return StreamingParseResult() return StreamingParseResult() @@ -161,14 +161,14 @@ class ReasoningParser: Args: model_type (str): Type of model to parse reasoning from - accumulate_reasoning (bool): If True, accumulates reasoning content until complete. - If False, streams reasoning content as it arrives. + stream_reasoning (bool): If False, accumulates reasoning content until complete. + If True, streams reasoning content as it arrives.
""" DetectorMap: Dict[str, BaseReasoningFormatDetector] = { "deepseek-r1": DeepSeekR1Detector } - def __init__(self, model_type: str = None, accumulate_reasoning: bool = False): + def __init__(self, model_type: str = None, stream_reasoning: bool = True): if not model_type: raise ValueError("Model type must be specified") @@ -176,7 +176,7 @@ def __init__(self, model_type: str = None, accumulate_reasoning: bool = False): if not detector_class: raise ValueError(f"Unsupported model type: {model_type}") - self.detector = detector_class(accumulate_reasoning=accumulate_reasoning) + self.detector = detector_class(stream_reasoning=stream_reasoning) def parse_non_stream(self, full_text: str) -> StreamingParseResult: """Non-streaming call: one-time parsing""" From 9cc7b76931af3c0758d4e52590faa05079d411d2 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Tue, 4 Feb 2025 15:15:19 +0000 Subject: [PATCH 04/27] Ensure `finish_reason` is null by default to match OpenAI streaming response behavior. --- python/sglang/srt/openai_api/adapter.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 97d7e678f677..7136b2e06f2e 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1276,6 +1276,7 @@ async def generate_stream_resp(): choice_logprobs = None finish_reason = content["meta_info"]["finish_reason"] + finish_reason_type = finish_reason["type"] if finish_reason else None if is_first: # First chunk with role @@ -1284,7 +1285,9 @@ async def generate_stream_resp(): index=index, delta=DeltaMessage(role="assistant", content=""), finish_reason=( - finish_reason["type"] if finish_reason else "" + None + if finish_reason_type and len(finish_reason_type) == 0 + else finish_reason_type, ), matched_stop=( finish_reason["matched"] @@ -1316,7 +1319,9 @@ async def generate_stream_resp(): index=index, 
delta=DeltaMessage(reasoning_content=parse_result.reasoning_text), finish_reason=( - finish_reason["type"] if finish_reason else "" + None + if finish_reason_type and len(finish_reason_type) == 0 + else finish_reason_type, ), ) chunk = ChatCompletionStreamResponse( @@ -1344,7 +1349,9 @@ async def generate_stream_resp(): index=index, delta=DeltaMessage(content=normal_text), finish_reason=( - finish_reason["type"] if finish_reason else "" + None + if finish_reason_type and len(finish_reason_type) == 0 + else finish_reason_type, ), ) chunk = ChatCompletionStreamResponse( @@ -1413,7 +1420,9 @@ async def generate_stream_resp(): index=index, delta=DeltaMessage(content=delta), finish_reason=( - finish_reason["type"] if finish_reason else "" + None + if finish_reason_type and len(finish_reason_type) == 0 + else finish_reason_type, ), matched_stop=( finish_reason["matched"] From 64008008a7277987f769bed0c0128b4fc5b317d9 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Tue, 4 Feb 2025 15:34:02 +0000 Subject: [PATCH 05/27] fix silly python tuple mistake. 
--- python/sglang/srt/openai_api/adapter.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 7136b2e06f2e..2c162c3ecaf7 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1284,11 +1284,7 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=index, delta=DeltaMessage(role="assistant", content=""), - finish_reason=( - None - if finish_reason_type and len(finish_reason_type) == 0 - else finish_reason_type, - ), + finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, matched_stop=( finish_reason["matched"] if finish_reason and "matched" in finish_reason @@ -1318,11 +1314,7 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=index, delta=DeltaMessage(reasoning_content=parse_result.reasoning_text), - finish_reason=( - None - if finish_reason_type and len(finish_reason_type) == 0 - else finish_reason_type, - ), + finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, ) chunk = ChatCompletionStreamResponse( id=content["meta_info"]["id"], @@ -1348,11 +1340,7 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=index, delta=DeltaMessage(content=normal_text), - finish_reason=( - None - if finish_reason_type and len(finish_reason_type) == 0 - else finish_reason_type, - ), + finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, ) chunk = ChatCompletionStreamResponse( id=content["meta_info"]["id"], @@ -1419,11 +1407,7 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=index, delta=DeltaMessage(content=delta), - finish_reason=( - None - if finish_reason_type and len(finish_reason_type) == 0 - else 
finish_reason_type, - ), + finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, matched_stop=( finish_reason["matched"] if finish_reason and "matched" in finish_reason From d8561248caec80bc2e23f1ca4c78069b8e1866e0 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Wed, 5 Feb 2025 00:27:46 +0000 Subject: [PATCH 06/27] Don't send streaming chunks for empty content. --- python/sglang/srt/openai_api/adapter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 2c162c3ecaf7..f8a9c2e3d941 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1323,6 +1323,10 @@ async def generate_stream_resp(): ) yield f"data: {chunk.model_dump_json()}\n\n" delta = parse_result.normal_text + if len(delta) == 0: + stream_buffers[index] = new_stream_buffer + is_firsts[index] = is_first + continue if request.tool_choice != "none" and request.tools: if index not in parser_dict: From 99f2583c73dd8cbc289f193d0a00bdf97b6f19fb Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 05:14:31 +0000 Subject: [PATCH 07/27] Adapt reasoning_parser to handle token not being produced by model, also handle first response while separating reasoning. 
--- python/sglang/srt/openai_api/adapter.py | 6 +- python/sglang/srt/reasoning_parser.py | 82 +++++++++---------------- 2 files changed, 35 insertions(+), 53 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index f8a9c2e3d941..3511564fc83c 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1281,9 +1281,13 @@ async def generate_stream_resp(): if is_first: # First chunk with role is_first = False + if request.separate_reasoning and is_reasoning_model(request.model): + delta = DeltaMessage(role="assistant", reasoning_content="") + else: + delta = DeltaMessage(role="assistant", content="") choice_data = ChatCompletionResponseStreamChoice( index=index, - delta=DeltaMessage(role="assistant", content=""), + delta=delta, finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, matched_stop=( finish_reason["matched"] diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 5e948b469a1a..9c43a0e6d00f 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -35,8 +35,8 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector): """ Detector for DeepSeek-R1 model. Assumes reasoning format: - ... - Returns all the text within the tags as `reasoning_text` + ()*(.*) + Returns all the text before the tag as `reasoning_text` and the rest of the text as `normal_text`. 
Args: @@ -47,6 +47,10 @@ def __init__(self, stream_reasoning: bool = False): super().__init__(stream_reasoning=stream_reasoning) self.think_start_token = "<think>" self.think_end_token = "</think>" + # DeepSeek-R1 is assumed to be reasoning until `</think>` token + # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599 + self._in_reasoning = True + self.stripped_think_start = False self.reasoning_regex = re.compile( rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL ) @@ -57,37 +61,25 @@ def detect_and_parse(self, text: str) -> StreamingParseResult: One-time parsing: Detects and parses reasoning sections in the provided text. Returns both reasoning content and normal text separately. """ - if self.think_start_token not in text or self.think_end_token not in text: - return StreamingParseResult(normal_text=text) + if self.think_end_token not in text: + # Assume reasoning was truncated before `</think>` token + return StreamingParseResult(reasoning_text==text) # Extract reasoning content - reasoning_matches = self.reasoning_regex.findall(text) - if not reasoning_matches: - return StreamingParseResult(normal_text=text) + splits = text.split(self.think_end_token, splits=1) + reasoning_text = splits[0].replace(self.think_start_token, "").strip() + text = splits[1].strip() - reasoning_text = reasoning_matches[0] - - # Remove the reasoning section from the text to get normal_text - start_idx = text.find(self.think_start_token) - if start_idx != -1: - end_idx = start_idx + len( - f"{self.think_start_token}{reasoning_text}{self.think_end_token}" - ) - normal_text = text[:start_idx] + text[end_idx:] - normal_text = normal_text.strip() - - return StreamingParseResult( - normal_text=normal_text if normal_text else "", - reasoning_text=reasoning_text.strip() - ) - - return StreamingParseResult(normal_text=text) + return StreamingParseResult( + normal_text=text, + reasoning_text=reasoning_text ) def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: """
Streaming incremental parsing for reasoning content. Handles partial reasoning tags and content. - + If stream_reasoning is False: Accumulates reasoning content until the end tag is found If stream_reasoning is True: @@ -96,48 +88,29 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: self._buffer += new_text current_text = self._buffer - # If we're not in a reasoning block and no think start token, - # return as normal text - if not self._in_reasoning and self.think_start_token not in current_text: - self._buffer = "" - return StreamingParseResult(normal_text=new_text) - - # Handle start of reasoning block - if not self._in_reasoning and self.think_start_token in current_text: - start_idx = current_text.find(self.think_start_token) - self._in_reasoning = True - normal_text = current_text[:start_idx] - - # Get any reasoning content after the start token - reasoning_start = start_idx + len(self.think_start_token) - reasoning_text = current_text[reasoning_start:] - - if self.stream_reasoning: - self._buffer = "" # Clear buffer since we're streaming - else: - self._current_reasoning = reasoning_text - reasoning_text = "" - - return StreamingParseResult(normal_text=normal_text, reasoning_text=reasoning_text.lstrip()) + # Strip `` token if present + if not self.stripped_think_start and current_text.find(self.think_start_token): + current_text = current_text.replace(self.think_start_token, "") + self.stripped_think_start = True # Handle end of reasoning block if self._in_reasoning and self.think_end_token in current_text: end_idx = current_text.find(self.think_end_token) - + if self.stream_reasoning: # Just return the final chunk before the end tag reasoning_text = current_text[:end_idx] else: # Return accumulated reasoning plus final chunk reasoning_text = self._current_reasoning + current_text[:end_idx] - + self._in_reasoning = False self._current_reasoning = "" normal_text = current_text[end_idx + len(self.think_end_token):] self._buffer = 
"" - + return StreamingParseResult( - normal_text=normal_text, + normal_text=None if len(normal_text) == 0 else normal_text, reasoning_text=reasoning_text.rstrip() ) @@ -152,6 +125,11 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: self._current_reasoning += new_text return StreamingParseResult() + # If we're not in a reasoning block return as normal text + if not self._in_reasoning: + self._buffer = "" + return StreamingParseResult(normal_text=new_text) + return StreamingParseResult() class ReasoningParser: From f39a2560599a0e789b74ff832b77ff4e0b0662b9 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 05:52:13 +0000 Subject: [PATCH 08/27] Fix silly typo --- python/sglang/srt/reasoning_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 9c43a0e6d00f..18279c8f0dd0 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -63,7 +63,7 @@ def detect_and_parse(self, text: str) -> StreamingParseResult: """ if self.think_end_token not in text: # Assume reasoning was truncated before `` token - return StreamingParseResult(reasoning_text==text) + return StreamingParseResult(reasoning_text=text) # Extract reasoning content splits = text.split(self.think_end_token, splits=1) From 132e5d6afdc80de8b748a80f2840534e9f0a7125 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 06:11:46 +0000 Subject: [PATCH 09/27] Fix up token stripping. 
--- python/sglang/srt/reasoning_parser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 18279c8f0dd0..48a6a2ba071a 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -61,13 +61,14 @@ def detect_and_parse(self, text: str) -> StreamingParseResult: One-time parsing: Detects and parses reasoning sections in the provided text. Returns both reasoning content and normal text separately. """ + text = text.replace(self.think_start_token, "").strip() if self.think_end_token not in text: # Assume reasoning was truncated before `` token return StreamingParseResult(reasoning_text=text) # Extract reasoning content splits = text.split(self.think_end_token, splits=1) - reasoning_text = splits[0].replace(self.think_start_token, "").strip() + reasoning_text = splits[0] text = splits[1].strip() return StreamingParseResult( @@ -119,10 +120,10 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: if self.stream_reasoning: # Stream the content immediately self._buffer = "" - return StreamingParseResult(reasoning_text=new_text) + return StreamingParseResult(reasoning_text=current_text) else: # Accumulate content but don't return it yet - self._current_reasoning += new_text + self._current_reasoning += current_text return StreamingParseResult() # If we're not in a reasoning block return as normal text From 4a60111216db43e9dbd1ab6e92125e9f645d9b79 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 06:13:10 +0000 Subject: [PATCH 10/27] use split correctly --- python/sglang/srt/reasoning_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 48a6a2ba071a..3f399241c6c7 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -67,7 +67,7 @@ def 
detect_and_parse(self, text: str) -> StreamingParseResult: return StreamingParseResult(reasoning_text=text) # Extract reasoning content - splits = text.split(self.think_end_token, splits=1) + splits = text.split(self.think_end_token, maxsplits=1) reasoning_text = splits[0] text = splits[1].strip() From 9ec06b1269a332cd258a90a43871338ef60d075d Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 06:17:35 +0000 Subject: [PATCH 11/27] Remove unused reasoning_regex. --- python/sglang/srt/reasoning_parser.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 3f399241c6c7..c3db1ae23a32 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -51,10 +51,6 @@ def __init__(self, stream_reasoning: bool = False): # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599 self._in_reasoning = True self.stripped_think_start = False - self.reasoning_regex = re.compile( - rf"{self.think_start_token}(.*?){self.think_end_token}", - re.DOTALL - ) def detect_and_parse(self, text: str) -> StreamingParseResult: """ From 22a0b61860b12e3dc97c5d79568c88c33b8e11b1 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 06:34:53 +0000 Subject: [PATCH 12/27] wow i really can't read, or it's late, or both. 
--- python/sglang/srt/reasoning_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index c3db1ae23a32..59a01bd6c131 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -63,7 +63,7 @@ def detect_and_parse(self, text: str) -> StreamingParseResult: return StreamingParseResult(reasoning_text=text) # Extract reasoning content - splits = text.split(self.think_end_token, maxsplits=1) + splits = text.split(self.think_end_token, maxsplit=1) reasoning_text = splits[0] text = splits[1].strip() From cde50fcf258f79aab1aa7f6d432b469f6b8138a2 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 06:37:40 +0000 Subject: [PATCH 13/27] Fix another case. --- python/sglang/srt/openai_api/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 3511564fc83c..7383a42389a4 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1327,7 +1327,7 @@ async def generate_stream_resp(): ) yield f"data: {chunk.model_dump_json()}\n\n" delta = parse_result.normal_text - if len(delta) == 0: + if delta and len(delta) == 0: stream_buffers[index] = new_stream_buffer is_firsts[index] = is_first continue From 7330b0bcc8d5d184a290ae12d18a232a28342872 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 06:48:35 +0000 Subject: [PATCH 14/27] parse_result.normal_text _shouldn't_ ever be None, but lets be defensive just in case. 
--- python/sglang/srt/openai_api/adapter.py | 2 +- python/sglang/srt/reasoning_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 7383a42389a4..e2a4e6127f53 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1327,7 +1327,7 @@ async def generate_stream_resp(): ) yield f"data: {chunk.model_dump_json()}\n\n" delta = parse_result.normal_text - if delta and len(delta) == 0: + if (delta and len(delta) == 0) or not delta: stream_buffers[index] = new_stream_buffer is_firsts[index] = is_first continue diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 59a01bd6c131..3499c2ea6144 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -107,7 +107,7 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: self._buffer = "" return StreamingParseResult( - normal_text=None if len(normal_text) == 0 else normal_text, + normal_text=normal_text, reasoning_text=reasoning_text.rstrip() ) From 3fdce0dd827055f1465b1201fc36cfd28146d3d0 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Thu, 13 Feb 2025 07:11:25 +0000 Subject: [PATCH 15/27] Make content=None if iparse_results.normal_text returns an empty string --- python/sglang/srt/openai_api/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index e2a4e6127f53..f5ad5de964db 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1083,7 +1083,7 @@ def v1_chat_generate_response( try: parser = ReasoningParser(model, True) parse_result = parser.parse_non_stream(text) - ret_item["text"] = parse_result.normal_text + ret_item["text"] = None if parse_result.normal_text and len(parse_result.normal_text) == 0 else 
parse_result.normal_text reasoning_text = parse_result.reasoning_text except Exception as e: logger.error(f"Exception: {e}") From 1f7daaeb699690842510659df50a292022945345 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Fri, 21 Feb 2025 20:04:19 +0000 Subject: [PATCH 16/27] Run pre-commit hook to format changes. --- python/sglang/srt/openai_api/adapter.py | 46 ++++++++++++++++++++----- python/sglang/srt/reasoning_parser.py | 21 ++++++----- 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 7fc11a87e9f5..c2c05d9da25d 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -43,7 +43,6 @@ register_conv_template, ) from sglang.srt.function_call_parser import TOOLS_TAG_LIST, FunctionCallParser -from sglang.srt.reasoning_parser import is_reasoning_model, ReasoningParser from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput from sglang.srt.openai_api.protocol import ( BatchRequest, @@ -75,6 +74,7 @@ TopLogprob, UsageInfo, ) +from sglang.srt.reasoning_parser import ReasoningParser, is_reasoning_model from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -1103,7 +1103,11 @@ def v1_chat_generate_response( try: parser = ReasoningParser(model, True) parse_result = parser.parse_non_stream(text) - ret_item["text"] = None if parse_result.normal_text and len(parse_result.normal_text) == 0 else parse_result.normal_text + ret_item["text"] = ( + None + if parse_result.normal_text and len(parse_result.normal_text) == 0 + else parse_result.normal_text + ) reasoning_text = parse_result.reasoning_text except Exception as e: logger.error(f"Exception: {e}") @@ -1296,19 +1300,27 @@ async def generate_stream_resp(): choice_logprobs = None finish_reason = content["meta_info"]["finish_reason"] - finish_reason_type = finish_reason["type"] if finish_reason else None + finish_reason_type = ( + 
finish_reason["type"] if finish_reason else None + ) if is_first: # First chunk with role is_first = False - if request.separate_reasoning and is_reasoning_model(request.model): + if request.separate_reasoning and is_reasoning_model( + request.model + ): delta = DeltaMessage(role="assistant", reasoning_content="") else: delta = DeltaMessage(role="assistant", content="") choice_data = ChatCompletionResponseStreamChoice( index=index, delta=delta, - finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, + finish_reason=( + None + if finish_reason_type and len(finish_reason_type) == 0 + else finish_reason_type + ), matched_stop=( finish_reason["matched"] if finish_reason and "matched" in finish_reason @@ -1337,8 +1349,15 @@ async def generate_stream_resp(): if parse_result.reasoning_text: choice_data = ChatCompletionResponseStreamChoice( index=index, - delta=DeltaMessage(reasoning_content=parse_result.reasoning_text), - finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, + delta=DeltaMessage( + reasoning_content=parse_result.reasoning_text + ), + finish_reason=( + None + if finish_reason_type + and len(finish_reason_type) == 0 + else finish_reason_type + ), ) chunk = ChatCompletionStreamResponse( id=content["meta_info"]["id"], @@ -1368,7 +1387,12 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=index, delta=DeltaMessage(content=normal_text), - finish_reason=None if finish_reason_type and len(finish_reason_type) == 0 else finish_reason_type, + finish_reason=( + None + if finish_reason_type + and len(finish_reason_type) == 0 + else finish_reason_type + ), ) chunk = ChatCompletionStreamResponse( id=content["meta_info"]["id"], @@ -1435,7 +1459,11 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=index, delta=DeltaMessage(content=delta), - finish_reason=None if finish_reason_type and 
len(finish_reason_type) == 0 else finish_reason_type, + finish_reason=( + None + if finish_reason_type and len(finish_reason_type) == 0 + else finish_reason_type + ), matched_stop=( finish_reason["matched"] if finish_reason and "matched" in finish_reason diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 3499c2ea6144..4c3ddbfebcb2 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -1,5 +1,5 @@ import re -from typing import Optional, Dict +from typing import Dict, Optional REASONING_MODELS = ["deepseek-r1"] @@ -11,12 +11,15 @@ def is_reasoning_model(model_name: str) -> bool: class StreamingParseResult: """Result of streaming incremental parsing.""" + def __init__(self, normal_text: str = "", reasoning_text: str = ""): self.normal_text = normal_text self.reasoning_text = reasoning_text + class BaseReasoningFormatDetector: """Base class providing two sets of interfaces: one-time and streaming incremental.""" + def __init__(self, stream_reasoning: bool = False): self._buffer = "" self._in_reasoning = False @@ -31,18 +34,20 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: """Streaming incremental parsing.""" raise NotImplementedError + class DeepSeekR1Detector(BaseReasoningFormatDetector): """ Detector for DeepSeek-R1 model. Assumes reasoning format: ()*(.*) - Returns all the text before the tag as `reasoning_text` + Returns all the text before the tag as `reasoning_text` and the rest of the text as `normal_text`. Args: stream_reasoning (bool): If False, accumulates reasoning content until the end tag. If True, streams reasoning content as it arrives. 
""" + def __init__(self, stream_reasoning: bool = False): super().__init__(stream_reasoning=stream_reasoning) self.think_start_token = "" @@ -67,10 +72,7 @@ def detect_and_parse(self, text: str) -> StreamingParseResult: reasoning_text = splits[0] text = splits[1].strip() - return StreamingParseResult( - normal_text=text, - reasoning_text=reasoning_text - ) + return StreamingParseResult(normal_text=text, reasoning_text=reasoning_text) def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: """ @@ -103,12 +105,11 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: self._in_reasoning = False self._current_reasoning = "" - normal_text = current_text[end_idx + len(self.think_end_token):] + normal_text = current_text[end_idx + len(self.think_end_token) :] self._buffer = "" return StreamingParseResult( - normal_text=normal_text, - reasoning_text=reasoning_text.rstrip() + normal_text=normal_text, reasoning_text=reasoning_text.rstrip() ) # Continue with reasoning content @@ -129,6 +130,7 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: return StreamingParseResult() + class ReasoningParser: """ Parser that handles both streaming and non-streaming scenarios for extracting @@ -139,6 +141,7 @@ class ReasoningParser: stream_reasoning (bool): If Flase, accumulates reasoning content until complete. If True, streams reasoning content as it arrives. """ + DetectorMap: Dict[str, BaseReasoningFormatDetector] = { "deepseek-r1": DeepSeekR1Detector } From e165bf7d6804beaa544874794cc8da5e7af6073c Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Fri, 28 Feb 2025 02:10:17 +0000 Subject: [PATCH 17/27] Merge in awesome docs from #3859 by @ShaoZhang0115 and add unittests. 
--- docs/backend/reasoning_parser.md | 138 +++++++++++++ python/sglang/srt/openai_api/adapter.py | 27 ++- python/sglang/srt/openai_api/protocol.py | 4 +- python/sglang/srt/reasoning_parser.py | 7 +- python/sglang/srt/server_args.py | 9 + python/sglang/test/test_utils.py | 1 + test/srt/run_suite.py | 1 + test/srt/test_reasoning_content.py | 238 +++++++++++++++++++++++ 8 files changed, 413 insertions(+), 12 deletions(-) create mode 100644 docs/backend/reasoning_parser.md create mode 100644 test/srt/test_reasoning_content.py diff --git a/docs/backend/reasoning_parser.md b/docs/backend/reasoning_parser.md new file mode 100644 index 000000000000..bce419759187 --- /dev/null +++ b/docs/backend/reasoning_parser.md @@ -0,0 +1,138 @@ +# Reasoning Parser + +SGLang supports parsing reasoning content out from "normal" content for reasoning models such as [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1). + +The contract follows the [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model) established with the release of DeepSeek-R1: + +- `reasoning_content`: The content of the CoT. +- `content`: The content of the final answer. + +## Supported Models + +Currently, SGLang supports the following reasoning models: +- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `` and `` tags. + +## Usage + +There are two ways to enable reasoning parsing: + +1) Enable the reasoning parser when starting the SGLang Server by setting the `--enable-reasoning` and `--reasoning-parser` options. The `--reasoning-parser` option specifies the reasoning parser to extract the reasoning content and final answer.
+ +```bash +python -m sglang.launch_server --host 0.0.0.0 \ +--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ +--enable-reasoning --reasoning-parser deepseek-r1 +``` + +2) Specify on a per-request basis by setting the `separate_reasoning` body field on a `/chat/completions` request. + +```bash +curl -X POST -H "Content-Type: application/json" \ +-d '{"messages":[{"role":"user","content":"Compute 1+3"}],"max_tokens":100,"model":"deepseek-r1","stream":true,"separate_reasoning":true}' http://0.0.0.0:30000/v1/chat/completions +``` + +There is another body param which can be set to buffer the reasoning traces to be sent in one chunk after the closing `` tag, `"stream_reasoning": false`. + +### Non-streaming Request + +Make a request to the reasoning model, get the reasoning content and final answer. + +Using OpenAI python api: +```python +import openai + +client = openai.Client(base_url="http://localhost:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="deepseek-r1:14b", + messages=[{"role": "user", "content": "Compute 1+3"}], + max_tokens=1024, + stream=False +) + +response.choices[0].message.reasoning_content +# 'First, I recognize that the problem requires adding the numbers 1 and 3.\n\nNext, I identify the numbers to be added, which are 1 and 3.\n\nThen, I perform the addition operation: 1 plus 3 equals 4.\n\nFinally, I conclude that the sum of 1 and 3 is 4.\n' +response.choices[0].message.content +# \n\nTo compute \\(1 + 3\\), follow these simple steps:\n\n1. **Identify the numbers to add:** \n The numbers are **1** and **3**.\n\n2. **Add the numbers together:** \n \\[\n 1 + 3 = 4\n \\]\n\n3. **Write the final answer:** \n The sum of \\(1 + 3\\) is \\(\\boxed{4}\\).' +``` + +### Streaming Request + +`reasoning_content` is available in the `delta` field of the streaming response. + +Using OpenAI python api: + +```python +# ... Initialize the client as before ... 
+ +response = client.chat.completions.create( + model="deepseek-r1:14b", + messages=[{"role": "user", "content": "Compute 1+3"}], + max_tokens=1024, + stream=True +) +reasoning_content = "" +content = "" +for chunk in response: + if chunk.choices[0].delta.content: + content += chunk.choices[0].delta.content + elif chunk.choices[0].delta.reasoning_content: + reasoning_content += chunk.choices[0].delta.reasoning_content + +reasoning_content +# 'I need to calculate the sum of 1 and 3. \n\nFirst, I identify the numbers involved in the addition: 1 and 3.\n\nNext, I add these two numbers together to find the total.\n\nFinally, the result of the addition is 4.\n' +content +# '\n\n**Solution:**\n\nWe need to compute the sum of 1 and 3.\n\n1. **Identify the numbers to add:**\n - Number 1\n - Number 3\n\n2. **Add the numbers together:**\n \\[\n 1 + 3 = 4\n \\]\n\n3. **Final Answer:**\n \\[\n \\boxed{4}\n \\]' +``` + + +## Supporting New Reasoning Models + +For future reasoning models, you can implement the reasoning parser as a subclass of `BaseReasoningParser` in `python/sglang/srt/reasoning_parser.py`. + +```python +class BaseReasoningParser: + """Base class for reasoning parser.""" + + def __init__(self): + self._buffer = "" + + def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: + """Detect and parse the text, return reasoning_content and content.""" + raise NotImplementedError + + def parse_streaming_increment( + self, new_text: str + ) -> Tuple[Optional[str], Optional[str]]: + """Parse the new text incrementally, return reasoning_content and content.""" + raise NotImplementedError +``` + +And specify the reasoning parser for new reasoning models accordingly. 
+ +```python +class ReasoningParser: + """Reasoning parser for different reasoning models.""" + + # Specify the reasoning parser for each reasoning model here + ReasoningParserDict: Dict[str, Type[BaseReasoningParser]] = { + "deepseek-r1": DeepSeekR1ReasoningParser + } + + def __init__(self, reasoning_parser: str): + self.parser = self.ReasoningParserDict[reasoning_parser]() + + def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]: + """ + Non-streaming parsing for reasoning models. + Return: reasoning_content, content + """ + return self.parser.detect_and_parse(full_text) + + def parse_stream_chunk(self, chunk_text: str): + """ + Streaming parsing for reasoning models. + Return: reasoning_content, content + """ + return self.parser.parse_streaming_increment(chunk_text) +``` diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index c2c05d9da25d..fcbe6dbc46ca 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1039,7 +1039,12 @@ def v1_chat_generate_request( def v1_chat_generate_response( - request, ret, to_file=False, cache_report=False, tool_call_parser=None + request, + ret, + to_file=False, + cache_report=False, + tool_call_parser=None, + reasoning_parser=None, ): choices = [] @@ -1099,11 +1104,13 @@ def v1_chat_generate_response( tools = request.tools model = request.model - if request.separate_reasoning and is_reasoning_model(model): + if reasoning_parser or ( + request.separate_reasoning and is_reasoning_model(model) + ): try: parser = ReasoningParser(model, True) parse_result = parser.parse_non_stream(text) - ret_item["text"] = ( + text = ( None if parse_result.normal_text and len(parse_result.normal_text) == 0 else parse_result.normal_text @@ -1146,7 +1153,7 @@ def v1_chat_generate_response( "index": 0, "message": { "role": "assistant", - "content": ret_item["text"] if tool_calls is None else None, + "content": text if tool_calls is 
None else None, "tool_calls": tool_calls, }, "logprobs": choice_logprobs, @@ -1164,7 +1171,7 @@ def v1_chat_generate_response( index=idx, message=ChatMessage( role="assistant", - content=ret_item["text"] if tool_calls is None else None, + content=text if tool_calls is None else None, tool_calls=tool_calls, reasoning_content=reasoning_text, ), @@ -1307,8 +1314,9 @@ async def generate_stream_resp(): if is_first: # First chunk with role is_first = False - if request.separate_reasoning and is_reasoning_model( - request.model + if tokenizer_manager.server_args.reasoning_parser or ( + request.separate_reasoning + and is_reasoning_model(request.model) ): delta = DeltaMessage(role="assistant", reasoning_content="") else: @@ -1339,7 +1347,9 @@ async def generate_stream_resp(): delta = text[len(stream_buffer) :] new_stream_buffer = stream_buffer + delta - if request.separate_reasoning and is_reasoning_model(request.model): + if tokenizer_manager.server_args.reasoning_parser or ( + request.separate_reasoning and is_reasoning_model(request.model) + ): if index not in reasoning_parser_dict: reasoning_parser_dict[index] = ReasoningParser( request.model, request.stream_reasoning @@ -1530,6 +1540,7 @@ async def generate_stream_resp(): ret, cache_report=tokenizer_manager.server_args.enable_cache_report, tool_call_parser=tokenizer_manager.server_args.tool_call_parser, + reasoning_parser=tokenizer_manager.server_args.reasoning_parser, ) return response diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index e23886b7856c..9a1a44e6c026 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -346,8 +346,8 @@ class ToolCall(BaseModel): class ChatMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None - tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) reasoning_content: Optional[str] = None + tool_calls: Optional[List[ToolCall]] = 
Field(default=None, examples=[None]) class ChatCompletionResponseChoice(BaseModel): @@ -370,8 +370,8 @@ class ChatCompletionResponse(BaseModel): class DeltaMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None - tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) reasoning_content: Optional[str] = None + tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) class ChatCompletionResponseStreamChoice(BaseModel): diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 4c3ddbfebcb2..5475f8e29521 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -1,12 +1,15 @@ import re -from typing import Dict, Optional +from typing import Dict REASONING_MODELS = ["deepseek-r1"] def is_reasoning_model(model_name: str) -> bool: """Checks if the model is a reasoning model.""" - return model_name.lower() in REASONING_MODELS + for model in REASONING_MODELS: + if re.match(f".*{model}.*", model_name, re.IGNORECASE): + return True + return False class StreamingParseResult: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index fd2188dcce79..0de73b4ab620 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,6 +23,7 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file +from sglang.srt.reasoning_parser import REASONING_MODELS from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -95,6 +96,7 @@ class ServerArgs: api_key: Optional[str] = None file_storage_pth: str = "sglang_storage" enable_cache_report: bool = False + reasoning_parser: Optional[str] = None # Data parallelism dp_size: int = 1 @@ -606,6 +608,13 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.", ) + parser.add_argument( + 
"--reasoning-parser", + type=str, + choices=REASONING_MODELS, + default=ServerArgs.reasoning_parser, + help="Specify the parser for reasoning models, supported parsers are: {REASONING_MODELS}.", + ) # Data parallelism parser.add_argument( diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 3dc1ae347372..c0fc2baa5200 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -34,6 +34,7 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct" DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" +DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 506b87bf6978..7f146e261d4b 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -32,6 +32,7 @@ "test_openai_server.py", "test_pytorch_sampling_backend.py", "test_radix_attention.py", + "test_reasoning_content.py", "test_regex_constrained.py", "test_release_memory_occupation.py", "test_request_length_validation.py", diff --git a/test/srt/test_reasoning_content.py b/test/srt/test_reasoning_content.py new file mode 100644 index 000000000000..02f7ec96fd5c --- /dev/null +++ b/test/srt/test_reasoning_content.py @@ -0,0 +1,238 @@ +""" +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true +python3 -m 
unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_true +python3 -m unittest test_reasoning_content.TestReasoningContentStartup.test_nonstreaming +python3 -m unittest test_reasoning_content.TestReasoningContentStartup.test_streaming +""" + +import json +import unittest + +import requests + +from sglang.test.test_utils import ( + DEFAULT_REASONING_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + kill_process_tree, + popen_launch_server, +) + + +class TestReasoningContentAPI(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST + cls.base_url = "http://0.0.0.0:5000" # DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + ) + + @classmethod + def tearDownClass(cls): + # kill_process_tree(cls.process.pid) + pass + + def test_streaming_separate_reasoning_false(self): + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+1?", + } + ], + "max_tokens": 100, + "stream": True, + "separate_reasoning": False, + } + response = requests.post( + f"{self.base_url}/v1/chat/completions", + json=payload, + stream=True, + ) + assert response.status_code == 200 + reasoning_content = "" + content = "" + for line in response.iter_lines(): + print(f"[test_streaming_separate_reasoning_false] {line}") + if line and not line.startswith(b"data: [DONE]"): + parsed = json.loads(line[6:]) + if ( + "reasoning_content" in parsed["choices"][0]["delta"] + and parsed["choices"][0]["delta"]["reasoning_content"] + ): + reasoning_content += parsed["choices"][0]["delta"][ + "reasoning_content" + ] + if ( + "content" in parsed["choices"][0]["delta"] + and parsed["choices"][0]["delta"]["content"] + ): + content += 
parsed["choices"][0]["delta"]["content"] + + assert len(reasoning_content) == 0 + assert len(content) > 0 + + def test_streaming_separate_reasoning_true(self): + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+1?", + } + ], + "max_tokens": 100, + "stream": True, + "separate_reasoning": True, + } + response = requests.post( + f"{self.base_url}/v1/chat/completions", + json=payload, + stream=True, + ) + assert response.status_code == 200 + reasoning_content = "" + content = "" + for line in response.iter_lines(): + print(f"[test_streaming_separate_reasoning_true] {line}") + if line and not line.startswith(b"data: [DONE]"): + parsed = json.loads(line[6:]) + if ( + "reasoning_content" in parsed["choices"][0]["delta"] + and parsed["choices"][0]["delta"]["reasoning_content"] + ): + reasoning_content += parsed["choices"][0]["delta"][ + "reasoning_content" + ] + if ( + "content" in parsed["choices"][0]["delta"] + and parsed["choices"][0]["delta"]["content"] + ): + content += parsed["choices"][0]["delta"]["content"] + + assert len(reasoning_content) > 0 + + def test_nonstreaming_separate_reasoning_false(self): + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+1?", + } + ], + "max_tokens": 100, + "separate_reasoning": False, + } + response = requests.post( + f"{self.base_url}/v1/chat/completions", + json=payload, + ) + assert response.status_code == 200 + resp = response.json() + assert resp["choices"][0]["message"]["reasoning_content"] == None + assert len(resp["choices"][0]["message"]["content"]) > 0 + + def test_nonstreaming_separate_reasoning_true(self): + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+1?", + } + ], + "max_tokens": 100, + "separate_reasoning": True, + } + response = requests.post( + f"{self.base_url}/v1/chat/completions", + json=payload, + ) + assert response.status_code == 200 + resp = response.json() + 
assert len(resp["choices"][0]["message"]["reasoning_content"]) > 0 + + +class TestReasoningContentStartup(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--reasoning-parser", + "deepseek-r1", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_nonstreaming(self): + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+1?", + } + ], + "max_tokens": 100, + } + response = requests.post( + f"{self.base_url}/v1/chat/completions", + json=payload, + ) + assert response.status_code == 200 + resp = response.json() + assert len(resp["choices"][0]["message"]["reasoning_content"]) > 0 + + def test_streaming(self): + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": "What is 1+1?", + } + ], + "max_tokens": 100, + "stream": True, + } + response = requests.post( + f"{self.base_url}/v1/chat/completions", + json=payload, + stream=True, + ) + assert response.status_code == 200 + reasoning_content = "" + content = "" + for line in response.iter_lines(): + print(f"[test_streaming_separate_reasoning_true] {line}") + if line and not line.startswith(b"data: [DONE]"): + parsed = json.loads(line[6:]) + if ( + "reasoning_content" in parsed["choices"][0]["delta"] + and parsed["choices"][0]["delta"]["reasoning_content"] + ): + reasoning_content += parsed["choices"][0]["delta"][ + "reasoning_content" + ] + if ( + "content" in parsed["choices"][0]["delta"] + and parsed["choices"][0]["delta"]["content"] + ): + content += parsed["choices"][0]["delta"]["content"] + + assert len(reasoning_content) > 0 From 5a892256cf67700630af06b695f333ae13c2e7c1 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Fri, 28 Feb 2025 02:12:37 +0000 Subject: [PATCH 
18/27] Adding missing format string. --- python/sglang/srt/server_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 0de73b4ab620..24825daae377 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -613,7 +613,7 @@ def add_cli_args(parser: argparse.ArgumentParser): type=str, choices=REASONING_MODELS, default=ServerArgs.reasoning_parser, - help="Specify the parser for reasoning models, supported parsers are: {REASONING_MODELS}.", + help=f"Specify the parser for reasoning models, supported parsers are: {REASONING_MODELS}.", ) # Data parallelism From fa85c961c5c218593695ef2a87f306b02b071e9f Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Fri, 28 Feb 2025 02:13:20 +0000 Subject: [PATCH 19/27] Remove local testing hacks. --- test/srt/test_reasoning_content.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/test_reasoning_content.py b/test/srt/test_reasoning_content.py index 02f7ec96fd5c..a04d28eb2a33 100644 --- a/test/srt/test_reasoning_content.py +++ b/test/srt/test_reasoning_content.py @@ -25,7 +25,7 @@ class TestReasoningContentAPI(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST - cls.base_url = "http://0.0.0.0:5000" # DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, @@ -34,7 +34,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - # kill_process_tree(cls.process.pid) + kill_process_tree(cls.process.pid) pass def test_streaming_separate_reasoning_false(self): From 55acaaaf5977ebb67e68387b03c34ea6a633b663 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Fri, 28 Feb 2025 20:52:04 +0000 Subject: [PATCH 20/27] Move reasoning_parser.md to `docs/references` --- docs/references/deepseek.rst | 1 + docs/{backend => references}/reasoning_parser.md | 0 2 files changed, 1 
insertion(+) rename docs/{backend => references}/reasoning_parser.md (100%) diff --git a/docs/references/deepseek.rst b/docs/references/deepseek.rst index b45383a4b3b4..f77fe1ba78ac 100644 --- a/docs/references/deepseek.rst +++ b/docs/references/deepseek.rst @@ -4,3 +4,4 @@ Multi-Node Deployment :maxdepth: 1 deepseek.md + reasoning_parser.md diff --git a/docs/backend/reasoning_parser.md b/docs/references/reasoning_parser.md similarity index 100% rename from docs/backend/reasoning_parser.md rename to docs/references/reasoning_parser.md From 2ae4fa4224fda23d5f22fdcc0d233311d7e28e47 Mon Sep 17 00:00:00 2001 From: Lucas Pickup Date: Fri, 28 Feb 2025 21:52:48 +0000 Subject: [PATCH 21/27] Fixup incorrect handling of `request: list` --- python/sglang/srt/openai_api/adapter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index f10b6b43e951..d3161a7a518b 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1106,14 +1106,14 @@ def v1_chat_generate_response( tool_choice = request[idx].tool_choice tools = request[idx].tools model = request[idx].model + separate_reasoning = request[idx].separate_reasoning else: tool_choice = request.tool_choice tools = request.tools model = request.model + separate_reasoning = request.separate_reasoning - if reasoning_parser or ( - request.separate_reasoning and is_reasoning_model(model) - ): + if reasoning_parser or (separate_reasoning and is_reasoning_model(model)): try: parser = ReasoningParser(model, True) parse_result = parser.parse_non_stream(text) From 94bee72576f1a03222f30025cd9761d9820f2d06 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Sun, 2 Mar 2025 21:08:47 +0800 Subject: [PATCH 22/27] [Refactor] Update reasoning handling in ChatCompletionRequest and adjust model references --- python/sglang/srt/openai_api/adapter.py | 31 ++-- python/sglang/srt/openai_api/protocol.py | 2 +- 
python/sglang/srt/reasoning_parser.py | 72 ++++---- python/sglang/test/test_utils.py | 2 +- test/srt/run_suite.py | 2 +- test/srt/test_reasoning_content.py | 201 ++++++++--------------- 6 files changed, 118 insertions(+), 192 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index d3161a7a518b..a3f93a3ecfcf 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -74,7 +74,7 @@ TopLogprob, UsageInfo, ) -from sglang.srt.reasoning_parser import ReasoningParser, is_reasoning_model +from sglang.srt.reasoning_parser import ReasoningParser from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -1105,23 +1105,17 @@ def v1_chat_generate_response( if isinstance(request, list): tool_choice = request[idx].tool_choice tools = request[idx].tools - model = request[idx].model separate_reasoning = request[idx].separate_reasoning else: tool_choice = request.tool_choice tools = request.tools - model = request.model separate_reasoning = request.separate_reasoning - if reasoning_parser or (separate_reasoning and is_reasoning_model(model)): + if reasoning_parser and separate_reasoning: try: - parser = ReasoningParser(model, True) + parser = ReasoningParser(reasoning_parser, True) parse_result = parser.parse_non_stream(text) - text = ( - None - if parse_result.normal_text and len(parse_result.normal_text) == 0 - else parse_result.normal_text - ) + text = parse_result.normal_text #! 
text can not be None reasoning_text = parse_result.reasoning_text except Exception as e: logger.error(f"Exception: {e}") @@ -1162,6 +1156,7 @@ def v1_chat_generate_response( "role": "assistant", "content": text if tool_calls is None else None, "tool_calls": tool_calls, + "reasoning_content": reasoning_text, }, "logprobs": choice_logprobs, "finish_reason": (finish_reason["type"] if finish_reason else ""), @@ -1171,8 +1166,6 @@ def v1_chat_generate_response( else None ), } - if reasoning_text: - choice_data["message"]["reasoning_content"] = reasoning_text else: choice_data = ChatCompletionResponseChoice( index=idx, @@ -1321,9 +1314,9 @@ async def generate_stream_resp(): if is_first: # First chunk with role is_first = False - if tokenizer_manager.server_args.reasoning_parser or ( - request.separate_reasoning - and is_reasoning_model(request.model) + if ( + tokenizer_manager.server_args.reasoning_parser + and request.separate_reasoning ): delta = DeltaMessage(role="assistant", reasoning_content="") else: @@ -1354,12 +1347,14 @@ async def generate_stream_resp(): delta = text[len(stream_buffer) :] new_stream_buffer = stream_buffer + delta - if tokenizer_manager.server_args.reasoning_parser or ( - request.separate_reasoning and is_reasoning_model(request.model) + if ( + tokenizer_manager.server_args.reasoning_parser + and request.separate_reasoning ): if index not in reasoning_parser_dict: reasoning_parser_dict[index] = ReasoningParser( - request.model, request.stream_reasoning + tokenizer_manager.server_args.reasoning_parser, + request.stream_reasoning, ) reasoning_parser = reasoning_parser_dict[index] parse_result = reasoning_parser.parse_stream_chunk(delta) diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index 286761c1ea4b..0c0aa09619cc 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -336,7 +336,7 @@ class ChatCompletionRequest(BaseModel): skip_special_tokens: bool 
= True lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None session_params: Optional[Dict] = None - separate_reasoning: bool = False + separate_reasoning: bool = True stream_reasoning: bool = True diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 5475f8e29521..b13ae8392331 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -4,14 +4,6 @@ REASONING_MODELS = ["deepseek-r1"] -def is_reasoning_model(model_name: str) -> bool: - """Checks if the model is a reasoning model.""" - for model in REASONING_MODELS: - if re.match(f".*{model}.*", model_name, re.IGNORECASE): - return True - return False - - class StreamingParseResult: """Result of streaming incremental parsing.""" @@ -23,41 +15,20 @@ def __init__(self, normal_text: str = "", reasoning_text: str = ""): class BaseReasoningFormatDetector: """Base class providing two sets of interfaces: one-time and streaming incremental.""" - def __init__(self, stream_reasoning: bool = False): - self._buffer = "" - self._in_reasoning = False - self._current_reasoning = "" + def __init__( + self, + think_start_token: str, + think_end_token: str, + force_reasoning: bool = False, + stream_reasoning: bool = False, + ): + self.think_start_token = think_start_token + self.think_end_token = think_end_token + self._in_reasoning = force_reasoning self.stream_reasoning = stream_reasoning - def detect_and_parse(self, text: str) -> StreamingParseResult: - """Parses the text in one go.""" - raise NotImplementedError - - def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: - """Streaming incremental parsing.""" - raise NotImplementedError - - -class DeepSeekR1Detector(BaseReasoningFormatDetector): - """ - Detector for DeepSeek-R1 model. - Assumes reasoning format: - (<think>)*(.*)</think> - Returns all the text before the </think> tag as `reasoning_text` - and the rest of the text as `normal_text`. 
- - Args: - stream_reasoning (bool): If False, accumulates reasoning content until the end tag. - If True, streams reasoning content as it arrives. - """ - - def __init__(self, stream_reasoning: bool = False): - super().__init__(stream_reasoning=stream_reasoning) - self.think_start_token = "<think>" - self.think_end_token = "</think>" - # DeepSeek-R1 is assumed to be reasoning until `</think>` token - # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599 - self._in_reasoning = True + self._buffer = "" + self._current_reasoning = "" self.stripped_think_start = False def detect_and_parse(self, text: str) -> StreamingParseResult: @@ -134,6 +105,25 @@ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult: return StreamingParseResult() +class DeepSeekR1Detector(BaseReasoningFormatDetector): + """ + Detector for DeepSeek-R1 model. + Assumes reasoning format: + (<think>)*(.*)</think> + Returns all the text before the </think> tag as `reasoning_text` + and the rest of the text as `normal_text`. + + Args: + stream_reasoning (bool): If False, accumulates reasoning content until the end tag. + If True, streams reasoning content as it arrives. 
+ """ + + def __init__(self, stream_reasoning: bool = False): + # DeepSeek-R1 is assumed to be reasoning until `</think>` token + super().__init__("<think>", "</think>", True, stream_reasoning=stream_reasoning) + # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599 + + class ReasoningParser: """ Parser that handles both streaming and non-streaming scenarios for extracting diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index c0fc2baa5200..b1d88a8c49de 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -34,7 +34,7 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct" DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" -DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" +DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 12a53cebe99b..d5fd6cbc132e 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -32,7 +32,6 @@ "test_openai_server.py", "test_pytorch_sampling_backend.py", "test_radix_attention.py", - "test_reasoning_content.py", "test_regex_constrained.py", "test_release_memory_occupation.py", "test_request_length_validation.py", @@ -58,6 +57,7 @@ "test_w8a8_quantization.py", "test_fp8_kernel.py", "test_block_int8.py", + "test_reasoning_content.py", ], "nightly": [ "test_nightly_gsm8k_eval.py", diff --git a/test/srt/test_reasoning_content.py 
b/test/srt/test_reasoning_content.py index a04d28eb2a33..09cd34c83fe9 100644 --- a/test/srt/test_reasoning_content.py +++ b/test/srt/test_reasoning_content.py @@ -1,4 +1,5 @@ """ +Usage: python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false @@ -10,13 +11,14 @@ import json import unittest +import openai import requests +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_REASONING_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, - kill_process_tree, popen_launch_server, ) @@ -26,213 +28,152 @@ class TestReasoningContentAPI(unittest.TestCase): def setUpClass(cls): cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-1234" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, + other_args=[ + "--reasoning-parser", + "deepseek-r1", + ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) - pass def test_streaming_separate_reasoning_false(self): + # Test streaming with separate_reasoning=False, reasoning_content should be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) payload = { "model": self.model, "messages": [ { "role": "user", - "content": "What is 1+1?", + "content": "What is 1+3?", } ], "max_tokens": 100, "stream": True, "separate_reasoning": False, } - response = requests.post( - f"{self.base_url}/v1/chat/completions", - json=payload, - stream=True, - ) + response = client.chat_completions.create(**payload) + assert response.status_code == 200 reasoning_content = "" content = "" - for line in response.iter_lines(): - 
print(f"[test_streaming_separate_reasoning_false] {line}") - if line and not line.startswith(b"data: [DONE]"): - parsed = json.loads(line[6:]) - if ( - "reasoning_content" in parsed["choices"][0]["delta"] - and parsed["choices"][0]["delta"]["reasoning_content"] - ): - reasoning_content += parsed["choices"][0]["delta"][ - "reasoning_content" - ] - if ( - "content" in parsed["choices"][0]["delta"] - and parsed["choices"][0]["delta"]["content"] - ): - content += parsed["choices"][0]["delta"]["content"] + for chunk in response: + if chunk.choices[0].delta.content: + content += chunk.choices[0].delta.content + elif chunk.choices[0].delta.reasoning_content: + reasoning_content += chunk.choices[0].delta.reasoning_content assert len(reasoning_content) == 0 assert len(content) > 0 def test_streaming_separate_reasoning_true(self): + # Test streaming with separate_reasoning=True, reasoning_content should not be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) payload = { "model": self.model, "messages": [ { "role": "user", - "content": "What is 1+1?", + "content": "What is 1+3?", } ], "max_tokens": 100, "stream": True, "separate_reasoning": True, } - response = requests.post( - f"{self.base_url}/v1/chat/completions", - json=payload, - stream=True, - ) + response = client.chat_completions.create(**payload) + assert response.status_code == 200 reasoning_content = "" content = "" - for line in response.iter_lines(): - print(f"[test_streaming_separate_reasoning_true] {line}") - if line and not line.startswith(b"data: [DONE]"): - parsed = json.loads(line[6:]) - if ( - "reasoning_content" in parsed["choices"][0]["delta"] - and parsed["choices"][0]["delta"]["reasoning_content"] - ): - reasoning_content += parsed["choices"][0]["delta"][ - "reasoning_content" - ] - if ( - "content" in parsed["choices"][0]["delta"] - and parsed["choices"][0]["delta"]["content"] - ): - content += parsed["choices"][0]["delta"]["content"] + for chunk in response: + if 
chunk.choices[0].delta.content: + content += chunk.choices[0].delta.content + elif chunk.choices[0].delta.reasoning_content: + reasoning_content += chunk.choices[0].delta.reasoning_content assert len(reasoning_content) > 0 + assert len(content) > 0 - def test_nonstreaming_separate_reasoning_false(self): - payload = { - "model": self.model, - "messages": [ - { - "role": "user", - "content": "What is 1+1?", - } - ], - "max_tokens": 100, - "separate_reasoning": False, - } - response = requests.post( - f"{self.base_url}/v1/chat/completions", - json=payload, - ) - assert response.status_code == 200 - resp = response.json() - assert resp["choices"][0]["message"]["reasoning_content"] == None - assert len(resp["choices"][0]["message"]["content"]) > 0 - - def test_nonstreaming_separate_reasoning_true(self): + def test_streaming_separate_reasoning_true_stream_reasoning_false(self): + # Test streaming with separate_reasoning=True, reasoning_content should not be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) payload = { "model": self.model, "messages": [ { "role": "user", - "content": "What is 1+1?", + "content": "What is 1+3?", } ], "max_tokens": 100, + "stream": True, "separate_reasoning": True, + "stream_reasoning": False, } - response = requests.post( - f"{self.base_url}/v1/chat/completions", - json=payload, - ) - assert response.status_code == 200 - resp = response.json() - assert len(resp["choices"][0]["message"]["reasoning_content"]) > 0 - + response = client.chat_completions.create(**payload) -class TestReasoningContentStartup(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--reasoning-parser", - "deepseek-r1", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) + assert 
response.status_code == 200 + reasoning_content = "" + content = "" + first_chunk = False + for chunk in response: + if chunk.choices[0].delta.content: + first_chunk = True + content += chunk.choices[0].delta.content + reasoning_content = chunk.choices[0].delta.reasoning_content + if not first_chunk: + assert len(chunk.choices[0].delta.reasoning_content) == 0 + assert len(reasoning_content) > 0 + assert len(content) > 0 - def test_nonstreaming(self): + def test_nonstreaming_separate_reasoning_false(self): + # Test non-streaming with separate_reasoning=False, reasoning_content should be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) payload = { "model": self.model, "messages": [ { "role": "user", - "content": "What is 1+1?", + "content": "What is 1+3?", } ], "max_tokens": 100, + "separate_reasoning": False, } - response = requests.post( - f"{self.base_url}/v1/chat/completions", - json=payload, - ) + response = client.chat_completions.create(**payload) + assert response.status_code == 200 - resp = response.json() - assert len(resp["choices"][0]["message"]["reasoning_content"]) > 0 + assert len(response.choices[0].message.reasoning_content) == 0 + assert len(response.choices[0].message.content) > 0 - def test_streaming(self): + def test_nonstreaming_separate_reasoning_true(self): + # Test non-streaming with separate_reasoning=True, reasoning_content should not be empty + client = openai.Client(api_key=self.api_key, base_url=self.base_url) payload = { "model": self.model, "messages": [ { "role": "user", - "content": "What is 1+1?", + "content": "What is 1+3?", } ], "max_tokens": 100, - "stream": True, + "separate_reasoning": True, } - response = requests.post( - f"{self.base_url}/v1/chat/completions", - json=payload, - stream=True, - ) + response = client.chat_completions.create(**payload) + assert response.status_code == 200 - reasoning_content = "" - content = "" - for line in response.iter_lines(): - 
print(f"[test_streaming_separate_reasoning_true] {line}") - if line and not line.startswith(b"data: [DONE]"): - parsed = json.loads(line[6:]) - if ( - "reasoning_content" in parsed["choices"][0]["delta"] - and parsed["choices"][0]["delta"]["reasoning_content"] - ): - reasoning_content += parsed["choices"][0]["delta"][ - "reasoning_content" - ] - if ( - "content" in parsed["choices"][0]["delta"] - and parsed["choices"][0]["delta"]["content"] - ): - content += parsed["choices"][0]["delta"]["content"] + assert len(response.choices[0].message.reasoning_content) > 0 + assert len(response.choices[0].message.content) > 0 - assert len(reasoning_content) > 0 + +if __name__ == "__main__": + unittest.main() From 411473b724f65f9fd8c64e835cae6b3e7d90ec25 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Sun, 2 Mar 2025 21:15:25 +0800 Subject: [PATCH 23/27] revert dockerfile changes --- docker/Dockerfile | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 943b2762656b..3ae74a8cccbe 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -25,13 +25,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN pip3 install datamodel_code_generator WORKDIR /sgl-workspace -ARG SGL_REPO="https://github.com/sgl-project/sglang" -ENV SGL_DEFAULT="main" -ARG SGL_BRANCH=${SGL_DEFAULT} ARG CUDA_VERSION RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ - && git clone ${SGL_REPO} \ + && git clone --depth=1 https://github.com/sgl-project/sglang.git \ && if [ "$CUDA_VERSION" = "12.1.1" ]; then \ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu121; \ elif [ "$CUDA_VERSION" = "12.4.1" ]; then \ @@ -45,12 +42,6 @@ RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \ fi \ && cd sglang \ - && if [ "${SGL_BRANCH}" = ${SGL_DEFAULT} ]; then \ - echo "Using ${SGL_DEFAULT}, default 
branch."; \ - else \ - echo "Using ${SGL_BRANCH} branch."; \ - git checkout ${SGL_BRANCH}; \ - fi \ && if [ "$BUILD_TYPE" = "srt" ]; then \ if [ "$CUDA_VERSION" = "12.1.1" ]; then \ python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer-python; \ From ce6c4850c3cecb0d84aae8a187aa74fa82d9a237 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Sun, 2 Mar 2025 21:20:06 +0800 Subject: [PATCH 24/27] add more testcases --- test/srt/test_reasoning_content.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/srt/test_reasoning_content.py b/test/srt/test_reasoning_content.py index 09cd34c83fe9..dc71758ac0db 100644 --- a/test/srt/test_reasoning_content.py +++ b/test/srt/test_reasoning_content.py @@ -2,6 +2,7 @@ Usage: python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true +python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true_stream_reasoning_false python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false python3 -m unittest test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_true python3 -m unittest test_reasoning_content.TestReasoningContentStartup.test_nonstreaming From 022590a9414e60e46a769995f66869d4da36bc99 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Sun, 2 Mar 2025 21:25:05 +0800 Subject: [PATCH 25/27] add main for unit tests --- python/sglang/srt/reasoning_parser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index b13ae8392331..5b7dcc71a34c 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -156,3 +156,7 @@ def parse_non_stream(self, full_text: str) -> StreamingParseResult: def 
parse_stream_chunk(self, chunk_text: str) -> StreamingParseResult: """Streaming call: incremental parsing""" return self.detector.parse_streaming_increment(chunk_text) + + +if __name__ == "__main__": + unittest.main() From 98be910d154f81fa85fcfc92e02029c78980944a Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Sun, 2 Mar 2025 21:26:27 +0800 Subject: [PATCH 26/27] revert some typos --- python/sglang/srt/reasoning_parser.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 5b7dcc71a34c..b13ae8392331 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -156,7 +156,3 @@ def parse_non_stream(self, full_text: str) -> StreamingParseResult: def parse_stream_chunk(self, chunk_text: str) -> StreamingParseResult: """Streaming call: incremental parsing""" return self.detector.parse_streaming_increment(chunk_text) - - -if __name__ == "__main__": - unittest.main() From 9ff2a19338978f3b9493cd8ce5d8bbdeace32f04 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Sun, 2 Mar 2025 21:33:34 +0800 Subject: [PATCH 27/27] fix(reasoning content): :bug: fix typos --- test/srt/test_reasoning_content.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/srt/test_reasoning_content.py b/test/srt/test_reasoning_content.py index dc71758ac0db..d2f34a35c10f 100644 --- a/test/srt/test_reasoning_content.py +++ b/test/srt/test_reasoning_content.py @@ -60,7 +60,7 @@ def test_streaming_separate_reasoning_false(self): "stream": True, "separate_reasoning": False, } - response = client.chat_completions.create(**payload) + response = client.chat.completions.create(**payload) assert response.status_code == 200 reasoning_content = "" @@ -89,7 +89,7 @@ def test_streaming_separate_reasoning_true(self): "stream": True, "separate_reasoning": True, } - response = client.chat_completions.create(**payload) + response = client.chat.completions.create(**payload) 
assert response.status_code == 200 reasoning_content = "" @@ -119,7 +119,7 @@ def test_streaming_separate_reasoning_true_stream_reasoning_false(self): "separate_reasoning": True, "stream_reasoning": False, } - response = client.chat_completions.create(**payload) + response = client.chat.completions.create(**payload) assert response.status_code == 200 reasoning_content = "" @@ -149,7 +149,7 @@ def test_nonstreaming_separate_reasoning_false(self): "max_tokens": 100, "separate_reasoning": False, } - response = client.chat_completions.create(**payload) + response = client.chat.completions.create(**payload) assert response.status_code == 200 assert len(response.choices[0].message.reasoning_content) == 0 @@ -169,7 +169,7 @@ def test_nonstreaming_separate_reasoning_true(self): "max_tokens": 100, "separate_reasoning": True, } - response = client.chat_completions.create(**payload) + response = client.chat.completions.create(**payload) assert response.status_code == 200 assert len(response.choices[0].message.reasoning_content) > 0