diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py index de1663408d72..e86ad4c4d236 100644 --- a/tests/reasoning/test_granite_reasoning_parser.py +++ b/tests/reasoning/test_granite_reasoning_parser.py @@ -4,6 +4,7 @@ from transformers import AutoTokenizer from tests.reasoning.utils import DeltaMessage, run_reasoning_extraction +from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.reasoning import ReasoningParser, ReasoningParserManager parser_name = "granite" @@ -333,6 +334,7 @@ def test_streaming_subcases(param_dict): previous_token_ids=previous_token_ids, current_token_ids=current_token_ids, delta_token_ids=delta_token_ids, + request=ChatCompletionRequest(model="test-model"), ) # Streaming currently expects at least one of reasoning content / content, # so the response should return None in that case. diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py index 788136e99681..bcf21437fe28 100644 --- a/tests/reasoning/utils.py +++ b/tests/reasoning/utils.py @@ -118,6 +118,7 @@ def run_reasoning_extraction_streaming( previous_tokens, current_tokens, token_delta, + request, ) if delta_message is not None: reconstructor.append_delta(delta_message) @@ -150,6 +151,7 @@ def run_reasoning_extraction_streaming_mistral( previous_tokens, current_tokens, token_delta, + request, ) if delta_message is not None: reconstructor.append_delta(delta_message) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 51c618e9d51d..a9c83e2b5abd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -855,6 +855,7 @@ async def chat_completion_stream_generator( previous_token_ids, current_token_ids, output.token_ids, + request, ) ) # When encountering think end id in delta_token_ids @@ -953,6 +954,7 @@ async def chat_completion_stream_generator( previous_token_ids, current_token_ids, output_token_ids, + request, ) ) # When encountering think end id in prompt_token_ids @@ -1039,6 +1041,7 @@ async def chat_completion_stream_generator( previous_token_ids, current_token_ids, output.token_ids, + request, ) ) # handle streaming just a content delta diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 48c5222bccc9..8f2de50d985d 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -1127,6 +1127,7 @@ async def _process_simple_streaming_events( previous_token_ids=previous_token_ids, current_token_ids=previous_token_ids + output.token_ids, delta_token_ids=output.token_ids, + request=request, ) ) else: diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 78d3bf35f2a3..5436c75b5b1f 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -4,6 +4,7 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .basic_parsers import BaseThinkingReasoningParser from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from .deepseek_v31_reasoning_parser import DeepSeekV31ReasoningParser from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser from .gptoss_reasoning_parser import GptOssReasoningParser from .granite_reasoning_parser import GraniteReasoningParser @@ -19,6 +20,7 @@ "BaseThinkingReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser", + "DeepSeekV31ReasoningParser", "GraniteReasoningParser", "HunyuanA13BReasoningParser", "Qwen3ReasoningParser", diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 2d93f0702f72..d4bda3c31654 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -107,6 +107,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: """ Instance method that should be implemented for extracting reasoning diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index b4106a4f5794..2ae85354f2c6 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -78,6 +78,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: """ Extract reasoning content from a delta message. diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py index 264da54b4879..643f2a24d143 100644 --- a/vllm/reasoning/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -4,7 +4,11 @@ from collections.abc import Sequence from typing import Union -from vllm.entrypoints.openai.protocol import DeltaMessage +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ResponsesRequest, +) from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser @@ -36,6 +40,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: ret = super().extract_reasoning_content_streaming( previous_text, @@ -44,6 +49,7 @@ def extract_reasoning_content_streaming( previous_token_ids, current_token_ids, delta_token_ids, + request, ) if ( ret is not None diff --git a/vllm/reasoning/deepseek_v31_reasoning_parser.py b/vllm/reasoning/deepseek_v31_reasoning_parser.py new file mode 100644 index 000000000000..1636cc4adbae --- /dev/null +++ b/vllm/reasoning/deepseek_v31_reasoning_parser.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence +from typing import Optional, Union + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ResponsesRequest, +) +from vllm.logger import init_logger +from vllm.reasoning import DeepSeekR1ReasoningParser, ReasoningParserManager + +logger = init_logger(__name__) + + +@ReasoningParserManager.register_module("deepseek_v31") +class DeepSeekV31ReasoningParser(DeepSeekR1ReasoningParser): + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], + ) -> Union[DeltaMessage, None]: + if ( + request.chat_template_kwargs is not None + and request.chat_template_kwargs.get("thinking", False) is True + ): + return super().extract_reasoning_content_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + request, + ) + + return DeltaMessage(content=delta_text) + + def extract_reasoning_content( + self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest] + ) -> tuple[Optional[str], Optional[str]]: + if ( + request.chat_template_kwargs is not None + and request.chat_template_kwargs.get("thinking", False) is True + ): + return super().extract_reasoning_content(model_output, request) + + return None, model_output diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py index da98515c7e62..309aa61f5ce0 100644 --- a/vllm/reasoning/glm4_moe_reasoning_parser.py +++ b/vllm/reasoning/glm4_moe_reasoning_parser.py @@ -6,7 +6,11 @@ from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ResponsesRequest, +) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -80,6 +84,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: """ Extract reasoning content from a delta message. @@ -136,7 +141,7 @@ def extract_reasoning_content_streaming( return DeltaMessage(content=delta_text) def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest] ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from the model output. diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 738c7b51694a..3c36ccb22a06 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -7,7 +7,11 @@ from transformers import PreTrainedTokenizerBase from vllm.entrypoints.harmony_utils import parse_chat_output -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ResponsesRequest, +) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -53,6 +57,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: prev_reasoning, prev_content, _ = parse_chat_output(list(previous_token_ids)) cur_reasoning, cur_content, _ = parse_chat_output(list(current_token_ids)) @@ -77,7 +82,7 @@ def extract_reasoning_content_streaming( def extract_reasoning_content( self, model_output: str, - request: ChatCompletionRequest, + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> tuple[Optional[str], Optional[str]]: raise NotImplementedError( "gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used." # noqa: E501 diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 543b202989ee..955184f5531a 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -7,7 +7,11 @@ import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ResponsesRequest, +) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -52,7 +56,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): ) def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest] ) -> tuple[Optional[str], Optional[str]]: """Extract the reasoning content & content sections, respectively. If the sequence doesn't match what we expect, i.e., the model generates @@ -82,6 +86,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: """Extract the reasoning content / content emitted by granite models; If the sequence doesn't match what we expect, i.e., the model generates diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py index 381f1b5f3466..cb62cf040b7e 100644 --- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py +++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py @@ -7,7 +7,11 @@ import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ResponsesRequest, +) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -89,7 +93,7 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]: return [] def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest] ) -> tuple[Optional[str], Optional[str]]: """Extract the reasoning content & content sections, respectively. If the sequence doesn't match what we expect, i.e., the model generates @@ -150,6 +154,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: """Extract content using token ID sequence state machine""" # Define sequences diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py index b330e8b1fdd5..240e5ee2aeda 100644 --- a/vllm/reasoning/olmo3_reasoning_parser.py +++ b/vllm/reasoning/olmo3_reasoning_parser.py @@ -287,6 +287,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: """Extract content using token ID sequence state machine""" diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py index c9f580077b33..481a788cca59 100644 --- a/vllm/reasoning/step3_reasoning_parser.py +++ b/vllm/reasoning/step3_reasoning_parser.py @@ -7,7 +7,11 @@ import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ResponsesRequest, +) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -50,6 +54,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> Union[DeltaMessage, None]: """ Extract reasoning content from a delta message. @@ -80,7 +85,7 @@ def extract_reasoning_content_streaming( return DeltaMessage(reasoning_content=delta_text) def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest] ) -> tuple[Optional[str], Optional[str]]: # Check if the model output contains the token if self.think_end_token not in model_output: