Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tests/reasoning/test_granite_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from transformers import AutoTokenizer

from tests.reasoning.utils import DeltaMessage, run_reasoning_extraction
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.reasoning import ReasoningParser, ReasoningParserManager

parser_name = "granite"
Expand Down Expand Up @@ -333,6 +334,7 @@ def test_streaming_subcases(param_dict):
previous_token_ids=previous_token_ids,
current_token_ids=current_token_ids,
delta_token_ids=delta_token_ids,
request=ChatCompletionRequest(model="test-model"),
)
# Streaming currently expects at least one of reasoning content / content,
# so the response should return None in that case.
Expand Down
2 changes: 2 additions & 0 deletions tests/reasoning/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def run_reasoning_extraction_streaming(
previous_tokens,
current_tokens,
token_delta,
request,
)
if delta_message is not None:
reconstructor.append_delta(delta_message)
Expand Down Expand Up @@ -150,6 +151,7 @@ def run_reasoning_extraction_streaming_mistral(
previous_tokens,
current_tokens,
token_delta,
request,
)
if delta_message is not None:
reconstructor.append_delta(delta_message)
Expand Down
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,7 @@ async def chat_completion_stream_generator(
previous_token_ids,
current_token_ids,
output.token_ids,
request,
)
)
# When encountering think end id in delta_token_ids
Expand Down Expand Up @@ -953,6 +954,7 @@ async def chat_completion_stream_generator(
previous_token_ids,
current_token_ids,
output_token_ids,
request,
)
)
# When encountering think end id in prompt_token_ids
Expand Down Expand Up @@ -1039,6 +1041,7 @@ async def chat_completion_stream_generator(
previous_token_ids,
current_token_ids,
output.token_ids,
request,
)
)
# handle streaming just a content delta
Expand Down
1 change: 1 addition & 0 deletions vllm/entrypoints/openai/serving_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,7 @@ async def _process_simple_streaming_events(
previous_token_ids=previous_token_ids,
current_token_ids=previous_token_ids + output.token_ids,
delta_token_ids=output.token_ids,
request=request,
)
)
else:
Expand Down
2 changes: 2 additions & 0 deletions vllm/reasoning/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
from .basic_parsers import BaseThinkingReasoningParser
from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .deepseek_v31_reasoning_parser import DeepSeekV31ReasoningParser
from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
from .gptoss_reasoning_parser import GptOssReasoningParser
from .granite_reasoning_parser import GraniteReasoningParser
Expand All @@ -19,6 +20,7 @@
"BaseThinkingReasoningParser",
"ReasoningParserManager",
"DeepSeekR1ReasoningParser",
"DeepSeekV31ReasoningParser",
"GraniteReasoningParser",
"HunyuanA13BReasoningParser",
"Qwen3ReasoningParser",
Expand Down
1 change: 1 addition & 0 deletions vllm/reasoning/abs_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
"""
Instance method that should be implemented for extracting reasoning
Expand Down
1 change: 1 addition & 0 deletions vllm/reasoning/basic_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
"""
Extract reasoning content from a delta message.
Expand Down
8 changes: 7 additions & 1 deletion vllm/reasoning/deepseek_r1_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
from collections.abc import Sequence
from typing import Union

from vllm.entrypoints.openai.protocol import DeltaMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ResponsesRequest,
)
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser

Expand Down Expand Up @@ -36,6 +40,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
ret = super().extract_reasoning_content_streaming(
previous_text,
Expand All @@ -44,6 +49,7 @@ def extract_reasoning_content_streaming(
previous_token_ids,
current_token_ids,
delta_token_ids,
request,
)
if (
ret is not None
Expand Down
55 changes: 55 additions & 0 deletions vllm/reasoning/deepseek_v31_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Optional, Union

from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import DeepSeekR1ReasoningParser, ReasoningParserManager

logger = init_logger(__name__)


@ReasoningParserManager.register_module("deepseek_v31")
class DeepSeekV31ReasoningParser(DeepSeekR1ReasoningParser):
    """Reasoning parser for DeepSeek V3.1 models.

    Unlike DeepSeek R1, reasoning extraction here is opt-in: it only runs
    when the request's ``chat_template_kwargs`` explicitly contains
    ``thinking=True``. In every other case the whole model output is
    treated as ordinary content.
    """

    @staticmethod
    def _thinking_enabled(
        request: Union[ChatCompletionRequest, ResponsesRequest],
    ) -> bool:
        # Reasoning is parsed only when the caller explicitly opted in via
        # chat_template_kwargs={"thinking": True}; a missing kwargs dict or
        # any non-True value disables it.
        kwargs = request.chat_template_kwargs
        return kwargs is not None and kwargs.get("thinking", False) is True

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: Union[ChatCompletionRequest, ResponsesRequest],
    ) -> Union[DeltaMessage, None]:
        """Streaming extraction.

        Delegates to the DeepSeek R1 parser when thinking is enabled;
        otherwise passes the delta straight through as plain content.
        """
        if not self._thinking_enabled(request):
            return DeltaMessage(content=delta_text)
        return super().extract_reasoning_content_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
            request,
        )

    def extract_reasoning_content(
        self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest]
    ) -> tuple[Optional[str], Optional[str]]:
        """Non-streaming extraction.

        Returns ``(reasoning, content)`` via the R1 parser when thinking is
        enabled; otherwise ``(None, model_output)`` — everything is content.
        """
        if not self._thinking_enabled(request):
            return None, model_output
        return super().extract_reasoning_content(model_output, request)
9 changes: 7 additions & 2 deletions vllm/reasoning/glm4_moe_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@

from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

Expand Down Expand Up @@ -80,6 +84,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
"""
Extract reasoning content from a delta message.
Expand Down Expand Up @@ -136,7 +141,7 @@ def extract_reasoning_content_streaming(
return DeltaMessage(content=delta_text)

def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest]
) -> tuple[Optional[str], Optional[str]]:
"""
Extract reasoning content from the model output.
Expand Down
9 changes: 7 additions & 2 deletions vllm/reasoning/gptoss_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.harmony_utils import parse_chat_output
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

Expand Down Expand Up @@ -53,6 +57,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
prev_reasoning, prev_content, _ = parse_chat_output(list(previous_token_ids))
cur_reasoning, cur_content, _ = parse_chat_output(list(current_token_ids))
Expand All @@ -77,7 +82,7 @@ def extract_reasoning_content_streaming(
def extract_reasoning_content(
self,
model_output: str,
request: ChatCompletionRequest,
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> tuple[Optional[str], Optional[str]]:
raise NotImplementedError(
"gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used." # noqa: E501
Expand Down
9 changes: 7 additions & 2 deletions vllm/reasoning/granite_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
import regex as re
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

Expand Down Expand Up @@ -52,7 +56,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
)

def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest]
) -> tuple[Optional[str], Optional[str]]:
"""Extract the reasoning content & content sections, respectively.
If the sequence doesn't match what we expect, i.e., the model generates
Expand Down Expand Up @@ -82,6 +86,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
"""Extract the reasoning content / content emitted by granite models;
If the sequence doesn't match what we expect, i.e., the model generates
Expand Down
9 changes: 7 additions & 2 deletions vllm/reasoning/hunyuan_a13b_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
import regex as re
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

Expand Down Expand Up @@ -89,7 +93,7 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return []

def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest]
) -> tuple[Optional[str], Optional[str]]:
"""Extract the reasoning content & content sections, respectively.
If the sequence doesn't match what we expect, i.e., the model generates
Expand Down Expand Up @@ -150,6 +154,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
"""Extract content using token ID sequence state machine"""
# Define sequences
Expand Down
1 change: 1 addition & 0 deletions vllm/reasoning/olmo3_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
"""Extract content using token ID sequence state machine"""

Expand Down
9 changes: 7 additions & 2 deletions vllm/reasoning/step3_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
import regex as re
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

Expand Down Expand Up @@ -50,6 +54,7 @@ def extract_reasoning_content_streaming(
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: Union[ChatCompletionRequest, ResponsesRequest],
) -> Union[DeltaMessage, None]:
"""
Extract reasoning content from a delta message.
Expand Down Expand Up @@ -80,7 +85,7 @@ def extract_reasoning_content_streaming(
return DeltaMessage(reasoning_content=delta_text)

def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest]
) -> tuple[Optional[str], Optional[str]]:
# Check if the model output contains the </think> token
if self.think_end_token not in model_output:
Expand Down