Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9022718
feat(parser): add optional parameter to reasoning parser constructor
taohui Sep 24, 2025
81e0ee4
Update vllm/reasoning/basic_parsers.py
taohui Sep 24, 2025
7d88d77
fix(parser): ensure subclasses forward *args and **kwargs to super().…
taohui Sep 24, 2025
6d7640f
Add a deepseek_v3 reasoning parser that supports dynamically enabling…
taohui Sep 24, 2025
96e7756
docs: update documentation
taohui Sep 24, 2025
f8ba7b0
Update vllm/reasoning/deepseek_v3_reasoning_parser.py
taohui Sep 24, 2025
64c8ae9
fix code style
taohui Sep 24, 2025
8eb2030
fix code style
taohui Sep 24, 2025
4e0949c
fix code style
taohui Sep 24, 2025
2899d98
fix code style
taohui Sep 24, 2025
ae57f85
remove multiple-blanks
taohui Sep 24, 2025
9ebdd28
Merge branch 'main' into deepseek_v3.1_reasoning_parser_add_parser
taohui Sep 26, 2025
0aedfd7
Trigger CI
taohui Sep 26, 2025
ad3a8bb
Merge branch 'main' into deepseek_v3.1_reasoning_parser_add_parser
taohui Sep 26, 2025
57a20fb
Merge branch 'main' into deepseek_v3.1_reasoning_parser_add_parser
taohui Sep 26, 2025
5cdc38e
Merge branch 'main' into deepseek_v3.1_reasoning_parser_add_parser
taohui Sep 26, 2025
2bbc1de
refactor: change reasoning parser init to use chat_template_kwargs wr…
taohui Oct 11, 2025
8c2e0dd
Merge branch 'deepseek_v3.1_reasoning_parser_add_parser' of https://g…
taohui Oct 11, 2025
fe680b6
fix code style
taohui Oct 11, 2025
aedd71f
fix code style
taohui Oct 11, 2025
17b1fde
Merge branch 'main' into deepseek_v3.1_reasoning_parser_add_parser
chaunceyjiang Oct 11, 2025
3795258
Trigger CI/CD
taohui Oct 11, 2025
e1798b4
Merge branch 'deepseek_v3.1_reasoning_parser_add_parser' of https://g…
taohui Oct 11, 2025
7cd8527
Merge branch 'main' into deepseek_v3.1_reasoning_parser_add_parser
taohui Oct 13, 2025
99db960
Merge branch 'main' into deepseek_v3.1_reasoning_parser_add_parser
taohui Oct 13, 2025
a7b8581
refactor: update extract_reasoning_content and extract_reasoning_con…
taohui Oct 13, 2025
008e6eb
Trigger CI/CD
taohui Oct 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/features/reasoning_outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
Expand All @@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models:
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |

!!! note
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
    Reasoning is disabled by default for IBM Granite 3.2 and DeepSeek-V3.1; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
DeepSeek-V3.1 tool calling is supported in non-thinking mode.

## Quickstart

Expand Down
76 changes: 76 additions & 0 deletions tests/reasoning/test_deepseekv3_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from transformers import AutoTokenizer

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning import (
DeepSeekR1ReasoningParser,
DeepSeekV3ReasoningParser,
IdentityReasoningParser,
)

REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1"


@pytest.fixture(scope="module")
def tokenizer():
    """Load the DeepSeek-V3.1 tokenizer once for the whole test module."""
    tok = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tok


@pytest.mark.parametrize(
    "thinking,expected_parser_type",
    [
        (True, DeepSeekR1ReasoningParser),
        (False, IdentityReasoningParser),
    ],
)
def test_parser_selection(tokenizer, thinking, expected_parser_type):
    """The V3 wrapper delegates to the R1 parser when thinking=True and to
    the identity parser otherwise."""
    template_kwargs = {"thinking": thinking}
    parser = DeepSeekV3ReasoningParser(
        tokenizer, chat_template_kwargs=template_kwargs
    )

    delegate = parser._parser
    assert isinstance(delegate, expected_parser_type)


def test_identity_reasoning_parser_basic(tokenizer):
    """Exercise every IdentityReasoningParser hook on simple inputs."""
    parser = IdentityReasoningParser(tokenizer)

    sample_text = "This is some output"
    sample_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample_text))

    # Reasoning is never treated specially: the end marker is always reached
    # and every token is content.
    assert parser.is_reasoning_end(sample_ids) is True
    assert parser.extract_content_ids(sample_ids) == sample_ids

    # Non-streaming extraction: no reasoning, full text as content.
    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
    reasoning, content = parser.extract_reasoning_content(sample_text, request)
    assert reasoning is None
    assert content == sample_text

    # Streaming: a non-empty delta is wrapped in a DeltaMessage...
    delta = parser.extract_reasoning_content_streaming(
        previous_text="",
        current_text="Hello world",
        delta_text="Hello world",
        previous_token_ids=[],
        current_token_ids=sample_ids,
        delta_token_ids=sample_ids,
    )
    assert isinstance(delta, DeltaMessage)
    assert delta.content == "Hello world"

    # ...while an empty delta yields None.
    empty_delta = parser.extract_reasoning_content_streaming(
        previous_text="Hello world",
        current_text="Hello world",
        delta_text="",
        previous_token_ids=sample_ids,
        current_token_ids=sample_ids,
        delta_token_ids=[],
    )
    assert empty_delta is None
10 changes: 8 additions & 2 deletions vllm/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,10 @@ async def chat_completion_stream_generator(

try:
if self.reasoning_parser:
reasoning_parser = self.reasoning_parser(tokenizer)
reasoning_parser = self.reasoning_parser(
tokenizer,
chat_template_kwargs=request.chat_template_kwargs, # type: ignore
)
except RuntimeError as e:
logger.exception("Error in reasoning parser creation.")
data = self.create_streaming_error_response(str(e))
Expand Down Expand Up @@ -1342,7 +1345,10 @@ async def chat_completion_full_generator(

if self.reasoning_parser:
try:
reasoning_parser = self.reasoning_parser(tokenizer)
reasoning_parser = self.reasoning_parser(
tokenizer,
chat_template_kwargs=request.chat_template_kwargs, # type: ignore
)
except RuntimeError as e:
logger.exception("Error in reasoning parser creation.")
return self.create_error_response(str(e))
Expand Down
4 changes: 4 additions & 0 deletions vllm/reasoning/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
from .basic_parsers import BaseThinkingReasoningParser
from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
from .ernie45_reasoning_parser import Ernie45ReasoningParser
from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
from .gptoss_reasoning_parser import GptOssReasoningParser
from .granite_reasoning_parser import GraniteReasoningParser
from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser
from .identity_reasoning_parser import IdentityReasoningParser
from .mistral_reasoning_parser import MistralReasoningParser
from .olmo3_reasoning_parser import Olmo3ReasoningParser
from .qwen3_reasoning_parser import Qwen3ReasoningParser
Expand All @@ -20,6 +22,8 @@
"BaseThinkingReasoningParser",
"ReasoningParserManager",
"DeepSeekR1ReasoningParser",
"IdentityReasoningParser",
"DeepSeekV3ReasoningParser",
"Ernie45ReasoningParser",
"GraniteReasoningParser",
"HunyuanA13BReasoningParser",
Expand Down
66 changes: 66 additions & 0 deletions vllm/reasoning/deepseek_v3_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import (
DeepSeekR1ReasoningParser,
ReasoningParser,
ReasoningParserManager,
)

from .identity_reasoning_parser import IdentityReasoningParser

logger = init_logger(__name__)


@ReasoningParserManager.register_module("deepseek_v3")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @taohui, have you tested this ReasoningParser deepseek_v3 with https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not yet, I’ll test it today.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

waiting for your testing results..

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance is perfect — with --reasoning-parser deepseek_v3 enabled, it parses correctly. Without it, the performance is the same as deepseek_v3.1.

model_name = "DeepSeek-V3.2-Exp"
extra_body_thinking = {"chat_template_kwargs": {"thinking": True}}
extra_body_nonthinking = {"chat_template_kwargs": {"thinking": False}}

response = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "user", "content": "What’s the weather like tomorrow?"}
    ],
    extra_body=extra_body_thinking
)
print("=== Thinking response ===")
print(response)

response = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "user", "content": "What’s the weather like tomorrow?"}
    ],
    extra_body=extra_body_nonthinking
)
print("=== NonThinking response ===")
print(response)

Output is:

=== Thinking response ===
ChatCompletion(id='chatcmpl-4984722fc97c4ee2ace19026f8825ee3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='I can't check real-time weather, but you can get a reliable forecast for tomorrow by:\n\n• Searching online for "weather [your city]" \n• Asking your phone's assistant (Siri/Google Assistant) \n• Checking a weather app like The Weather Channel, AccuWeather, or your default phone app \n\nLet me know if you need help interpreting any weather terms once you have the forecast! ☀️🌧️⛅', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content="Hmm, the user is asking about tomorrow's weather. This is a straightforward request, but I don't have access to real-time data. \n\nI need to acknowledge the limitation upfront while offering helpful alternatives. The user likely wants actionable information, so suggesting reliable weather sources would be useful. \n\nI can list a few trusted options like weather apps and websites, and offer to help interpret the forecast if they provide their location. Keeping it concise but practical seems best here."), stop_reason=None, token_ids=None)], created=1760363936, model='DeepSeek-V3.2-Exp', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=194, prompt_tokens=14, total_tokens=208, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, prompt_token_ids=None, kv_transfer_params=None)
=== NonThinking response ===
ChatCompletion(id='chatcmpl-fc86fdc39d424c46897cdc18406b4267', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='I can check the latest weather forecast for you! Please enable "联网搜索" in the app settings, and I’ll fetch real-time weather information for your location. Alternatively, you can tell me your city or region, and I’ll look it up for you! 😊', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content=None), stop_reason=None, token_ids=None)], created=1760363945, model='DeepSeek-V3.2-Exp', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=60, prompt_tokens=14, total_tokens=74, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, prompt_token_ids=None, kv_transfer_params=None)

If start without --reasoning-parser deepseek_v3, the response is:

=== Thinking response ===
ChatCompletion(id='chatcmpl-0e0e975c6eae4e0a97cccbd41dc4c039', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Hmm, the user is asking about tomorrow's weather. This is a straightforward request, but I don't have access to real-time data. \n\nI need to acknowledge the limitation upfront while offering helpful alternatives. The user likely wants actionable information, so suggesting reliable weather sources would be useful. \n\nI can list a few trusted options like weather apps and websites, and offer to help interpret the forecast if they provide their location. Keeping it concise but practical seems best here.I can't check real-time weather, but you can get a reliable forecast for tomorrow by:\n\n• Searching online for "weather [your city]" \n• Asking your phone's assistant (Siri/Google Assistant) \n• Checking a weather app like The Weather Channel, AccuWeather, or your default phone app \n\nLet me know if you need help interpreting any weather terms once you have the forecast! ☀️🌧️⛅', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content=None), stop_reason=None, token_ids=None)], created=1760364966, model='DeepSeek-V3.2-Exp', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=194, prompt_tokens=14, total_tokens=208, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, prompt_token_ids=None, kv_transfer_params=None)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hope to merge this PR into main branch ASAP. 👏

class DeepSeekV3ReasoningParser(ReasoningParser):
    """
    V3 parser that delegates to either DeepSeekR1ReasoningParser or
    IdentityReasoningParser based on the `thinking` flag in
    `chat_template_kwargs`.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        """Select the delegate parser from ``chat_template_kwargs``.

        Args:
            tokenizer: Tokenizer forwarded to the base class and the delegate.
            **kwargs: May contain ``chat_template_kwargs`` (a dict or None);
                its ``thinking`` entry (default False) decides the delegate.
        """
        super().__init__(tokenizer, *args, **kwargs)

        # Pop so the delegate parsers do not receive an unexpected
        # chat_template_kwargs argument.
        chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
        # Use .get, not .pop: chat_kwargs may be the caller's
        # request.chat_template_kwargs dict, and removing "thinking" would
        # mutate the request object as a side effect.
        thinking = bool(chat_kwargs.get("thinking", False))

        self._parser: ReasoningParser
        if thinking:
            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
        else:
            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Delegate reasoning-end detection to the selected parser."""
        return self._parser.is_reasoning_end(input_ids)

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Delegate content-token extraction to the selected parser."""
        return self._parser.extract_content_ids(input_ids)

    def extract_reasoning_content(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Delegate non-streaming reasoning/content split to the selected parser."""
        return self._parser.extract_reasoning_content(model_output, request)

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Delegate streaming reasoning/content split to the selected parser."""
        return self._parser.extract_reasoning_content_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
58 changes: 58 additions & 0 deletions vllm/reasoning/identity_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser

logger = init_logger(__name__)


class IdentityReasoningParser(ReasoningParser):
    """
    Identity reasoning parser.

    A no-op parser: it never separates reasoning from content, passing the
    entire model output through as content.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        """Initialize and verify a tokenizer was supplied."""
        super().__init__(tokenizer, *args, **kwargs)
        if self.model_tokenizer:
            return
        raise ValueError(
            "The model tokenizer must be passed to the ReasoningParser "
            "constructor during construction."
        )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Reasoning is never tracked, so it is always considered finished."""
        return True

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Every token counts as content; return the ids unchanged."""
        return input_ids

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Wrap a non-empty delta as plain content; empty deltas yield None."""
        return DeltaMessage(content=delta_text) if delta_text else None

    def extract_reasoning_content(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Return no reasoning and the full output as content."""
        return None, model_output