Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions vllm/reasoning/deepseek_v3_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
from collections.abc import Iterable, Sequence
from typing import TYPE_CHECKING

Expand All @@ -9,6 +10,7 @@
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from vllm.sampling_params import StructuredOutputsParams

from .identity_reasoning_parser import IdentityReasoningParser

Expand All @@ -19,6 +21,16 @@

logger = init_logger(__name__)

# When thinking is enabled the chat template opens a <think> block before
# generation begins. Any structured-output constraint applied from token 0
# would block the closing </think>, so we wrap the constraint in a structural
# tag that only engages after the reasoning section is closed. The trigger is
# the bare closing tag (kicks in as soon as </think> appears) and the
# structure's `begin` forces the canonical \n\n separator before the schema-
# conforming output starts. See vllm-project/vllm#41132 / #33215.
# Listed under "triggers" in the structural tag built by adjust_request.
_THINKING_END_TRIGGER = "</think>"
# Used as the "begin" of the single structure in that structural tag: the
# closing tag followed by two newlines.
_THINKING_END_BEGIN = "</think>\n\n"


class DeepSeekV3ReasoningParser(ReasoningParser):
"""
Expand Down Expand Up @@ -48,6 +60,42 @@ def reasoning_start_str(self) -> str | None:
def reasoning_end_str(self) -> str | None:
return self._parser.reasoning_end_str

def adjust_request(
self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
"""Re-express a JSON ``response_format`` as a structural tag keyed on ``</think>``.

The request is handed back untouched when the wrapped parser is not a
``DeepSeekR1ReasoningParser``, when it carries no ``response_format``, or
when the format type is neither ``"json_schema"`` nor ``"json_object"``.
Otherwise ``request.structured_outputs`` is replaced with a structural-tag
constraint and ``request.response_format`` is cleared.
"""
# Only rewrite when the delegate is the R1-style parser; presumably that is
# the thinking-enabled case -- TODO confirm against the constructor.
if not isinstance(self._parser, DeepSeekR1ReasoningParser):
return request

# getattr with a default: not every request type is assumed to expose
# response_format.
response_format = getattr(request, "response_format", None)
if response_format is None:
return request

if response_format.type == "json_schema":
json_schema = response_format.json_schema
# NOTE(review): assert is stripped under `python -O`; if json_schema can
# really be None here this should be an explicit check.
assert json_schema is not None
schema = json_schema.json_schema
elif response_format.type == "json_object":
# No user schema for json_object: constrain to "any JSON object".
schema = {"type": "object"}
else:
return request

# NOTE(review): this assignment replaces request.structured_outputs
# wholesale, so any settings already present on it are dropped -- confirm
# that is intended.
request.structured_outputs = StructuredOutputsParams(
structural_tag=json.dumps(
{
"triggers": [_THINKING_END_TRIGGER],
"structures": [
{
"begin": _THINKING_END_BEGIN,
"schema": schema,
"end": "",
}
],
}
)
)
# The constraint now lives in structured_outputs; clear the original field
# (presumably so it is not applied a second time downstream -- verify).
request.response_format = None
Comment on lines +69 to +96
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The adjust_request implementation has two significant issues:

  1. Missing structured_outputs support: It only checks request.response_format. If a user provides a JSON schema directly via the structured_outputs parameter (a common vLLM-specific usage), this logic is skipped, and the model will likely fail to close the reasoning block because the schema constraint will block the </think> tag.
  2. State Overwrite: It overwrites the entire request.structured_outputs object with a new one. This causes the loss of any other structured output settings the user might have provided (e.g., disable_any_whitespace, whitespace_pattern).

You should check both response_format and structured_outputs, and use vllm.config.utils.replace to update the parameters while preserving existing settings.

        response_format = getattr(request, "response_format", None)
        structured_outputs = getattr(request, "structured_outputs", None)

        schema = None
        if response_format is not None:
            if response_format.type == "json_schema":
                json_schema = response_format.json_schema
                assert json_schema is not None
                schema = json_schema.json_schema
            elif response_format.type == "json_object":
                schema = {"type": "object"}

        if schema is None and structured_outputs is not None:
            schema = structured_outputs.json
            if isinstance(schema, str):
                schema = json.loads(schema)

        if schema is None:
            return request

        from vllm.config.utils import replace
        new_structured_outputs = replace(
            structured_outputs or StructuredOutputsParams(),
            json=None,
            json_object=None,
            structural_tag=json.dumps(
                {
                    "triggers": [_THINKING_END_TRIGGER],
                    "structures": [
                        {
                            "begin": _THINKING_END_BEGIN,
                            "schema": schema,
                            "end": "",
                        }
                    ],
                }
            )
        )
        setattr(request, "structured_outputs", new_structured_outputs)
        if response_format is not None:
            setattr(request, "response_format", None)

return request

def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
    """Report whether the wrapped parser considers the reasoning finished.

    The decision is delegated entirely to ``self._parser``; ``input_ids`` is
    passed through unchanged and the delegate's answer is returned as-is.
    """
    delegate = self._parser
    return delegate.is_reasoning_end(input_ids)

Expand Down
49 changes: 49 additions & 0 deletions vllm/tool_parsers/deepseekv32_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,20 @@
)
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.logger import init_logger
from vllm.sampling_params import StructuredOutputsParams
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
Tool,
ToolParser,
)
from vllm.tool_parsers.utils import partial_tag_overlap

# Marker the model emits to close the thinking section. It separates the
# free-form reasoning from the schema-conforming JSON output: the structural
# tag built in adjust_request lets the FSM accept any content up to and
# including this marker, and only then starts enforcing the schema.
# Trigger string listed under "triggers" in that structural tag.
_THINKING_END_TRIGGER = "</think>"
# "begin" of the constrained structure: the closing tag followed by "\n\n".
_THINKING_END_BEGIN = "</think>\n\n"

logger = init_logger(__name__)


Expand Down Expand Up @@ -95,8 +102,50 @@ def adjust_request(
# setting skip_special_tokens=False ensures proper handling in
# transformers 5.x where decoding behavior may have changed.
request.skip_special_tokens = False

# When thinking mode is enabled, the chat template opens a <think>
# block before the first generated token. The base adjust_request
# installs a whole-sequence JSON schema constraint for
# `tool_choice: "required"` and named-function tool_choice, which
# makes </think> unreachable in the constrained vocabulary. Wrap
# the schema in a structural tag so the FSM only engages after
# the model emits </think>\n\n. See vllm-project/vllm#33215.
if (
isinstance(request, ChatCompletionRequest)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The isinstance(request, ChatCompletionRequest) check explicitly excludes ResponsesRequest. Since ResponsesRequest is a supported type for tool parsing in the base ToolParser class, this exclusion means tool calls will remain broken (the </think> tag will be unreachable in the constrained vocabulary) when using the Responses API with thinking enabled. If ResponsesRequest does not yet support structural_tag, this limitation should be addressed or at least documented.

and self._thinking_enabled(request)
and request.structured_outputs is not None
and request.structured_outputs.json is not None
):
schema = request.structured_outputs.json
if isinstance(schema, str):
schema = json.loads(schema)
request.structured_outputs = StructuredOutputsParams(
structural_tag=json.dumps(
{
"triggers": [_THINKING_END_TRIGGER],
"structures": [
{
"begin": _THINKING_END_BEGIN,
"schema": schema,
"end": "",
}
],
}
)
)
Comment on lines +122 to +135
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Overwriting request.structured_outputs with a new StructuredOutputsParams object will lose any other parameters set by the user (e.g., disable_any_whitespace). Use vllm.config.utils.replace to preserve existing settings while switching the constraint from json to structural_tag.

Suggested change
request.structured_outputs = StructuredOutputsParams(
structural_tag=json.dumps(
{
"triggers": [_THINKING_END_TRIGGER],
"structures": [
{
"begin": _THINKING_END_BEGIN,
"schema": schema,
"end": "",
}
],
}
)
)
from vllm.config.utils import replace
request.structured_outputs = replace(
request.structured_outputs,
json=None,
structural_tag=json.dumps(
{
"triggers": [_THINKING_END_TRIGGER],
"structures": [
{
"begin": _THINKING_END_BEGIN,
"schema": schema,
"end": "",
}
],
}
)
)

return request

@staticmethod
def _thinking_enabled(
    request: ChatCompletionRequest | ResponsesRequest,
) -> bool:
    """Tell whether the request's chat template kwargs switch thinking on.

    Meant to mirror DeepSeekV3ReasoningParser's thinking-enable detection:
    thinking counts as enabled when ``chat_template_kwargs`` holds a truthy
    ``"thinking"`` or ``"enable_thinking"`` entry. A missing, ``None`` or
    empty ``chat_template_kwargs`` means thinking is off.
    """
    template_kwargs = getattr(request, "chat_template_kwargs", None)
    if not template_kwargs:
        return False
    if template_kwargs.get("thinking", False):
        return True
    return bool(template_kwargs.get("enable_thinking", False))

def _generate_tool_call_id(self) -> str:
    """Build a fresh tool call ID.

    The ID is the literal prefix ``call_`` followed by the first 24 hex
    characters of a random UUID4.
    """
    suffix = uuid.uuid4().hex[:24]
    return "call_" + suffix
Expand Down
Loading