diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index bb79afd8dede..2800082feaad 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
from collections.abc import Iterable, Sequence
from typing import TYPE_CHECKING
@@ -9,6 +10,7 @@
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.sampling_params import StructuredOutputsParams
from .identity_reasoning_parser import IdentityReasoningParser
@@ -19,6 +21,16 @@
logger = init_logger(__name__)
+# When thinking is enabled the chat template opens a <think> block before
+# generation begins. Any structured-output constraint applied from token 0
+# would block the closing </think>, so we wrap the constraint in a structural
+# tag that only engages after the reasoning section is closed. The trigger is
+# the bare closing tag (kicks in as soon as </think> appears) and the
+# structure's `begin` forces the canonical </think>\n\n separator before the
+# schema-conforming output starts. `begin` must start with the trigger so the
+# structure can be selected once the trigger has been matched.
+# See vllm-project/vllm#41132 / #33215.
+_THINKING_END_TRIGGER = "</think>"
+_THINKING_END_BEGIN = _THINKING_END_TRIGGER + "\n\n"
+
class DeepSeekV3ReasoningParser(ReasoningParser):
"""
@@ -48,6 +60,42 @@ def reasoning_start_str(self) -> str | None:
def reasoning_end_str(self) -> str | None:
return self._parser.reasoning_end_str
+ def adjust_request(
+     self, request: "ChatCompletionRequest | ResponsesRequest"
+ ) -> "ChatCompletionRequest | ResponsesRequest":
+     """Defer a JSON ``response_format`` constraint until reasoning is closed.
+
+     The request is returned untouched unless the wrapped parser is a
+     ``DeepSeekR1ReasoningParser`` and the request carries a
+     ``response_format`` of type ``json_schema`` or ``json_object``. In that
+     case ``request.structured_outputs`` is replaced by a structural tag
+     built from ``_THINKING_END_TRIGGER`` / ``_THINKING_END_BEGIN`` and the
+     schema, and ``request.response_format`` is cleared.
+
+     Args:
+         request: The incoming request; it is mutated in place.
+
+     Returns:
+         The same ``request`` object.
+
+     Raises:
+         ValueError: If ``response_format.type`` is ``"json_schema"`` but no
+             ``json_schema`` payload was supplied.
+     """
+     # NOTE(review): presumably the R1 delegate is the one selected when
+     # thinking is enabled -- confirm against __init__.
+     if not isinstance(self._parser, DeepSeekR1ReasoningParser):
+         return request
+
+     # Requests without a `response_format` attribute are passed through.
+     response_format = getattr(request, "response_format", None)
+     if response_format is None:
+         return request
+
+     if response_format.type == "json_schema":
+         json_schema = response_format.json_schema
+         # Request-supplied data: validate explicitly, because an `assert`
+         # is stripped when Python runs with -O.
+         if json_schema is None:
+             raise ValueError(
+                 "response_format of type 'json_schema' requires a "
+                 "'json_schema' field"
+             )
+         schema = json_schema.json_schema
+     elif response_format.type == "json_object":
+         # Any JSON object is acceptable.
+         schema = {"type": "object"}
+     else:
+         return request
+
+     # NOTE(review): this overwrites any structured_outputs already present
+     # on the request -- confirm that is intended.
+     request.structured_outputs = StructuredOutputsParams(
+         structural_tag=json.dumps(
+             {
+                 "triggers": [_THINKING_END_TRIGGER],
+                 "structures": [
+                     {
+                         "begin": _THINKING_END_BEGIN,
+                         "schema": schema,
+                         "end": "",
+                     }
+                 ],
+             }
+         )
+     )
+     # The constraint now lives in the structural tag, so clear the
+     # original response_format.
+     request.response_format = None
+     return request
+
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
    """Forward ``input_ids`` to the wrapped parser and return its answer."""
    wrapped = self._parser
    return wrapped.is_reasoning_end(input_ids)
diff --git a/vllm/tool_parsers/deepseekv32_tool_parser.py b/vllm/tool_parsers/deepseekv32_tool_parser.py
index b8623592365c..446b49272295 100644
--- a/vllm/tool_parsers/deepseekv32_tool_parser.py
+++ b/vllm/tool_parsers/deepseekv32_tool_parser.py
@@ -21,6 +21,7 @@
)
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.logger import init_logger
+from vllm.sampling_params import StructuredOutputsParams
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
Tool,
@@ -28,6 +29,12 @@
)
from vllm.tool_parsers.utils import partial_tag_overlap
+# Marker the model emits to close the thinking section.
+# FSM allows any content up to and including this marker before schema starts.
+# Used to separate reasoning from schema-conforming JSON output.
+# The structure's `begin` repeats the marker as its prefix, followed by the
+# \n\n separator that precedes the schema-constrained output.
+_THINKING_END_TRIGGER = "</think>"
+_THINKING_END_BEGIN = _THINKING_END_TRIGGER + "\n\n"
+
logger = init_logger(__name__)
@@ -95,8 +102,50 @@ def adjust_request(
# setting skip_special_tokens=False ensures proper handling in
# transformers 5.x where decoding behavior may have changed.
request.skip_special_tokens = False
+
+ # When thinking mode is enabled, the chat template opens a <think>
+ # block before the first generated token. The base adjust_request
+ # installs a whole-sequence JSON schema constraint for
+ # `tool_choice: "required"` and named-function tool_choice, which
+ # makes </think> unreachable in the constrained vocabulary. Wrap
+ # the schema in a structural tag so the FSM only engages after
+ # the model emits </think>\n\n. See vllm-project/vllm#33215.
+ if (
+ isinstance(request, ChatCompletionRequest)
+ and self._thinking_enabled(request)
+ and request.structured_outputs is not None
+ and request.structured_outputs.json is not None
+ ):
+ schema = request.structured_outputs.json
+ if isinstance(schema, str):
+ schema = json.loads(schema)
+ request.structured_outputs = StructuredOutputsParams(
+ structural_tag=json.dumps(
+ {
+ "triggers": [_THINKING_END_TRIGGER],
+ "structures": [
+ {
+ "begin": _THINKING_END_BEGIN,
+ "schema": schema,
+ "end": "",
+ }
+ ],
+ }
+ )
+ )
return request
+ @staticmethod
+ def _thinking_enabled(
+     request: ChatCompletionRequest | ResponsesRequest,
+ ) -> bool:
+     """Tell whether the request's chat-template kwargs switch thinking on.
+
+     Returns True when ``chat_template_kwargs`` holds a truthy ``thinking``
+     or ``enable_thinking`` entry; a missing or empty ``chat_template_kwargs``
+     counts as off. Intended to mirror DeepSeekV3ReasoningParser's
+     thinking-enable detection.
+     """
+     template_kwargs = getattr(request, "chat_template_kwargs", None)
+     if not template_kwargs:
+         return False
+     flag_names = ("thinking", "enable_thinking")
+     return any(template_kwargs.get(flag, False) for flag in flag_names)
+
def _generate_tool_call_id(self) -> str:
"""Generate a unique tool call ID."""
return f"call_{uuid.uuid4().hex[:24]}"