diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index bb79afd8dede..2800082feaad 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import json
 from collections.abc import Iterable, Sequence
 from typing import TYPE_CHECKING
 
@@ -9,6 +10,7 @@
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.sampling_params import StructuredOutputsParams
 
 from .identity_reasoning_parser import IdentityReasoningParser
 
@@ -19,6 +21,16 @@
 
 logger = init_logger(__name__)
 
+# When thinking is enabled the chat template opens a <think> block before
+# generation begins. Any structured-output constraint applied from token 0
+# would block the closing </think>, so we wrap the constraint in a structural
+# tag that only engages after the reasoning section is closed. The trigger is
+# the bare closing tag (kicks in as soon as </think> appears) and the
+# structure's `begin` forces the canonical \n\n separator before the schema-
+# conforming output starts. See vllm-project/vllm#41132 / #33215.
+_THINKING_END_TRIGGER = "</think>"
+_THINKING_END_BEGIN = "</think>\n\n"
+
 
 class DeepSeekV3ReasoningParser(ReasoningParser):
     """
@@ -48,6 +60,49 @@ def reasoning_start_str(self) -> str | None:
     def reasoning_end_str(self) -> str | None:
         return self._parser.reasoning_end_str
 
+    def adjust_request(
+        self, request: "ChatCompletionRequest | ResponsesRequest"
+    ) -> "ChatCompletionRequest | ResponsesRequest":
+        """Defer JSON output constraints until the reasoning block closes.
+
+        A ``json_schema`` or ``json_object`` ``response_format`` is replaced
+        by a structural tag that engages only after ``</think>``. The request
+        is returned unchanged when the wrapped parser is not a
+        ``DeepSeekR1ReasoningParser`` or no such format was requested.
+        """
+        if not isinstance(self._parser, DeepSeekR1ReasoningParser):
+            return request
+
+        response_format = getattr(request, "response_format", None)
+        if response_format is None:
+            return request
+
+        if response_format.type == "json_schema":
+            json_schema = response_format.json_schema
+            assert json_schema is not None
+            schema = json_schema.json_schema
+        elif response_format.type == "json_object":
+            schema = {"type": "object"}
+        else:
+            return request
+
+        request.structured_outputs = StructuredOutputsParams(
+            structural_tag=json.dumps(
+                {
+                    "triggers": [_THINKING_END_TRIGGER],
+                    "structures": [
+                        {
+                            "begin": _THINKING_END_BEGIN,
+                            "schema": schema,
+                            "end": "",
+                        }
+                    ],
+                }
+            )
+        )
+        request.response_format = None
+        return request
+
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         return self._parser.is_reasoning_end(input_ids)
 
diff --git a/vllm/tool_parsers/deepseekv32_tool_parser.py b/vllm/tool_parsers/deepseekv32_tool_parser.py
index b8623592365c..446b49272295 100644
--- a/vllm/tool_parsers/deepseekv32_tool_parser.py
+++ b/vllm/tool_parsers/deepseekv32_tool_parser.py
@@ -21,6 +21,7 @@
 )
 from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
 from vllm.logger import init_logger
+from vllm.sampling_params import StructuredOutputsParams
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.abstract_tool_parser import (
     Tool,
@@ -28,6 +29,12 @@
 )
 from vllm.tool_parsers.utils import partial_tag_overlap
 
+# Marker the model emits to close the thinking section.
+# FSM allows any content up to and including this marker before schema starts.
+# Used to separate reasoning from schema-conforming JSON output.
+_THINKING_END_TRIGGER = "</think>"
+_THINKING_END_BEGIN = "</think>\n\n"
+
 logger = init_logger(__name__)
 
 
@@ -95,8 +102,55 @@ def adjust_request(
             # setting skip_special_tokens=False ensures proper handling in
             # transformers 5.x where decoding behavior may have changed.
             request.skip_special_tokens = False
+
+            # When thinking mode is enabled, the chat template opens a <think>
+            # block before the first generated token. The base adjust_request
+            # installs a whole-sequence JSON schema constraint for
+            # `tool_choice: "required"` and named-function tool_choice, which
+            # makes </think> unreachable in the constrained vocabulary. Wrap
+            # the schema in a structural tag so the FSM only engages after
+            # the model emits </think>\n\n. See vllm-project/vllm#33215.
+            if (
+                isinstance(request, ChatCompletionRequest)
+                and self._thinking_enabled(request)
+                and request.structured_outputs is not None
+                and request.structured_outputs.json is not None
+            ):
+                schema = request.structured_outputs.json
+                if isinstance(schema, str):
+                    schema = json.loads(schema)
+                request.structured_outputs = StructuredOutputsParams(
+                    structural_tag=json.dumps(
+                        {
+                            "triggers": [_THINKING_END_TRIGGER],
+                            "structures": [
+                                {
+                                    "begin": _THINKING_END_BEGIN,
+                                    "schema": schema,
+                                    "end": "",
+                                }
+                            ],
+                        }
+                    )
+                )
         return request
 
+    @staticmethod
+    def _thinking_enabled(
+        request: ChatCompletionRequest | ResponsesRequest,
+    ) -> bool:
+        """Return whether the request's chat template kwargs enable thinking.
+
+        True when ``chat_template_kwargs`` has a truthy ``thinking`` or
+        ``enable_thinking`` entry; a missing or empty mapping counts as off.
+        Intended to mirror DeepSeekV3ReasoningParser's detection.
+        """
+        chat_kwargs = getattr(request, "chat_template_kwargs", None) or {}
+        return bool(
+            chat_kwargs.get("thinking", False)
+            or chat_kwargs.get("enable_thinking", False)
+        )
+
     def _generate_tool_call_id(self) -> str:
         """Generate a unique tool call ID."""
         return f"call_{uuid.uuid4().hex[:24]}"