vllm-project · faizsameerahmed96 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · gemini-code-assist
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import json
 from collections.abc import Iterable, Sequence
 from typing import TYPE_CHECKING
 
@@ -9,6 +10,7 @@
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.sampling_params import StructuredOutputsParams
 
 from .identity_reasoning_parser import IdentityReasoningParser
 
@@ -19,6 +21,16 @@
 
 logger = init_logger(__name__)
 
+# When thinking is enabled the chat template opens a <think> block before
+# generation begins. Any structured-output constraint applied from token 0
+# would block the closing </think>, so we wrap the constraint in a structural
+# tag that only engages after the reasoning section is closed. The trigger is
+# the bare closing tag (kicks in as soon as </think> appears) and the
+# structure's `begin` forces the canonical \n\n separator before the schema-
+# conforming output starts. See vllm-project/vllm#41132 / #33215.
+_THINKING_END_TRIGGER = "</think>"
+_THINKING_END_BEGIN = "</think>\n\n"
+
 
 class DeepSeekV3ReasoningParser(ReasoningParser):
     """
@@ -48,6 +60,42 @@ def reasoning_start_str(self) -> str | None:
     def reasoning_end_str(self) -> str | None:
         return self._parser.reasoning_end_str
 
+    def adjust_request(
+        self, request: "ChatCompletionRequest | ResponsesRequest"
+    ) -> "ChatCompletionRequest | ResponsesRequest":
+        """Defer a JSON ``response_format`` constraint until reasoning ends.
+
+        The request is rewritten only when the wrapped parser is a
+        ``DeepSeekR1ReasoningParser`` and ``request.response_format`` has type
+        ``json_schema`` or ``json_object``; every other request is returned
+        untouched. For a rewritten request the schema is moved into a
+        structural tag whose trigger is the closing think tag, and
+        ``request.response_format`` is cleared.
+
+        Args:
+            request: The incoming request. It is modified in place.
+
+        Returns:
+            The same ``request`` object that was passed in.
+
+        Raises:
+            ValueError: If the response format type is ``json_schema`` but no
+                ``json_schema`` payload is attached.
+        """
+        # NOTE(review): presumably the R1 parser is only wrapped when thinking
+        # is enabled, so other parsers need no deferral -- confirm in __init__.
+        if not isinstance(self._parser, DeepSeekR1ReasoningParser):
+            return request
+
+        # getattr with a default tolerates request objects that do not define
+        # a response_format attribute at all.
+        response_format = getattr(request, "response_format", None)
+        if response_format is None:
+            return request
+
+        if response_format.type == "json_schema":
+            json_schema = response_format.json_schema
+            # Explicit check rather than `assert`: asserts are stripped under
+            # `python -O`, which would turn a malformed request into an opaque
+            # AttributeError on the next line.
+            if json_schema is None:
+                raise ValueError(
+                    "response_format of type 'json_schema' requires a "
+                    "json_schema payload"
+                )
+            schema = json_schema.json_schema
+        elif response_format.type == "json_object":
+            # No schema supplied: accept any JSON object.
+            schema = {"type": "object"}
+        else:
+            return request
+
+        # NOTE(review): this replaces any structured_outputs already set on
+        # the request, including its non-constraint options -- confirm that
+        # dropping them is intended.
+        request.structured_outputs = StructuredOutputsParams(
+            structural_tag=json.dumps(
+                {
+                    "triggers": [_THINKING_END_TRIGGER],
+                    "structures": [
+                        {
+                            "begin": _THINKING_END_BEGIN,
+                            "schema": schema,
+                            "end": "",
+                        }
+                    ],
+                }
+            )
+        )
+        # The constraint now lives in the structural tag; clear the original
+        # field so it is not applied a second time from token 0.
+        request.response_format = None
+        return request
+
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         """Forward the end-of-reasoning check to the wrapped parser."""
         return self._parser.is_reasoning_end(input_ids)
 

@@ -21,13 +21,20 @@
 )
 from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
 from vllm.logger import init_logger
+from vllm.sampling_params import StructuredOutputsParams
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.abstract_tool_parser import (
     Tool,
     ToolParser,
 )
 from vllm.tool_parsers.utils import partial_tag_overlap
 
+# Structural-tag markers for the end of the thinking section.
+# The trigger is the bare closing tag: output is unconstrained until the
+# model emits it. The structure's `begin` adds the "\n\n" separator, so the
+# schema-conforming JSON output starts only after "</think>\n\n".
+_THINKING_END_TRIGGER = "</think>"
+_THINKING_END_BEGIN = "</think>\n\n"
+
 logger = init_logger(__name__)
 
 
@@ -95,8 +102,50 @@ def adjust_request(
             # setting skip_special_tokens=False ensures proper handling in
             # transformers 5.x where decoding behavior may have changed.
             request.skip_special_tokens = False
+
+        # When thinking mode is enabled, the chat template opens a <think>
+        # block before the first generated token. The base adjust_request
+        # installs a whole-sequence JSON schema constraint for
+        # `tool_choice: "required"` and named-function tool_choice, which
+        # makes </think> unreachable in the constrained vocabulary. Wrap
+        # the schema in a structural tag so the FSM only engages after
+        # the model emits </think>\n\n. See vllm-project/vllm#33215.
+        if (
+            isinstance(request, ChatCompletionRequest)
+            and self._thinking_enabled(request)
+            and request.structured_outputs is not None
+            and request.structured_outputs.json is not None
+        ):
+            schema = request.structured_outputs.json
+            if isinstance(schema, str):
+                schema = json.loads(schema)
+            request.structured_outputs = StructuredOutputsParams(
+                structural_tag=json.dumps(
+                    {
+                        "triggers": [_THINKING_END_TRIGGER],
+                        "structures": [
+                            {
+                                "begin": _THINKING_END_BEGIN,
+                                "schema": schema,
+                                "end": "",
+                            }
+                        ],
+                    }
+                )
+            )
-            request.structured_outputs = StructuredOutputsParams(
-                structural_tag=json.dumps(
-                    {
-                        "triggers": [_THINKING_END_TRIGGER],
-                        "structures": [
-                            {
-                                "begin": _THINKING_END_BEGIN,
-                                "schema": schema,
-                                "end": "",
-                            }
-                        ],
-                    }
-                )
-            )
+            from vllm.config.utils import replace
+            request.structured_outputs = replace(
+                request.structured_outputs,
+                json=None,
+                structural_tag=json.dumps(
+                    {
+                        "triggers": [_THINKING_END_TRIGGER],
+                        "structures": [
+                            {
+                                "begin": _THINKING_END_BEGIN,
+                                "schema": schema,
+                                "end": "",
+                            }
+                        ],
+                    }
+                )
+            )
-            request.structured_outputs = StructuredOutputsParams(
-                structural_tag=json.dumps(
-                    {
-                        "triggers": [_THINKING_END_TRIGGER],
-                        "structures": [
-                            {
-                                "begin": _THINKING_END_BEGIN,
-                                "schema": schema,
-                                "end": "",
-                            }
-                        ],
-                    }
-                )
-            )
+            from vllm.config.utils import replace
+            request.structured_outputs = replace(
+                request.structured_outputs,
+                json=None,
+                structural_tag=json.dumps(
+                    {
+                        "triggers": [_THINKING_END_TRIGGER],
+                        "structures": [
+                            {
+                                "begin": _THINKING_END_BEGIN,
+                                "schema": schema,
+                                "end": "",
+                            }
+                        ],
+                    }
+                )
+            )
         return request
 
+    @staticmethod
+    def _thinking_enabled(
+        request: ChatCompletionRequest | ResponsesRequest,
+    ) -> bool:
+        """Tell whether the request's chat-template kwargs switch thinking on.
+
+        A truthy ``thinking`` or ``enable_thinking`` entry in
+        ``chat_template_kwargs`` counts as enabled; a missing or empty mapping
+        counts as disabled. Intended to mirror the detection used by
+        DeepSeekV3ReasoningParser.
+        """
+        template_kwargs = getattr(request, "chat_template_kwargs", None)
+        if not template_kwargs:
+            return False
+        return any(
+            template_kwargs.get(flag, False)
+            for flag in ("thinking", "enable_thinking")
+        )
+
     def _generate_tool_call_id(self) -> str:
         """Build a tool call ID: ``call_`` plus 24 hex characters of a UUID4."""
         random_hex = uuid.uuid4().hex
         return f"call_{random_hex[:24]}"