From b5bb916d1722f771ac5b924a1c7a33dbd02caf35 Mon Sep 17 00:00:00 2001
From: raghavan <oneraghavan@gmail.com>
Date: Sun, 24 May 2026 13:10:19 +0000
Subject: [PATCH] [Bugfix] Fix reasoning end token missed by should_advance
 under async scheduling + spec decode

Fixes #43388, #34650

When async scheduling and speculative decoding are both enabled,
should_advance() reconstructs the new tokens delta via placeholder
arithmetic (num_computed_tokens - num_output_placeholders). After spec
decode rejection adjustments and token appending, this arithmetic can
produce start == len(all_token_ids), yielding an empty delta that misses
the reasoning end token (e.g. </think>). As a result, reasoning_ended
never flips to True and JSON grammar constraints are never applied.

Fix: add an optional new_token_ids parameter to should_advance(). When
the caller has the actual new tokens (the token-output path in
update_from_output), pass them directly so the method checks them
instead of re-deriving the delta from counter arithmetic. The fallback
path (draft-token call sites) is unchanged.

Signed-off-by: Raghavan <oneraghavan@gmail.com>
---
 .../llm/test_struct_output_generate.py        |   8 +
 .../test_reasoning_structured_output.py       | 158 ++++++++++++++++++
 vllm/v1/core/sched/scheduler.py               |   4 +-
 vllm/v1/structured_output/__init__.py         |  30 +++-
 4 files changed, 190 insertions(+), 10 deletions(-)
diff --git a/tests/entrypoints/llm/test_struct_output_generate.py b/tests/entrypoints/llm/test_struct_output_generate.py
index 3ece27234368..79c794a3148c 100644
--- a/tests/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/entrypoints/llm/test_struct_output_generate.py
@@ -727,6 +727,14 @@ def test_structured_output(
         ),
         ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False),
         ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True),
+        (
+            "Qwen/Qwen3-1.7B",
+            "xgrammar",
+            "auto",
+            "qwen3",
+            NGRAM_SPEC_CONFIG,
+            True,
+        ),
     ],
 )
 def test_structured_output_with_reasoning_matrices(
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
index 861e919c102a..6b605acd02b0 100644
--- a/tests/v1/structured_output/test_reasoning_structured_output.py
+++ b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -258,3 +258,161 @@ def test_should_advance_reasoning_already_ended(
 
         # Should return True since reasoning has ended
         assert result is True
+
+    def test_should_advance_with_new_token_ids_detects_reasoning_end(
+        self,
+        manager_with_reasoner,
+        mock_request_with_structured_output,
+    ):
+        """When new_token_ids is passed containing the end token,
+        reasoning_ended should be set regardless of placeholder arithmetic."""
+        END_TOKEN = 999
+        structured_req = mock_request_with_structured_output.structured_output_request
+        structured_req.reasoning_ended = False
+
+        reasoner = MockReasoner(tokenizer=Mock())
+        reasoner.is_reasoning_end_streaming = Mock(
+            side_effect=lambda all_ids, delta: END_TOKEN in delta
+        )
+        structured_req.reasoner = reasoner
+
+        # Simulate async + spec decode where placeholder math would produce
+        # an empty delta window: num_computed_tokens == len(all_token_ids)
+        mock_request_with_structured_output.all_token_ids = [
+            1,
+            2,
+            3,
+            END_TOKEN,
+            10,
+        ]
+        mock_request_with_structured_output.num_computed_tokens = 5
+        mock_request_with_structured_output.num_output_placeholders = 0
+
+        new_token_ids = [9, 198, END_TOKEN, 271]
+
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output, new_token_ids=new_token_ids
+        )
+
+        assert structured_req.reasoning_ended is True
+        # JSON type defers FSM advance to next step
+        assert result is False
+        # Verify we used new_token_ids, not the placeholder-derived delta
+        reasoner.is_reasoning_end_streaming.assert_called_once()
+
+    def test_should_advance_async_spec_decode_empty_delta_misses_end_token(
+        self,
+        manager_with_reasoner,
+        mock_request_with_structured_output,
+    ):
+        """Reproduce the bug: without new_token_ids, async + spec decode
+        placeholder arithmetic produces start == len(all_token_ids), yielding
+        an empty delta that misses the reasoning end token.
+
+        This test documents the known limitation of the fallback path."""
+        END_TOKEN = 999
+        structured_req = mock_request_with_structured_output.structured_output_request
+        structured_req.reasoning_ended = False
+
+        actual_deltas_seen = []
+
+        def capture_delta(all_ids, delta):
+            delta_list = list(delta)
+            actual_deltas_seen.append(delta_list)
+            return END_TOKEN in delta_list
+
+        reasoner = MockReasoner(tokenizer=Mock())
+        reasoner.is_reasoning_end_streaming = Mock(side_effect=capture_delta)
+        structured_req.reasoner = reasoner
+
+        # After async scheduling + spec decode token append:
+        # 4 new tokens appended, num_computed_tokens adjusted to match
+        mock_request_with_structured_output.all_token_ids = [
+            1,
+            2,
+            3,
+            4,
+            5,
+            9,
+            198,
+            END_TOKEN,
+            271,
+        ]
+        mock_request_with_structured_output.num_computed_tokens = 9
+        mock_request_with_structured_output.num_output_placeholders = 0
+
+        # Fallback path (no new_token_ids) computes start = 9 - 0 = 9,
+        # but len(all_token_ids) = 9, so islice yields nothing.
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output
+        )
+
+        assert result is False
+        # The delta was empty, so the end token was missed
+        assert actual_deltas_seen == [[]]
+        assert structured_req.reasoning_ended is False
+
+        # Now try with new_token_ids -- this should find the end token
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output,
+            new_token_ids=[9, 198, END_TOKEN, 271],
+        )
+
+        assert structured_req.reasoning_ended is True
+        assert result is False  # JSON defers
+
+    def test_should_advance_new_token_ids_structural_tag_spec_decode(
+        self,
+        manager_with_reasoner,
+        mock_request_with_structured_output,
+    ):
+        """Structural tags with spec decode should return True on the same
+        step, even when detected via new_token_ids."""
+        END_TOKEN = 999
+        structured_req = mock_request_with_structured_output.structured_output_request
+        structured_req.reasoning_ended = False
+        structured_req.structured_output_key = (
+            StructuredOutputOptions.STRUCTURAL_TAG,
+            "{}",
+        )
+
+        reasoner = MockReasoner(tokenizer=Mock())
+        reasoner.is_reasoning_end_streaming = Mock(
+            side_effect=lambda all_ids, delta: END_TOKEN in delta
+        )
+        structured_req.reasoner = reasoner
+
+        manager_with_reasoner.vllm_config.speculative_config = Mock()
+
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output,
+            new_token_ids=[END_TOKEN, 42],
+        )
+
+        assert structured_req.reasoning_ended is True
+        assert result is True
+
+    def test_should_advance_new_token_ids_no_end_token(
+        self,
+        manager_with_reasoner,
+        mock_request_with_structured_output,
+    ):
+        """When new_token_ids does not contain the end token,
+        reasoning_ended should stay False."""
+        END_TOKEN = 999
+        structured_req = mock_request_with_structured_output.structured_output_request
+        structured_req.reasoning_ended = False
+
+        reasoner = MockReasoner(tokenizer=Mock())
+        reasoner.is_reasoning_end_streaming = Mock(
+            side_effect=lambda all_ids, delta: END_TOKEN in delta
+        )
+        structured_req.reasoner = reasoner
+
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output,
+            new_token_ids=[10, 20, 30],
+        )
+
+        assert structured_req.reasoning_ended is False
+        assert result is False
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 45a5169a3efe..aedfe284dbdc 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1413,7 +1413,9 @@ def update_from_output(
                 request.status = RequestStatus.FINISHED_STOPPED
                 stopped = True
 
-            if new_token_ids and self.structured_output_manager.should_advance(request):
+            if new_token_ids and self.structured_output_manager.should_advance(
+                request, new_token_ids=new_token_ids
+            ):
                 struct_output_request = request.structured_output_request
                 assert struct_output_request is not None
                 assert struct_output_request.grammar is not None
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 6a4fcbb629ff..471c7ffd8109 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -319,7 +319,11 @@ def should_fill_bitmask(self, request: "Request") -> bool:
             return request.structured_output_request.reasoning_ended
         return True
 
-    def should_advance(self, request: "Request") -> bool:
+    def should_advance(
+        self,
+        request: "Request",
+        new_token_ids: list[int] | None = None,
+    ) -> bool:
         if not request.use_structured_output:
             return False
 
@@ -342,15 +346,23 @@ def should_advance(self, request: "Request") -> bool:
         if structured_req.reasoning_ended:
             return True
 
-        # Check if reasoning ends in *this* step
-        delta_from = request.num_computed_tokens - request.num_output_placeholders
+        # Check if reasoning ends in *this* step.
+        # When new_token_ids is provided (token-output path), use it
+        # directly as the delta to avoid fragile placeholder arithmetic
+        # that can miss the end token under async scheduling + spec decode.
         all_token_ids = request.all_token_ids
-        start = (
-            delta_from if delta_from >= 0 else max(len(all_token_ids) + delta_from, 0)
-        )
-        if reasoner.is_reasoning_end_streaming(
-            all_token_ids, itertools.islice(all_token_ids, start, None)
-        ):
+        if new_token_ids is not None:
+            delta: Iterable[int] = new_token_ids
+        else:
+            delta_from = request.num_computed_tokens - request.num_output_placeholders
+            start = (
+                delta_from
+                if delta_from >= 0
+                else max(len(all_token_ids) + delta_from, 0)
+            )
+            delta = itertools.islice(all_token_ids, start, None)
+
+        if reasoner.is_reasoning_end_streaming(all_token_ids, delta):
             structured_req.reasoning_ended = True
 
             # Reasoning just ended this step. Defer FSM advance until the next