From b5bb916d1722f771ac5b924a1c7a33dbd02caf35 Mon Sep 17 00:00:00 2001 From: raghavan Date: Sun, 24 May 2026 13:10:19 +0000 Subject: [PATCH] [Bugfix] Fix reasoning end token missed by should_advance under async scheduling + spec decode Fixes #43388, #34650 When async scheduling and speculative decoding are both enabled, should_advance() reconstructs the new tokens delta via placeholder arithmetic (num_computed_tokens - num_output_placeholders). After spec decode rejection adjustments and token appending, this arithmetic can produce start == len(all_token_ids), yielding an empty delta that misses the reasoning end token (e.g. ). As a result, reasoning_ended never flips to True and JSON grammar constraints are never applied. Fix: add an optional new_token_ids parameter to should_advance(). When the caller has the actual new tokens (the token-output path in update_from_output), pass them directly so the method checks them instead of re-deriving the delta from counter arithmetic. The fallback path (draft-token call sites) is unchanged. Signed-off-by: Raghavan --- .../llm/test_struct_output_generate.py | 8 + .../test_reasoning_structured_output.py | 158 ++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/structured_output/__init__.py | 30 +++- 4 files changed, 190 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/llm/test_struct_output_generate.py b/tests/entrypoints/llm/test_struct_output_generate.py index 3ece27234368..79c794a3148c 100644 --- a/tests/entrypoints/llm/test_struct_output_generate.py +++ b/tests/entrypoints/llm/test_struct_output_generate.py @@ -727,6 +727,14 @@ def test_structured_output( ), ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False), ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True), + ( + "Qwen/Qwen3-1.7B", + "xgrammar", + "auto", + "qwen3", + NGRAM_SPEC_CONFIG, + True, + ), ], ) def test_structured_output_with_reasoning_matrices( diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 861e919c102a..6b605acd02b0 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -258,3 +258,161 @@ def test_should_advance_reasoning_already_ended( # Should return True since reasoning has ended assert result is True + + def test_should_advance_with_new_token_ids_detects_reasoning_end( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """When new_token_ids is passed containing the end token, + reasoning_ended should be set regardless of placeholder arithmetic.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock( + side_effect=lambda all_ids, delta: END_TOKEN in delta + ) + structured_req.reasoner = reasoner + + # Simulate async + spec decode where placeholder math would produce + # an empty delta window: num_computed_tokens == len(all_token_ids) + mock_request_with_structured_output.all_token_ids = [ + 1, + 2, + 3, + END_TOKEN, + 10, + ] + mock_request_with_structured_output.num_computed_tokens = 5 + mock_request_with_structured_output.num_output_placeholders = 0 + + new_token_ids = [9, 198, END_TOKEN, 271] + + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, new_token_ids=new_token_ids + ) + + assert structured_req.reasoning_ended is True + # JSON type defers FSM advance to next step + assert result is False + # Verify we used new_token_ids, not the placeholder-derived delta + reasoner.is_reasoning_end_streaming.assert_called_once() + + def test_should_advance_async_spec_decode_empty_delta_misses_end_token( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """Reproduce the bug: without new_token_ids, async + spec decode + placeholder arithmetic produces start == len(all_token_ids), yielding + an empty delta that misses the reasoning end token. + + This test documents the known limitation of the fallback path.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + + actual_deltas_seen = [] + + def capture_delta(all_ids, delta): + delta_list = list(delta) + actual_deltas_seen.append(delta_list) + return END_TOKEN in delta_list + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock(side_effect=capture_delta) + structured_req.reasoner = reasoner + + # After async scheduling + spec decode token append: + # 4 new tokens appended, num_computed_tokens adjusted to match + mock_request_with_structured_output.all_token_ids = [ + 1, + 2, + 3, + 4, + 5, + 9, + 198, + END_TOKEN, + 271, + ] + mock_request_with_structured_output.num_computed_tokens = 9 + mock_request_with_structured_output.num_output_placeholders = 0 + + # Fallback path (no new_token_ids) computes start = 9 - 0 = 9, + # but len(all_token_ids) = 9, so islice yields nothing. + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output + ) + + assert result is False + # The delta was empty, so the end token was missed + assert actual_deltas_seen == [[]] + assert structured_req.reasoning_ended is False + + # Now try with new_token_ids -- this should find the end token + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, + new_token_ids=[9, 198, END_TOKEN, 271], + ) + + assert structured_req.reasoning_ended is True + assert result is False # JSON defers + + def test_should_advance_new_token_ids_structural_tag_spec_decode( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """Structural tags with spec decode should return True on the same + step, even when detected via new_token_ids.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + structured_req.structured_output_key = ( + StructuredOutputOptions.STRUCTURAL_TAG, + "{}", + ) + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock( + side_effect=lambda all_ids, delta: END_TOKEN in delta + ) + structured_req.reasoner = reasoner + + manager_with_reasoner.vllm_config.speculative_config = Mock() + + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, + new_token_ids=[END_TOKEN, 42], + ) + + assert structured_req.reasoning_ended is True + assert result is True + + def test_should_advance_new_token_ids_no_end_token( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """When new_token_ids does not contain the end token, + reasoning_ended should stay False.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock( + side_effect=lambda all_ids, delta: END_TOKEN in delta + ) + structured_req.reasoner = reasoner + + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, + new_token_ids=[10, 20, 30], + ) + + assert structured_req.reasoning_ended is False + assert result is False diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 45a5169a3efe..aedfe284dbdc 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1413,7 +1413,9 @@ def update_from_output( request.status = RequestStatus.FINISHED_STOPPED stopped = True - if new_token_ids and self.structured_output_manager.should_advance(request): + if new_token_ids and self.structured_output_manager.should_advance( + request, new_token_ids=new_token_ids + ): struct_output_request = request.structured_output_request assert struct_output_request is not None assert struct_output_request.grammar is not None diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 6a4fcbb629ff..471c7ffd8109 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -319,7 +319,11 @@ def should_fill_bitmask(self, request: "Request") -> bool: return request.structured_output_request.reasoning_ended return True - def should_advance(self, request: "Request") -> bool: + def should_advance( + self, + request: "Request", + new_token_ids: list[int] | None = None, + ) -> bool: if not request.use_structured_output: return False @@ -342,15 +346,23 @@ def should_advance(self, request: "Request") -> bool: if structured_req.reasoning_ended: return True - # Check if reasoning ends in *this* step - delta_from = request.num_computed_tokens - request.num_output_placeholders + # Check if reasoning ends in *this* step. + # When new_token_ids is provided (token-output path), use it + # directly as the delta to avoid fragile placeholder arithmetic + # that can miss the end token under async scheduling + spec decode. all_token_ids = request.all_token_ids - start = ( - delta_from if delta_from >= 0 else max(len(all_token_ids) + delta_from, 0) - ) - if reasoner.is_reasoning_end_streaming( - all_token_ids, itertools.islice(all_token_ids, start, None) - ): + if new_token_ids is not None: + delta: Iterable[int] = new_token_ids + else: + delta_from = request.num_computed_tokens - request.num_output_placeholders + start = ( + delta_from + if delta_from >= 0 + else max(len(all_token_ids) + delta_from, 0) + ) + delta = itertools.islice(all_token_ids, start, None) + + if reasoner.is_reasoning_end_streaming(all_token_ids, delta): structured_req.reasoning_ended = True # Reasoning just ended this step. Defer FSM advance until the next