diff --git a/tests/entrypoints/llm/test_struct_output_generate.py b/tests/entrypoints/llm/test_struct_output_generate.py index 3ece27234368..79c794a3148c 100644 --- a/tests/entrypoints/llm/test_struct_output_generate.py +++ b/tests/entrypoints/llm/test_struct_output_generate.py @@ -727,6 +727,14 @@ def test_structured_output( ), ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False), ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True), + ( + "Qwen/Qwen3-1.7B", + "xgrammar", + "auto", + "qwen3", + NGRAM_SPEC_CONFIG, + True, + ), ], ) def test_structured_output_with_reasoning_matrices( diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 861e919c102a..6b605acd02b0 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -258,3 +258,161 @@ def test_should_advance_reasoning_already_ended( # Should return True since reasoning has ended assert result is True + + def test_should_advance_with_new_token_ids_detects_reasoning_end( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """When new_token_ids is passed containing the end token, + reasoning_ended should be set regardless of placeholder arithmetic.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock( + side_effect=lambda all_ids, delta: END_TOKEN in delta + ) + structured_req.reasoner = reasoner + + # Simulate async + spec decode where placeholder math would produce + # an empty delta window: num_computed_tokens == len(all_token_ids) + mock_request_with_structured_output.all_token_ids = [ + 1, + 2, + 3, + END_TOKEN, + 10, + ] + mock_request_with_structured_output.num_computed_tokens = 5 + mock_request_with_structured_output.num_output_placeholders = 0 + + new_token_ids = [9, 198, END_TOKEN, 271] + + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, new_token_ids=new_token_ids + ) + + assert structured_req.reasoning_ended is True + # JSON type defers FSM advance to next step + assert result is False + # Verify we used new_token_ids, not the placeholder-derived delta + reasoner.is_reasoning_end_streaming.assert_called_once() + + def test_should_advance_async_spec_decode_empty_delta_misses_end_token( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """Reproduce the bug: without new_token_ids, async + spec decode + placeholder arithmetic produces start == len(all_token_ids), yielding + an empty delta that misses the reasoning end token. + + This test documents the known limitation of the fallback path.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + + actual_deltas_seen = [] + + def capture_delta(all_ids, delta): + delta_list = list(delta) + actual_deltas_seen.append(delta_list) + return END_TOKEN in delta_list + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock(side_effect=capture_delta) + structured_req.reasoner = reasoner + + # After async scheduling + spec decode token append: + # 4 new tokens appended, num_computed_tokens adjusted to match + mock_request_with_structured_output.all_token_ids = [ + 1, + 2, + 3, + 4, + 5, + 9, + 198, + END_TOKEN, + 271, + ] + mock_request_with_structured_output.num_computed_tokens = 9 + mock_request_with_structured_output.num_output_placeholders = 0 + + # Fallback path (no new_token_ids) computes start = 9 - 0 = 9, + # but len(all_token_ids) = 9, so islice yields nothing. + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output + ) + + assert result is False + # The delta was empty, so the end token was missed + assert actual_deltas_seen == [[]] + assert structured_req.reasoning_ended is False + + # Now try with new_token_ids -- this should find the end token + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, + new_token_ids=[9, 198, END_TOKEN, 271], + ) + + assert structured_req.reasoning_ended is True + assert result is False # JSON defers + + def test_should_advance_new_token_ids_structural_tag_spec_decode( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """Structural tags with spec decode should return True on the same + step, even when detected via new_token_ids.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + structured_req.structured_output_key = ( + StructuredOutputOptions.STRUCTURAL_TAG, + "{}", + ) + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock( + side_effect=lambda all_ids, delta: END_TOKEN in delta + ) + structured_req.reasoner = reasoner + + manager_with_reasoner.vllm_config.speculative_config = Mock() + + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, + new_token_ids=[END_TOKEN, 42], + ) + + assert structured_req.reasoning_ended is True + assert result is True + + def test_should_advance_new_token_ids_no_end_token( + self, + manager_with_reasoner, + mock_request_with_structured_output, + ): + """When new_token_ids does not contain the end token, + reasoning_ended should stay False.""" + END_TOKEN = 999 + structured_req = mock_request_with_structured_output.structured_output_request + structured_req.reasoning_ended = False + + reasoner = MockReasoner(tokenizer=Mock()) + reasoner.is_reasoning_end_streaming = Mock( + side_effect=lambda all_ids, delta: END_TOKEN in delta + ) + structured_req.reasoner = reasoner + + result = manager_with_reasoner.should_advance( + mock_request_with_structured_output, + new_token_ids=[10, 20, 30], + ) + + assert structured_req.reasoning_ended is False + assert result is False diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 45a5169a3efe..aedfe284dbdc 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1413,7 +1413,9 @@ def update_from_output( request.status = RequestStatus.FINISHED_STOPPED stopped = True - if new_token_ids and self.structured_output_manager.should_advance(request): + if new_token_ids and self.structured_output_manager.should_advance( + request, new_token_ids=new_token_ids + ): struct_output_request = request.structured_output_request assert struct_output_request is not None assert struct_output_request.grammar is not None diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 6a4fcbb629ff..471c7ffd8109 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -319,7 +319,11 @@ def should_fill_bitmask(self, request: "Request") -> bool: return request.structured_output_request.reasoning_ended return True - def should_advance(self, request: "Request") -> bool: + def should_advance( + self, + request: "Request", + new_token_ids: list[int] | None = None, + ) -> bool: if not request.use_structured_output: return False @@ -342,15 +346,23 @@ def should_advance(self, request: "Request") -> bool: if structured_req.reasoning_ended: return True - # Check if reasoning ends in *this* step - delta_from = request.num_computed_tokens - request.num_output_placeholders + # Check if reasoning ends in *this* step. + # When new_token_ids is provided (token-output path), use it + # directly as the delta to avoid fragile placeholder arithmetic + # that can miss the end token under async scheduling + spec decode. all_token_ids = request.all_token_ids - start = ( - delta_from if delta_from >= 0 else max(len(all_token_ids) + delta_from, 0) - ) - if reasoner.is_reasoning_end_streaming( - all_token_ids, itertools.islice(all_token_ids, start, None) - ): + if new_token_ids is not None: + delta: Iterable[int] = new_token_ids + else: + delta_from = request.num_computed_tokens - request.num_output_placeholders + start = ( + delta_from + if delta_from >= 0 + else max(len(all_token_ids) + delta_from, 0) + ) + delta = itertools.islice(all_token_ids, start, None) + + if reasoner.is_reasoning_end_streaming(all_token_ids, delta): structured_req.reasoning_ended = True # Reasoning just ended this step. Defer FSM advance until the next