Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions tests/entrypoints/llm/test_struct_output_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,14 @@ def test_structured_output(
),
("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False),
("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True),
(
"Qwen/Qwen3-1.7B",
"xgrammar",
"auto",
"qwen3",
NGRAM_SPEC_CONFIG,
True,
),
],
)
def test_structured_output_with_reasoning_matrices(
Expand Down
158 changes: 158 additions & 0 deletions tests/v1/structured_output/test_reasoning_structured_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,161 @@ def test_should_advance_reasoning_already_ended(

# Should return True since reasoning has ended
assert result is True

def test_should_advance_with_new_token_ids_detects_reasoning_end(
    self,
    manager_with_reasoner,
    mock_request_with_structured_output,
):
    """An explicit ``new_token_ids`` must be the delta scanned for reasoning end.

    The request is arranged so that the placeholder-derived window would be
    empty (``num_computed_tokens == len(all_token_ids)``); the end token is
    only reachable through the ``new_token_ids`` argument.
    """
    END_TOKEN = 999
    structured_req = mock_request_with_structured_output.structured_output_request
    structured_req.reasoning_ended = False

    reasoner = MockReasoner(tokenizer=Mock())
    reasoner.is_reasoning_end_streaming = Mock(
        side_effect=lambda all_ids, delta: END_TOKEN in delta
    )
    structured_req.reasoner = reasoner

    # Simulate async + spec decode where placeholder math would produce
    # an empty delta window: num_computed_tokens == len(all_token_ids)
    mock_request_with_structured_output.all_token_ids = [
        1,
        2,
        3,
        END_TOKEN,
        10,
    ]
    mock_request_with_structured_output.num_computed_tokens = 5
    mock_request_with_structured_output.num_output_placeholders = 0

    new_token_ids = [9, 198, END_TOKEN, 271]

    result = manager_with_reasoner.should_advance(
        mock_request_with_structured_output, new_token_ids=new_token_ids
    )

    assert structured_req.reasoning_ended is True
    # JSON type defers FSM advance to next step
    assert result is False
    # Verify we used new_token_ids, not the placeholder-derived delta:
    # a bare call-count check would also pass if the (empty) fallback
    # window had been handed to the reasoner, so inspect the argument.
    reasoner.is_reasoning_end_streaming.assert_called_once()
    _, seen_delta = reasoner.is_reasoning_end_streaming.call_args.args
    assert list(seen_delta) == new_token_ids

def test_should_advance_async_spec_decode_empty_delta_misses_end_token(
    self,
    manager_with_reasoner,
    mock_request_with_structured_output,
):
    """Document the fallback-path limitation and the ``new_token_ids`` remedy.

    Phase 1: with no ``new_token_ids``, the window start is
    ``num_computed_tokens - num_output_placeholders``. Here that equals
    ``len(all_token_ids)``, so the reasoner receives an empty delta and the
    end token goes unnoticed.

    Phase 2: the same request, called with the freshly sampled tokens passed
    explicitly, flips ``reasoning_ended``.
    """
    end_token_id = 999
    request = mock_request_with_structured_output
    so_request = request.structured_output_request
    so_request.reasoning_ended = False

    # Every delta handed to the reasoner, materialised for later inspection.
    recorded_windows = []

    def record_and_scan(all_ids, delta):
        window = list(delta)
        recorded_windows.append(window)
        return end_token_id in window

    mock_reasoner = MockReasoner(tokenizer=Mock())
    mock_reasoner.is_reasoning_end_streaming = Mock(side_effect=record_and_scan)
    so_request.reasoner = mock_reasoner

    # State after async scheduling + spec decode appended four tokens and
    # num_computed_tokens was adjusted to match the new length.
    request.all_token_ids = [1, 2, 3, 4, 5, 9, 198, end_token_id, 271]
    request.num_computed_tokens = 9
    request.num_output_placeholders = 0

    # --- Phase 1: fallback path; start = 9 - 0 = 9 == len(all_token_ids) ---
    fallback_result = manager_with_reasoner.should_advance(request)

    assert fallback_result is False
    # Exactly one call was made and its delta was empty.
    assert recorded_windows == [[]]
    assert so_request.reasoning_ended is False

    # --- Phase 2: explicit delta containing the end token ---
    explicit_result = manager_with_reasoner.should_advance(
        request,
        new_token_ids=[9, 198, end_token_id, 271],
    )

    assert so_request.reasoning_ended is True
    # JSON defers the advance to the following step.
    assert explicit_result is False

def test_should_advance_new_token_ids_structural_tag_spec_decode(
    self,
    manager_with_reasoner,
    mock_request_with_structured_output,
):
    """Structural-tag requests under spec decode advance on the same step.

    The end token is supplied through ``new_token_ids``; the test expects
    ``reasoning_ended`` to be set and ``should_advance`` to return ``True``
    immediately.
    """
    end_token_id = 999
    request = mock_request_with_structured_output
    so_request = request.structured_output_request
    so_request.reasoning_ended = False
    so_request.structured_output_key = (StructuredOutputOptions.STRUCTURAL_TAG, "{}")

    def delta_has_end_token(all_ids, delta):
        return end_token_id in delta

    mock_reasoner = MockReasoner(tokenizer=Mock())
    mock_reasoner.is_reasoning_end_streaming = Mock(side_effect=delta_has_end_token)
    so_request.reasoner = mock_reasoner

    # Any non-None speculative config turns on the spec-decode branch.
    manager_with_reasoner.vllm_config.speculative_config = Mock()

    outcome = manager_with_reasoner.should_advance(
        request, new_token_ids=[end_token_id, 42]
    )

    assert so_request.reasoning_ended is True
    assert outcome is True

def test_should_advance_new_token_ids_no_end_token(
    self,
    manager_with_reasoner,
    mock_request_with_structured_output,
):
    """A ``new_token_ids`` delta without the end token leaves reasoning open.

    ``reasoning_ended`` must remain ``False`` and ``should_advance`` must
    return ``False``.
    """
    end_token_id = 999
    request = mock_request_with_structured_output
    so_request = request.structured_output_request
    so_request.reasoning_ended = False

    def delta_has_end_token(all_ids, delta):
        return end_token_id in delta

    mock_reasoner = MockReasoner(tokenizer=Mock())
    mock_reasoner.is_reasoning_end_streaming = Mock(side_effect=delta_has_end_token)
    so_request.reasoner = mock_reasoner

    # None of these ids equals the end token.
    outcome = manager_with_reasoner.should_advance(
        request, new_token_ids=[10, 20, 30]
    )

    assert so_request.reasoning_ended is False
    assert outcome is False
4 changes: 3 additions & 1 deletion vllm/v1/core/sched/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1413,7 +1413,9 @@ def update_from_output(
request.status = RequestStatus.FINISHED_STOPPED
stopped = True

if new_token_ids and self.structured_output_manager.should_advance(request):
if new_token_ids and self.structured_output_manager.should_advance(
request, new_token_ids=new_token_ids
):
struct_output_request = request.structured_output_request
assert struct_output_request is not None
assert struct_output_request.grammar is not None
Expand Down
30 changes: 21 additions & 9 deletions vllm/v1/structured_output/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,11 @@ def should_fill_bitmask(self, request: "Request") -> bool:
return request.structured_output_request.reasoning_ended
return True

def should_advance(self, request: "Request") -> bool:
def should_advance(
self,
request: "Request",
new_token_ids: list[int] | None = None,
) -> bool:
if not request.use_structured_output:
return False

Expand All @@ -342,15 +346,23 @@ def should_advance(self, request: "Request") -> bool:
if structured_req.reasoning_ended:
return True

# Check if reasoning ends in *this* step
delta_from = request.num_computed_tokens - request.num_output_placeholders
# Check if reasoning ends in *this* step.
# When new_token_ids is provided (token-output path), use it
# directly as the delta to avoid fragile placeholder arithmetic
# that can miss the end token under async scheduling + spec decode.
all_token_ids = request.all_token_ids
start = (
delta_from if delta_from >= 0 else max(len(all_token_ids) + delta_from, 0)
)
if reasoner.is_reasoning_end_streaming(
all_token_ids, itertools.islice(all_token_ids, start, None)
):
if new_token_ids is not None:
delta: Iterable[int] = new_token_ids
else:
delta_from = request.num_computed_tokens - request.num_output_placeholders
start = (
delta_from
if delta_from >= 0
else max(len(all_token_ids) + delta_from, 0)
)
delta = itertools.islice(all_token_ids, start, None)

if reasoner.is_reasoning_end_streaming(all_token_ids, delta):
structured_req.reasoning_ended = True

# Reasoning just ended this step. Defer FSM advance until the next
Expand Down
Loading