Skip to content
3 changes: 3 additions & 0 deletions docs/features/reasoning_outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner

def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids

def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
return self.end_token_id in delta_ids
...
```

Expand Down
35 changes: 35 additions & 0 deletions tests/reasoning/test_base_thinking_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,41 @@ def test_is_reasoning_end(self, test_tokenizer):
is False
)

def test_is_reasoning_end_streaming(self, test_tokenizer):
    """Test the is_reasoning_end_streaming method."""
    parser = TestThinkingReasoningParser(test_tokenizer)
    end_id = parser.end_token_id
    start_id = parser.start_token_id

    # Each case: (full output so far, tokens from this decode step, expected).
    cases = [
        ([1, 2, end_id], [end_id], True),
        ([1, 2, 3, 4], [4], False),
        ([], [], False),
        ([1, start_id, 2, end_id], [end_id], True),
        ([1, start_id, 2, 3], [3], False),
        ([1, start_id, 2, end_id, 2, start_id, 2], [2], False),
        ([1, start_id, 2, end_id, 2, 2], [2], False),
    ]
    for input_ids, delta_ids, expected in cases:
        assert (
            parser.is_reasoning_end_streaming(input_ids, delta_ids) is expected
        )

def test_extract_content_ids(self, test_tokenizer):
"""Test the extract_content_ids method."""
parser = TestThinkingReasoningParser(test_tokenizer)
Expand Down
1 change: 1 addition & 0 deletions tests/reasoning/test_deepseekv3_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
input_tokens = tokenizer.tokenize(input_text)
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
assert parser.is_reasoning_end(input_ids) is True
assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True

# Test extract_content_ids returns all input_ids
assert parser.extract_content_ids(input_ids) == input_ids
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def mock_request_with_structured_output(self):
request.use_structured_output = True
request.prompt_token_ids = [1, 2, 3, 4, 5]
request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
request.num_computed_tokens = 5
return request

def test_should_fill_bitmask_with_enable_in_reasoning(
Expand Down
25 changes: 25 additions & 0 deletions vllm/reasoning/abs_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,31 @@ def is_reasoning_end(self, input_ids: list[int]) -> bool:
True if the reasoning content ends in the input_ids.
"""

def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Check whether the reasoning content has ended during a decode step.

    Structured-output engines such as `xgrammar` call this while
    streaming to detect, incrementally, when the model's reasoning
    phase finishes. `input_ids` is the entire model output so far and
    `delta_ids` holds the most recently computed tokens of that output
    (i.e. the current decode step).

    Parameters:
        input_ids: list[int]
            The entire model output.
        delta_ids: list[int]
            The last few computed tokens of the model output at the
            current decode step.

    Returns:
        bool
            True if the reasoning content ends in `delta_ids` on a
            decode step.
    """
    # Default implementation: defer to the full-output check so that
    # subclasses only need to override this for a cheaper delta-based
    # scan.
    return self.is_reasoning_end(input_ids)

@abstractmethod
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
"""
Expand Down
6 changes: 6 additions & 0 deletions vllm/reasoning/basic_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ def is_reasoning_end(self, input_ids: list[int]) -> bool:
return True
return False

def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Check whether the end-of-reasoning token was emitted this step.

    Unlike `is_reasoning_end`, which scans the entire output, this
    only inspects the newly generated `delta_ids`, so each decode
    step costs O(len(delta_ids)).

    Parameters:
        input_ids: list[int]
            The entire model output (unused here; the end token can
            only appear in the newly generated tokens).
        delta_ids: list[int]
            The tokens computed at the current decode step.

    Returns:
        bool
            True if `delta_ids` contains the end-of-reasoning token.
    """
    # The local alias of self.end_token_id served no purpose; test
    # membership directly.
    return self.end_token_id in delta_ids

def extract_content_ids(self, input_ids: list[int]) -> list[int]:
"""
Extract the content after the end tokens
Expand Down
5 changes: 5 additions & 0 deletions vllm/reasoning/deepseek_v3_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
return self._parser.is_reasoning_end(input_ids)

def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    # Pure delegation: the wrapped parser decides whether reasoning
    # ended in this decode step.
    inner = self._parser
    return inner.is_reasoning_end_streaming(input_ids, delta_ids)

def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return self._parser.extract_content_ids(input_ids)

Expand Down
5 changes: 5 additions & 0 deletions vllm/reasoning/holo2_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
return self._parser.is_reasoning_end(input_ids)

def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    # Forward the streaming end-of-reasoning check to the delegate
    # parser unchanged.
    return self._parser.is_reasoning_end_streaming(
        input_ids,
        delta_ids,
    )

def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return self._parser.extract_content_ids(input_ids)

Expand Down
5 changes: 5 additions & 0 deletions vllm/reasoning/identity_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def is_reasoning_end(self, input_ids: list[int]) -> bool:
# Always return True, since we never treat reasoning specially
return True

def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Always report that reasoning has ended.

    The identity parser never treats reasoning specially (see
    `is_reasoning_end`), so every decode step is considered to be past
    the reasoning phase regardless of `input_ids` or `delta_ids`.

    Returns:
        bool
            Always True.
    """
    return True

def extract_content_ids(self, input_ids: list[int]) -> list[int]:
# Identity: return all tokens as content
return input_ids
Expand Down
4 changes: 3 additions & 1 deletion vllm/v1/structured_output/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,9 @@ def should_advance(self, request: Request) -> bool:
return True

# Check if reasoning ends in *this* step
if self.reasoner.is_reasoning_end(request.all_token_ids):
if self.reasoner.is_reasoning_end_streaming(
request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
):
# Reasoning just ended, so we shouldn't advance til
# next pass
structured_req.reasoning_ended = True
Expand Down