Merged

57 commits
ea395b6
added newest structural tag support for xgrammar
Hanchenli Sep 12, 2025
64d86b1
added interface, need to add tag content
Hanchenli Sep 17, 2025
52c8c77
added tag for no server
Hanchenli Sep 18, 2025
bf19e3f
added tag for no server
Hanchenli Sep 18, 2025
91ef2e8
added demo tool support to reasoning structural output
Hanchenli Sep 19, 2025
3ca306c
added demo tool support to reasoning structural output
Hanchenli Sep 19, 2025
c983d62
cleanup
Hanchenli Sep 23, 2025
9776a23
cleanup
Hanchenli Sep 23, 2025
155268c
cleanup
Hanchenli Sep 23, 2025
2b00781
Update vllm/reasoning/gptoss_reasoning_parser.py
Hanchenli Sep 24, 2025
007af2f
Update vllm/reasoning/gptoss_reasoning_parser.py
Hanchenli Sep 24, 2025
c40eedd
cleanup
Hanchenli Oct 7, 2025
c77bea3
addressed comment
Hanchenli Oct 7, 2025
30a8b2c
add ut
frank-wei Oct 8, 2025
4fe8af8
merge with main but __init__ is not solved yet
frank-wei Oct 8, 2025
762ff0f
fix various issues and make e2e test work
frank-wei Oct 9, 2025
ad6c91c
address comments
frank-wei Oct 9, 2025
7069617
Merge pull request #1 from frank-wei/frank_oss_try_fix
Hanchenli Oct 9, 2025
b1e2a66
pre-commit
frank-wei Oct 10, 2025
bb0e0a2
pre-commit fix2
frank-wei Oct 10, 2025
e9d99be
pre-commit fix3
frank-wei Oct 10, 2025
4db5f11
pre-commit fix4
frank-wei Oct 10, 2025
50de32d
Merge pull request #2 from frank-wei/frank_oss_try_fix
frank-wei Oct 10, 2025
0f67ce9
pre-commit fix5
frank-wei Oct 10, 2025
8b8917b
pre-commit fix6
frank-wei Oct 10, 2025
e937486
Merge pull request #4 from frank-wei/frank_oss_try_fix
frank-wei Oct 10, 2025
bfd05d5
pre-commit fix7
frank-wei Oct 10, 2025
44363e0
Merge pull request #5 from frank-wei/frank_oss_try_fix
frank-wei Oct 10, 2025
53668bb
pre-commit fix8
frank-wei Oct 10, 2025
df8ea16
Merge pull request #6 from frank-wei/frank_oss_try_fix
frank-wei Oct 10, 2025
443dfd9
change the need_structured_in_reasoning to structured_outputs_config
frank-wei Oct 10, 2025
a554c37
pre-commit
frank-wei Oct 10, 2025
dadf8c1
Merge pull request #7 from frank-wei/frank_oss_try_fix
frank-wei Oct 10, 2025
2d70752
fix bad merge
frank-wei Oct 11, 2025
91d1b03
Merge pull request #8 from frank-wei/frank_oss_try_fix
frank-wei Oct 11, 2025
f8a7165
change need_structured_in_reasoning to enable_in_reasoning
frank-wei Oct 11, 2025
8d2956b
pre commit
frank-wei Oct 11, 2025
14f7780
Merge pull request #9 from frank-wei/frank_oss_try_fix
frank-wei Oct 11, 2025
9763a3e
remove extra
frank-wei Oct 11, 2025
dec8673
Merge pull request #10 from frank-wei/frank_oss_try_fix
frank-wei Oct 11, 2025
f62c5b4
remove a line
frank-wei Oct 11, 2025
cf78b0c
Merge pull request #11 from frank-wei/frank_oss_try_fix
frank-wei Oct 11, 2025
c927587
merge with main
frank-wei Oct 13, 2025
435d565
pre-commit
frank-wei Oct 13, 2025
1f6d518
Merge pull request #12 from frank-wei/frank_oss_try_fix
frank-wei Oct 13, 2025
917ae4f
address comments
frank-wei Oct 14, 2025
4d47592
pre-comit
frank-wei Oct 14, 2025
3c7c0b4
Merge pull request #13 from frank-wei/frank_oss_try_fix
frank-wei Oct 14, 2025
00b6063
Merge branch 'main' into lhc/oss-try
frank-wei Oct 15, 2025
7f36c11
merge main
frank-wei Oct 15, 2025
07173f1
fix the bug when StructuredOutputsParams is empty for constraints
frank-wei Oct 15, 2025
78642d0
pre-commit
frank-wei Oct 15, 2025
90b5a70
Merge pull request #14 from frank-wei/frank_oss_try_fix
frank-wei Oct 15, 2025
ce880b1
fix a bug when other constraints are not empty
frank-wei Oct 16, 2025
a781978
pre-commit
frank-wei Oct 16, 2025
677364e
Merge pull request #15 from frank-wei/frank_oss_try_fix
frank-wei Oct 16, 2025
8be9e91
Merge branch 'main' into lhc/oss-try
DarkLight1337 Oct 18, 2025
280 changes: 280 additions & 0 deletions tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
@@ -0,0 +1,280 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""Integration tests for GPT-OSS structural tags functionality (PR #25515)."""

import json
from unittest.mock import Mock

import pytest

from vllm.entrypoints.openai.protocol import (
    StructuredOutputsParams,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.reasoning.gptoss_reasoning_parser import (
    GptOssReasoningParser,
)


class TestGptOssStructuralTagsIntegration:
    """Integration tests for structural tags in GPT-OSS tool calls."""

    @pytest.fixture
    def mock_tokenizer(self):
        """Create a mock tokenizer."""
        tokenizer = Mock()
        tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
        return tokenizer

    @pytest.fixture
    def gptoss_parser(self, mock_tokenizer):
        """Create a real GptOssReasoningParser instance."""
        return GptOssReasoningParser(mock_tokenizer)

    @pytest.fixture
    def tool_server_with_python(self):
        """Create a tool server with Python tool enabled."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
        return tool_server

    @pytest.fixture
    def tool_server_empty(self):
        """Create a tool server with no tools."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(return_value=False)
        return tool_server

    def test_end_to_end_no_tools(self, gptoss_parser):
        """Test end-to-end flow when no tools are available."""
        # Test the parser directly
        result = gptoss_parser.prepare_structured_tag(None, None)
        parsed_result = json.loads(result)

        # Verify basic structure
        assert parsed_result["type"] == "structural_tag"
        assert parsed_result["format"]["type"] == "triggered_tags"
        assert len(parsed_result["format"]["tags"]) == 1

        # Verify only the analysis channel is allowed
        analysis_tag = parsed_result["format"]["tags"][0]
        assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
        assert analysis_tag["content"]["type"] == "any_text"
        assert analysis_tag["end"] == "<|end|>"

        # Verify triggers
        assert parsed_result["format"]["triggers"] == ["<|channel|>analysis"]
        assert parsed_result["format"]["stop_after_first"] is False

    def test_end_to_end_with_python_tool(self, gptoss_parser, tool_server_with_python):
        """Test end-to-end flow with the Python tool enabled."""
        result = gptoss_parser.prepare_structured_tag(None, tool_server_with_python)
        parsed_result = json.loads(result)

        # Should have the analysis tag + 2 python tags
        assert len(parsed_result["format"]["tags"]) == 3

        # Verify all expected tags are present
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        expected_begins = [
            "<|channel|>analysis<|message|>",
            "<|channel|>commentary to=python",
            "<|channel|>analysis to=python",
        ]

        for expected in expected_begins:
            assert expected in tag_begins

        # Verify triggers include commentary
        assert "<|channel|>analysis" in parsed_result["format"]["triggers"]
        assert "<|channel|>commentary to=" in parsed_result["format"]["triggers"]

    def test_structured_outputs_params_integration(
        self, gptoss_parser, tool_server_with_python
    ):
        """Test integration with StructuredOutputsParams."""
        # Generate the structural tag
        structural_tag = gptoss_parser.prepare_structured_tag(
            None, tool_server_with_python
        )

        # Create StructuredOutputsParams
        params = StructuredOutputsParams(structural_tag=structural_tag)

        # Verify the tag is properly stored and accessible
        assert params.structural_tag == structural_tag

        # Verify the tag is valid JSON
        parsed_tag = json.loads(params.structural_tag)
        assert parsed_tag["type"] == "structural_tag"

    @pytest.mark.parametrize(
        "browser, python, container, expected_tags",
        [
            # No tools
            (False, False, False, 1),
            # Single tool
            (True, False, False, 3),
            # Multiple tools
            (True, True, False, 5),
            # All tools
            (True, True, True, 7),
        ],
    )
    def test_tool_server_interaction_flow(
        self, gptoss_parser, browser, python, container, expected_tags
    ):
        """Test the complete tool server interaction flow."""

        # Create a mock ToolServer
        tool_server = Mock(spec=ToolServer)

        # Simulate tool availability based on parameters
        tool_server.has_tool = Mock(
            side_effect=lambda tool: {
                "browser": browser,
                "python": python,
                "container": container,
            }.get(tool, False)
        )

        # Run the parser and verify results
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)

        # Validate the number of tags
        assert len(parsed_result["format"]["tags"]) == expected_tags

        # Verify tool-specific tags exist for enabled tools
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        for tool, enabled in {
            "browser": browser,
            "python": python,
            "container": container,
        }.items():
            if enabled:
                assert f"<|channel|>commentary to={tool}" in tag_begins
                assert f"<|channel|>analysis to={tool}" in tag_begins

    def test_original_tag_preservation(self, gptoss_parser, tool_server_with_python):
        """Test that original tags are preserved when provided."""
        original_tag = '{"type": "custom_tag", "data": "preserved"}'

        result = gptoss_parser.prepare_structured_tag(
            original_tag, tool_server_with_python
        )

        # Should return the original tag unchanged
        assert result == original_tag

    @pytest.mark.parametrize(
        "tools",
        [
            [],
            ["browser"],
            ["python"],
            ["container"],
            ["browser", "python"],
            ["browser", "container"],
            ["python", "container"],
            ["browser", "python", "container"],
        ],
    )
    def test_json_validity_comprehensive(self, gptoss_parser, tools):
        """Test JSON validity across all possible tool combinations."""

        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool in tools)

        result = gptoss_parser.prepare_structured_tag(None, tool_server)

        # Should be valid JSON
        parsed_result = json.loads(result)

        # Should have the correct structure
        assert parsed_result["type"] == "structural_tag"
        assert "format" in parsed_result
        assert "tags" in parsed_result["format"]
        assert "triggers" in parsed_result["format"]

        # Tag count should be: 1 (analysis) + 2 * len(tools)
        expected_tag_count = 1 + (2 * len(tools))
        assert len(parsed_result["format"]["tags"]) == expected_tag_count

    def test_error_handling_invalid_tool_server(self, gptoss_parser):
        """Test error propagation from a failing tool server."""
        # Tool server that raises exceptions
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=Exception("Tool server error"))

        # The parser does not swallow tool-server failures: the exception
        # propagates to the caller
        with pytest.raises(Exception, match="Tool server error"):
            gptoss_parser.prepare_structured_tag(None, tool_server)

    def test_concurrent_requests_isolation(self, gptoss_parser):
        """Test that concurrent requests don't interfere with each other."""
        # Simulate concurrent requests with different tool servers
        tool_server_1 = Mock(spec=ToolServer)
        tool_server_1.has_tool = Mock(side_effect=lambda tool: tool == "python")

        tool_server_2 = Mock(spec=ToolServer)
        tool_server_2.has_tool = Mock(side_effect=lambda tool: tool == "browser")

        # Generate tags back to back, as interleaved requests would
        result_1 = gptoss_parser.prepare_structured_tag(None, tool_server_1)
        result_2 = gptoss_parser.prepare_structured_tag(None, tool_server_2)

        # Parse results
        parsed_1 = json.loads(result_1)
        parsed_2 = json.loads(result_2)

        # Verify they have different tool configurations
        tags_1 = [tag["begin"] for tag in parsed_1["format"]["tags"]]
        tags_2 = [tag["begin"] for tag in parsed_2["format"]["tags"]]

        # Result 1 should have python tags only
        assert "<|channel|>commentary to=python" in tags_1
        assert "<|channel|>commentary to=browser" not in tags_1

        # Result 2 should have browser tags only
        assert "<|channel|>commentary to=browser" in tags_2
        assert "<|channel|>commentary to=python" not in tags_2

    def test_tag_format_consistency(self, gptoss_parser):
        """Test that all generated tags follow a consistent format."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(
            side_effect=lambda tool: tool in ["python", "browser"]
        )

        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)

        # Verify all tags have the required fields
        for tag in parsed_result["format"]["tags"]:
            assert "begin" in tag
            assert "content" in tag
            assert "end" in tag
            assert tag["content"]["type"] == "any_text"
            assert tag["end"] == "<|end|>"

            # Verify the begin format
            assert tag["begin"].startswith("<|channel|>")

    def test_trigger_configuration(self, gptoss_parser):
        """Test trigger configuration for different tool setups."""
        # Test with no tools
        result_no_tools = gptoss_parser.prepare_structured_tag(None, None)
        parsed_no_tools = json.loads(result_no_tools)
        assert parsed_no_tools["format"]["triggers"] == ["<|channel|>analysis"]

        # Test with tools
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")

        result_with_tools = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_with_tools = json.loads(result_with_tools)

        expected_triggers = ["<|channel|>analysis", "<|channel|>commentary to="]
        assert set(parsed_with_tools["format"]["triggers"]) == set(expected_triggers)
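
Taken together, the assertions above pin down the overall shape of the generated tag. As a reading aid, here is a sketch of what prepare_structured_tag(None, tool_server_with_python) would produce, reconstructed purely from those assertions (field order is illustrative, not captured parser output):

expected_python_only_tag = {
    "type": "structural_tag",
    "format": {
        "type": "triggered_tags",
        "tags": [
            # Free-form reasoning channel, always present
            {
                "begin": "<|channel|>analysis<|message|>",
                "content": {"type": "any_text"},
                "end": "<|end|>",
            },
            # Two tags per enabled tool (here: python)
            {
                "begin": "<|channel|>commentary to=python",
                "content": {"type": "any_text"},
                "end": "<|end|>",
            },
            {
                "begin": "<|channel|>analysis to=python",
                "content": {"type": "any_text"},
                "end": "<|end|>",
            },
        ],
        "triggers": ["<|channel|>analysis", "<|channel|>commentary to="],
        "stop_after_first": False,
    },
}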
46 changes: 46 additions & 0 deletions tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -864,3 +864,49 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
    # non-structured outputs requests should not return a valid JSON here
    with pytest.raises(ValueError):
        output_json = json.loads(generated_text)


@pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"])
def test_structured_output_with_structural_tag(
Review thread on this test:

Collaborator: this is more of an e2e test, can we add some unit test coverage?

Collaborator: also for e2e test, can we also cover logprobs usage here?

Hanchenli (Contributor, author): @frank-wei Could you help with the unit tests? Things are a little caught up here on my end.

frank-wei (Contributor, Oct 7, 2025): @Hanchenli , NP. I planned to add some UT. Will also cover the logprobs as the output. cc @yeqcharlotte

(A sketch of the logprobs coverage discussed here follows this diff.)

    monkeypatch: pytest.MonkeyPatch,
    guided_decoding_backend: str,
):
    monkeypatch.setenv("VLLM_USE_V1", "1")

    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        guided_decoding_backend=guided_decoding_backend,
    )

    structural_tag_config = {
        "type": "structural_tag",
        "format": {
            "type": "triggered_tags",
            "tags": [
                {"begin": "hello_flag", "content": {"type": "any_text"}, "end": "hello"}
            ],
            "triggers": ["hello"],
            "stop_after_first": False,
        },
    }

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=500,
        guided_decoding=StructuredOutputsParams(
            structural_tag=json.dumps(structural_tag_config)
        ),
    )

prompt = "Hello and repete hello 10 times, do not say anything else. Only say hello hello hello, now start"
    outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        assert "hello_flag" in generated_text, (
            f"Expected 'hello_flag' to be in generated text, but got: {generated_text}"
        )
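
Per the review thread above, logprobs coverage was planned for this e2e test. A minimal sketch of what that could look like, reusing llm, prompt, and structural_tag_config from the test above and assuming vLLM's standard logprobs parameter on SamplingParams and the per-token logprobs list on CompletionOutput; this extension is not part of the diff:

    # Hypothetical follow-up: request logprobs alongside the structural-tag
    # constraint and check that they come back well-formed.
    sampling_params_with_logprobs = SamplingParams(
        temperature=0.0,
        max_tokens=500,
        logprobs=5,  # top-5 logprobs per generated token
        guided_decoding=StructuredOutputsParams(
            structural_tag=json.dumps(structural_tag_config)
        ),
    )

    outputs = llm.generate(prompt, sampling_params=sampling_params_with_logprobs)
    for output in outputs:
        completion = output.outputs[0]
        # One logprobs dict per generated token
        assert completion.logprobs is not None
        assert len(completion.logprobs) == len(completion.token_ids)
        for token_logprobs in completion.logprobs:
            # Each entry maps token id -> Logprob; the sampled token is always
            # included, so there may be up to logprobs + 1 entries
            assert 1 <= len(token_logprobs) <= 6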