From 0985a564f4884e339ceda19b1b9667df77df6936 Mon Sep 17 00:00:00 2001 From: Will Deines Date: Tue, 17 Mar 2026 12:16:55 -0400 Subject: [PATCH 01/10] feat(responses): unified tool_choice + structured output via triggered tags Extend prepare_structured_tag() to be the single authority for all generation constraints in GPT-OSS Harmony models: channel structure, tool enforcement, argument validation, and content constraints. tool_choice=required support: - New from_function_tool_to_tag() and tag_with_function_tools() helpers - prepare_structured_tag() extended with tool_choice, function_tools params - Channel blocking: omit <|channel|>final trigger to force tool calls - Remove NotImplementedError for non-auto tool_choice in Harmony path Absorbed from upstream PR #35904 (structured output + reasoning): - Content constraint embedding in <|channel|>final tag - _constraint_to_content_format() and _extract_response_format_schema() - struct_out is None branch (reasoning tags always applied) - inject_response_formats() for Harmony cookbook compliance - json_object format handling (was silently ignored) - Streaming .model_dump() alias bug fix Signed-off-by: Will Deines --- .../openai/parser/test_harmony_utils.py | 39 ++ .../openai/responses/test_response_formats.py | 96 ++++ .../openai/responses/test_sampling_params.py | 19 + .../responses/test_structured_output.py | 71 +++ .../reasoning/test_gptoss_reasoning_parser.py | 415 ++++++++++++++++++ .../openai/parser/harmony_utils.py | 20 + vllm/entrypoints/openai/responses/protocol.py | 4 + vllm/entrypoints/openai/responses/serving.py | 167 ++++++- vllm/reasoning/abs_reasoning_parsers.py | 18 +- vllm/reasoning/gptoss_reasoning_parser.py | 125 ++++-- 10 files changed, 926 insertions(+), 48 deletions(-) create mode 100644 tests/entrypoints/openai/responses/test_response_formats.py diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index 
21b53dff1507..01ac12bd9408 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -10,6 +10,7 @@ get_encoding, get_system_message, has_custom_tools, + inject_response_formats, parse_chat_input_to_harmony_message, parse_chat_output, ) @@ -928,3 +929,41 @@ def test_reasoning_with_empty_content_returns_none(self): msg = response_input_to_harmony(item, prev_responses=[]) assert msg is None + + +class TestInjectResponseFormats: + def test_appends_to_existing_instructions(self): + result = inject_response_formats("You are helpful.", {"type": "object"}) + assert result.startswith("You are helpful.") + assert "# Response Formats" in result + assert '{"type":"object"}' in result + + def test_none_instructions_creates_section(self): + result = inject_response_formats(None, {"type": "object"}) + assert result.startswith("# Response Formats") + assert '{"type":"object"}' in result + + def test_custom_format_name(self): + result = inject_response_formats( + None, {"type": "object"}, format_name="order" + ) + assert "## order" in result + + def test_compact_json_no_spaces(self): + schema = { + "type": "object", + "properties": {"name": {"type": "string"}}, + } + result = inject_response_formats(None, schema) + assert ( + '{"type":"object","properties":{"name":{"type":"string"}}}' + in result + ) + + def test_section_separated_by_blank_lines(self): + result = inject_response_formats( + "Instructions here.", {"type": "object"} + ) + assert ( + "\n\n# Response Formats\n\n## structured_output\n\n" in result + ) diff --git a/tests/entrypoints/openai/responses/test_response_formats.py b/tests/entrypoints/openai/responses/test_response_formats.py new file mode 100644 index 000000000000..61681dfbf8ec --- /dev/null +++ b/tests/entrypoints/openai/responses/test_response_formats.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + 
+"""Tests for response format schema extraction and developer message injection. + +These tests verify that structured output schemas are correctly extracted from +ResponsesRequest and injected into the Harmony developer message per the +Harmony cookbook specification. +""" + +from openai.types.responses.response_format_text_json_schema_config import ( + ResponseFormatTextJSONSchemaConfig, +) + +from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponseTextConfig, +) +from vllm.entrypoints.openai.responses.serving import ( + _extract_response_format_schema, +) +from vllm.sampling_params import StructuredOutputsParams + + +def _make_json_schema_text_config(schema: dict) -> ResponseTextConfig: + text_config = ResponseTextConfig() + text_config.format = ResponseFormatTextJSONSchemaConfig( + type="json_schema", + name="test_schema", + schema=schema, + ) + return text_config + + +class TestExtractResponseFormatSchema: + def test_extracts_from_text_format_json_schema(self): + schema = { + "type": "object", + "properties": {"name": {"type": "string"}}, + } + request = ResponsesRequest( + model="test-model", + input="test", + text=_make_json_schema_text_config(schema), + ) + result = _extract_response_format_schema(request) + assert result == schema + + def test_extracts_from_structured_outputs_json(self): + schema = { + "type": "object", + "properties": {"id": {"type": "integer"}}, + } + request = ResponsesRequest( + model="test-model", + input="test", + structured_outputs=StructuredOutputsParams(json=schema), + ) + result = _extract_response_format_schema(request) + assert result == schema + + def test_returns_none_for_text_format(self): + request = ResponsesRequest( + model="test-model", + input="test", + text=ResponseTextConfig(format={"type": "text"}), + ) + result = _extract_response_format_schema(request) + assert result is None + + def test_returns_none_for_no_format(self): + request = ResponsesRequest( + model="test-model", + 
input="test", + ) + result = _extract_response_format_schema(request) + assert result is None + + def test_text_format_takes_precedence(self): + """text.format.json_schema is checked before structured_outputs.""" + text_schema = { + "type": "object", + "properties": {"a": {"type": "string"}}, + } + so_schema = { + "type": "object", + "properties": {"b": {"type": "string"}}, + } + request = ResponsesRequest( + model="test-model", + input="test", + text=_make_json_schema_text_config(text_schema), + structured_outputs=StructuredOutputsParams(json=so_schema), + ) + result = _extract_response_format_schema(request) + assert result == text_schema diff --git a/tests/entrypoints/openai/responses/test_sampling_params.py b/tests/entrypoints/openai/responses/test_sampling_params.py index 87910271dd75..7509489ca3c4 100644 --- a/tests/entrypoints/openai/responses/test_sampling_params.py +++ b/tests/entrypoints/openai/responses/test_sampling_params.py @@ -132,6 +132,25 @@ def test_structured_outputs_passed_through(self): assert sampling_params.structured_outputs is not None assert sampling_params.structured_outputs.grammar == "root ::= 'hello'" + def test_json_object_format_produces_structured_outputs(self): + """Test that text.format.type=json_object creates StructuredOutputsParams.""" + from openai.types.shared.response_format_json_object import ( + ResponseFormatJSONObject, + ) + + text_config = ResponseTextConfig() + text_config.format = ResponseFormatJSONObject(type="json_object") + request = ResponsesRequest( + model="test-model", + input="test input", + text=text_config, + ) + + sampling_params = request.to_sampling_params(default_max_tokens=1000) + + assert sampling_params.structured_outputs is not None + assert sampling_params.structured_outputs.json_object is True + def test_structured_outputs_and_json_schema_conflict(self): """Test that specifying both structured_outputs and json_schema raises.""" structured_outputs = StructuredOutputsParams(grammar="root ::= 
'hello'") diff --git a/tests/entrypoints/openai/responses/test_structured_output.py b/tests/entrypoints/openai/responses/test_structured_output.py index db8b87768e44..16742708041c 100644 --- a/tests/entrypoints/openai/responses/test_structured_output.py +++ b/tests/entrypoints/openai/responses/test_structured_output.py @@ -6,6 +6,11 @@ import pytest from pydantic import BaseModel +from vllm.entrypoints.openai.responses.serving import ( + _constraint_to_content_format, +) +from vllm.sampling_params import StructuredOutputsParams + @pytest.mark.asyncio async def test_structured_output(client: openai.AsyncOpenAI): @@ -76,3 +81,69 @@ class CalendarEvent(BaseModel): assert len(participants) == 2 assert participants[0] == "Alice" assert participants[1] == "Bob" + + +class TestConstraintToContentFormat: + """Test _constraint_to_content_format helper.""" + + def test_json_schema_string_is_parsed(self): + """JSON schema passed as a string gets json.loads'd into a dict.""" + schema = {"type": "object", "properties": {"age": {"type": "integer"}}} + params = StructuredOutputsParams(json=json.dumps(schema)) + result = _constraint_to_content_format(params) + + assert result == {"type": "json_schema", "json_schema": schema} + + def test_json_schema_dict(self): + """JSON schema passed as a dict is used directly.""" + schema = {"type": "object", "properties": {"age": {"type": "integer"}}} + params = StructuredOutputsParams(json=schema) + result = _constraint_to_content_format(params) + + assert result == {"type": "json_schema", "json_schema": schema} + + def test_json_object(self): + """json_object maps to minimal JSON schema.""" + params = StructuredOutputsParams(json_object=True) + result = _constraint_to_content_format(params) + + assert result == { + "type": "json_schema", + "json_schema": {"type": "object"}, + } + + def test_regex(self): + """Regex constraint is converted correctly.""" + params = StructuredOutputsParams(regex=r"\d+") + result = 
_constraint_to_content_format(params) + + assert result == {"type": "regex", "pattern": r"\d+"} + + def test_grammar(self): + """Grammar constraint is converted correctly.""" + params = StructuredOutputsParams(grammar="root ::= 'hello'") + result = _constraint_to_content_format(params) + + assert result == {"type": "grammar", "grammar": "root ::= 'hello'"} + + def test_choice(self): + """Choice constraint is converted correctly.""" + params = StructuredOutputsParams(choice=["yes", "no"]) + result = _constraint_to_content_format(params) + + assert result == { + "type": "or", + "elements": [ + {"type": "const_string", "value": "yes"}, + {"type": "const_string", "value": "no"}, + ], + } + + def test_structural_tag_only_returns_none(self): + """structural_tag is not a content constraint -- should return None.""" + params = StructuredOutputsParams( + structural_tag='{"type": "structural_tag"}' + ) + result = _constraint_to_content_format(params) + + assert result is None diff --git a/tests/reasoning/test_gptoss_reasoning_parser.py b/tests/reasoning/test_gptoss_reasoning_parser.py index 3b1327acb688..1329e05e269f 100644 --- a/tests/reasoning/test_gptoss_reasoning_parser.py +++ b/tests/reasoning/test_gptoss_reasoning_parser.py @@ -12,7 +12,10 @@ from vllm.reasoning.gptoss_reasoning_parser import ( GptOssReasoningParser, from_builtin_tool_to_tag, + from_function_tool_to_tag, no_func_reasoning_tag, + tag_with_builtin_funcs, + tag_with_function_tools, ) REASONING_MODEL_NAME = "openai/gpt-oss-120b" @@ -280,3 +283,415 @@ def test_tag_format_consistency(self, reasoning_parser): assert tag["content"]["type"] == "any_text" assert tag["end"] == "<|end|>" assert tag["begin"].startswith("<|channel|>") + + # --- Fixtures for tool_choice / function_tools tests --- + + @pytest.fixture + def mock_tool_server_empty(self): + """Create a mock ToolServer with no tools.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock(return_value=False) + return tool_server + + 
@pytest.fixture + def mock_tool_server_with_browser(self): + """Create a mock ToolServer with browser tool.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock(side_effect=lambda tool: tool == "browser") + return tool_server + + @pytest.fixture + def mock_tool_server_with_all_tools(self): + """Create a mock ToolServer with all builtin tools.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock( + side_effect=lambda tool: tool in ["browser", "python", "container"] + ) + return tool_server + + # --- Tests from structured output PR --- + + def test_prepare_structured_tag_with_all_tools( + self, reasoning_parser, mock_tool_server_with_all_tools + ): + """Test prepare_structured_tag with all builtin tools.""" + result = reasoning_parser.prepare_structured_tag( + None, mock_tool_server_with_all_tools + ) + parsed = json.loads(result) + + # Should have analysis tag + tags for all 3 tools (2 tags each) + assert len(parsed["format"]["tags"]) == 7 # 1 analysis + 6 tool tags + + # Check all tool tags are present + tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]] + for tool in ["browser", "python", "container"]: + assert f"<|channel|>commentary to={tool}" in tag_begins + assert f"<|channel|>analysis to={tool}" in tag_begins + + def test_tag_with_builtin_funcs(self): + """Test tag_with_builtin_funcs function.""" + builtin_tools = ["browser", "python"] + result = tag_with_builtin_funcs(no_func_reasoning_tag, builtin_tools) + + assert result["type"] == "structural_tag" + # Should have original analysis tag + 2 tags per tool + assert len(result["format"]["tags"]) == 5 # 1 + 2*2 + + # Should have added commentary trigger + assert "<|channel|>commentary to=" in result["format"]["triggers"] + assert "<|channel|>analysis" in result["format"]["triggers"] + + def test_tag_structure_invariants(self): + """Test that the basic tag structure follows expected format.""" + assert no_func_reasoning_tag["type"] == "structural_tag" + assert 
no_func_reasoning_tag["format"]["type"] == "triggered_tags" + assert no_func_reasoning_tag["format"]["stop_after_first"] is False + + # Verify analysis tag structure + analysis_tag = no_func_reasoning_tag["format"]["tags"][0] + assert analysis_tag["begin"] == "<|channel|>analysis<|message|>" + assert analysis_tag["content"]["type"] == "any_text" + assert analysis_tag["end"] == "<|end|>" + + def test_json_serialization_valid( + self, reasoning_parser, mock_tool_server_with_all_tools + ): + """Test that all generated tags produce valid JSON.""" + # Test with no tool server + result1 = reasoning_parser.prepare_structured_tag(None, None) + json.loads(result1) # Should not raise + + # Test with empty tool server + empty_server = Mock(spec=ToolServer) + empty_server.has_tool = Mock(return_value=False) + result2 = reasoning_parser.prepare_structured_tag(None, empty_server) + json.loads(result2) # Should not raise + + # Test with tools + result3 = reasoning_parser.prepare_structured_tag( + None, mock_tool_server_with_all_tools + ) + json.loads(result3) # Should not raise + + @pytest.mark.parametrize("tool_name", ["browser", "python", "container"]) + def test_single_tool_integration(self, reasoning_parser, tool_name): + """Test integration with individual tools.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock(side_effect=lambda tool: tool == tool_name) + + result = reasoning_parser.prepare_structured_tag(None, tool_server) + parsed = json.loads(result) + + # Should have 1 analysis + 2 tool-specific tags + assert len(parsed["format"]["tags"]) == 3 + + tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]] + assert f"<|channel|>commentary to={tool_name}" in tag_begins + assert f"<|channel|>analysis to={tool_name}" in tag_begins + + # --- final_content_format tests --- + + def test_prepare_structured_tag_with_json_schema(self, reasoning_parser): + """Test that final channel tag has json_schema content constraint.""" + content_format = { + 
"type": "json_schema", + "json_schema": { + "type": "object", + "properties": {"name": {"type": "string"}}, + }, + } + result = reasoning_parser.prepare_structured_tag( + None, None, final_content_format=content_format + ) + parsed = json.loads(result) + + # Should have analysis tag + final channel tag + assert len(parsed["format"]["tags"]) == 2 + + # Verify analysis tag is unchanged + assert ( + parsed["format"]["tags"][0]["begin"] + == "<|channel|>analysis<|message|>" + ) + assert parsed["format"]["tags"][0]["content"]["type"] == "any_text" + + # Verify final channel tag has the json_schema content constraint + final_tag = parsed["format"]["tags"][1] + assert final_tag["begin"] == "<|channel|>final<|message|>" + assert final_tag["end"] == "<|end|>" + assert final_tag["content"] == content_format + + # Verify triggers include both analysis and final + assert "<|channel|>analysis" in parsed["format"]["triggers"] + assert "<|channel|>final" in parsed["format"]["triggers"] + + def test_prepare_structured_tag_original_tag_ignores_constraint( + self, reasoning_parser + ): + """When original_tag is provided, final_content_format is ignored.""" + original_tag = '{"custom": "tag"}' + content_format = { + "type": "json_schema", + "json_schema": {"type": "object"}, + } + result = reasoning_parser.prepare_structured_tag( + original_tag, None, final_content_format=content_format + ) + + # Should return the original tag unchanged + assert result == original_tag + + def test_prepare_structured_tag_with_tools_and_constraint( + self, reasoning_parser, mock_tool_server_with_browser + ): + """Test that tools and content constraint coexist in the tag.""" + content_format = { + "type": "json_schema", + "json_schema": {"type": "object"}, + } + result = reasoning_parser.prepare_structured_tag( + None, + mock_tool_server_with_browser, + final_content_format=content_format, + ) + parsed = json.loads(result) + + # Should have analysis + 2 browser tags + final channel tag = 4 + assert 
len(parsed["format"]["tags"]) == 4 + + tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]] + assert "<|channel|>analysis<|message|>" in tag_begins + assert "<|channel|>commentary to=browser" in tag_begins + assert "<|channel|>analysis to=browser" in tag_begins + assert "<|channel|>final<|message|>" in tag_begins + + # Verify final tag has the constraint + final_tag = next( + t + for t in parsed["format"]["tags"] + if t["begin"] == "<|channel|>final<|message|>" + ) + assert final_tag["content"] == content_format + + # --- Function tool and tool_choice tests --- + + def test_function_tool_tags_on_both_channels(self): + """Verify from_function_tool_to_tag creates commentary + analysis.""" + tags = from_function_tool_to_tag("get_weather", None) + + assert len(tags) == 2 + assert ( + tags[0]["begin"] + == "<|channel|>commentary to=functions.get_weather<|message|>" + ) + assert ( + tags[1]["begin"] + == "<|channel|>analysis to=functions.get_weather<|message|>" + ) + assert tags[0]["end"] == "<|end|>" + assert tags[1]["end"] == "<|end|>" + # No parameters -> any_text + assert tags[0]["content"] == {"type": "any_text"} + assert tags[1]["content"] == {"type": "any_text"} + + def test_function_tool_json_schema_content(self): + """Verify JSON schema from tool parameters is used as content.""" + schema = { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + } + tags = from_function_tool_to_tag("get_weather", schema) + + expected_content = {"type": "json_schema", "json_schema": schema} + assert tags[0]["content"] == expected_content + assert tags[1]["content"] == expected_content + + def test_tool_choice_required_blocks_final(self, reasoning_parser): + """No final trigger/tag when tool_choice=required (no tools).""" + result = reasoning_parser.prepare_structured_tag( + None, None, tool_choice="required" + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + assert not any("final" in 
b for b in tag_begins) + assert "<|channel|>final" not in parsed["format"]["triggers"] + + def test_tool_choice_required_with_function_tools(self, reasoning_parser): + """Tool tags present but no final when tool_choice=required.""" + fn_tools = [ + {"name": "get_weather", "parameters": {"type": "object"}}, + ] + result = reasoning_parser.prepare_structured_tag( + None, None, tool_choice="required", function_tools=fn_tools + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + # Function tool tags present + assert ( + "<|channel|>commentary to=functions.get_weather<|message|>" + in tag_begins + ) + assert ( + "<|channel|>analysis to=functions.get_weather<|message|>" + in tag_begins + ) + # No final + assert not any("final" in b for b in tag_begins) + assert "<|channel|>final" not in parsed["format"]["triggers"] + + def test_tool_choice_required_ignores_final_content_format( + self, reasoning_parser + ): + """Final is blocked even when final_content_format is provided.""" + content_fmt = { + "type": "json_schema", + "json_schema": {"type": "object"}, + } + fn_tools = [{"name": "my_func"}] + result = reasoning_parser.prepare_structured_tag( + None, + None, + final_content_format=content_fmt, + tool_choice="required", + function_tools=fn_tools, + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + assert not any("final" in b for b in tag_begins) + + def test_tool_choice_auto_with_tools_and_content_format( + self, reasoning_parser + ): + """Tool tags + final with content constraint for auto.""" + schema = {"type": "object", "properties": {"x": {"type": "integer"}}} + content_fmt = {"type": "json_schema", "json_schema": schema} + fn_tools = [{"name": "compute", "parameters": schema}] + + result = reasoning_parser.prepare_structured_tag( + None, + None, + final_content_format=content_fmt, + tool_choice="auto", + function_tools=fn_tools, + ) + parsed = json.loads(result) + + 
tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + # Function tool tags + assert ( + "<|channel|>commentary to=functions.compute<|message|>" + in tag_begins + ) + # Final tag with content constraint + assert "<|channel|>final<|message|>" in tag_begins + assert "<|channel|>final" in parsed["format"]["triggers"] + + final_tag = next( + t + for t in parsed["format"]["tags"] + if t["begin"] == "<|channel|>final<|message|>" + ) + assert final_tag["content"] == content_fmt + + def test_tool_choice_auto_with_tools_final_is_any_text( + self, reasoning_parser + ): + """auto + function tools but no content format -> final allows free text.""" + fn_tools = [{"name": "get_weather", "parameters": {"type": "object"}}] + result = reasoning_parser.prepare_structured_tag( + None, + None, + tool_choice="auto", + function_tools=fn_tools, + ) + parsed = json.loads(result) + + final_tag = next( + t + for t in parsed["format"]["tags"] + if t["begin"] == "<|channel|>final<|message|>" + ) + # No content format -> model can respond with any text + assert final_tag["content"] == {"type": "any_text"} + + def test_tool_choice_none_strips_tool_tags( + self, reasoning_parser, mock_tool_server_with_all_tools + ): + """No tool tags with tool_choice=none, analysis only.""" + fn_tools = [{"name": "get_weather"}] + result = reasoning_parser.prepare_structured_tag( + None, + mock_tool_server_with_all_tools, + tool_choice="none", + function_tools=fn_tools, + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + # Only analysis tag, no tool tags + assert tag_begins == ["<|channel|>analysis<|message|>"] + assert parsed["format"]["triggers"] == ["<|channel|>analysis"] + + def test_mixed_builtin_and_function_tools( + self, reasoning_parser, mock_tool_server_with_browser + ): + """Both builtin and function tool tags coexist.""" + fn_tools = [{"name": "get_weather"}] + result = reasoning_parser.prepare_structured_tag( + None, + 
mock_tool_server_with_browser, + tool_choice="auto", + function_tools=fn_tools, + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + # Builtin tool tags + assert "<|channel|>commentary to=browser" in tag_begins + assert "<|channel|>analysis to=browser" in tag_begins + # Function tool tags + assert ( + "<|channel|>commentary to=functions.get_weather<|message|>" + in tag_begins + ) + assert ( + "<|channel|>analysis to=functions.get_weather<|message|>" + in tag_begins + ) + # Final tag (auto + function tools) + assert "<|channel|>final<|message|>" in tag_begins + # General commentary trigger covers both builtin and function + assert "<|channel|>commentary to=" in parsed["format"]["triggers"] + + def test_named_tool_choice(self, reasoning_parser): + """Only the named tool's tags present, final blocked.""" + fn_tools = [ + {"name": "get_weather", "parameters": {"type": "object"}}, + {"name": "get_stock", "parameters": {"type": "object"}}, + ] + result = reasoning_parser.prepare_structured_tag( + None, + None, + tool_choice={"type": "function", "name": "get_weather"}, + function_tools=fn_tools, + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + # Only get_weather tags, not get_stock + assert ( + "<|channel|>commentary to=functions.get_weather<|message|>" + in tag_begins + ) + assert ( + "<|channel|>analysis to=functions.get_weather<|message|>" + in tag_begins + ) + assert not any("get_stock" in b for b in tag_begins) + # No final (named tool choice blocks final) + assert not any("final" in b for b in tag_begins) diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 9b4264456c51..e8202516985e 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime +import 
json from collections.abc import Iterable, Sequence from typing import Literal @@ -150,6 +151,25 @@ def get_developer_message( return dev_msg +def inject_response_formats( + instructions: str | None, + schema: dict, + format_name: str = "structured_output", +) -> str: + """Append a Harmony cookbook ``# Response Formats`` section. + + Per the cookbook, structured output schemas should appear in the + developer message under a ``# Response Formats`` heading so the + model knows what format to produce. This complements grammar + enforcement via structural tags. + """ + schema_json = json.dumps(schema, separators=(",", ":")) + section = f"\n\n# Response Formats\n\n## {format_name}\n\n{schema_json}" + if instructions: + return instructions + section + return section.lstrip("\n") + + def get_user_message(content: str) -> Message: return Message.from_role_and_content(Role.USER, content) diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py index a5f62bdd8c39..831fb1077243 100644 --- a/vllm/entrypoints/openai/responses/protocol.py +++ b/vllm/entrypoints/openai/responses/protocol.py @@ -346,6 +346,10 @@ def to_sampling_params( # --follow-imports skip hides the class definition but also hides # multiple third party conflicts, so best of both evils ) + elif response_format.type == "json_object": + structured_outputs = StructuredOutputsParams( + json_object=True # type: ignore[call-arg] + ) stop = self.stop if self.stop else [] if isinstance(stop, str): diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index b2428e97e20d..48d08a8784d8 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import json as json_mod import time import uuid from collections import deque @@ -65,6 +66,7 @@ get_system_message, 
get_user_message, has_custom_tools, + inject_response_formats, render_for_completion, ) from vllm.entrypoints.openai.responses.context import ( @@ -125,6 +127,54 @@ logger = init_logger(__name__) +def _extract_response_format_schema(request: ResponsesRequest) -> dict | None: + """Extract JSON schema from the request's structured output config.""" + if ( + request.text is not None + and request.text.format is not None + and request.text.format.type == "json_schema" + and request.text.format.schema_ is not None + ): + return request.text.format.schema_ + if ( + request.structured_outputs is not None + and request.structured_outputs.json is not None + ): + val = request.structured_outputs.json + if isinstance(val, str): + return json_mod.loads(val) + return val + return None + + +def _constraint_to_content_format( + params: StructuredOutputsParams, +) -> dict | None: + """Convert a StructuredOutputsParams constraint into an xgrammar + content format dict suitable for embedding in a structural tag.""" + if params.json is not None: + schema = ( + params.json + if isinstance(params.json, dict) + else json_mod.loads(params.json) + ) + return {"type": "json_schema", "json_schema": schema} + if params.json_object: + return {"type": "json_schema", "json_schema": {"type": "object"}} + if params.regex is not None: + return {"type": "regex", "pattern": params.regex} + if params.grammar is not None: + return {"type": "grammar", "grammar": params.grammar} + if params.choice is not None: + return { + "type": "or", + "elements": [ + {"type": "const_string", "value": c} for c in params.choice + ], + } + return None + + def _extract_allowed_tools_from_mcp_requests( tools: list[Tool], ) -> dict[str, list[str] | None]: @@ -470,21 +520,88 @@ async def create_responses( else: context = SimpleContext() + # Extract function tools for the reasoning parser + function_tools_for_parser = None + if request.tools: + ft = [ + { + "name": t.name, + **( + {"parameters": t.parameters} + if 
t.parameters + else {} + ), + } + for t in request.tools + if getattr(t, "type", None) == "function" + ] + if ft: + function_tools_for_parser = ft + if self.parser and self.parser.reasoning_parser_cls is not None: reasoning_parser = self.parser.reasoning_parser_cls(tokenizer) - if ( - isinstance( - struct_out := sampling_params.structured_outputs, - StructuredOutputsParams, - ) - and struct_out.all_non_structural_tag_constraints_none() - ): - sampling_params.structured_outputs = replace( - struct_out, - structural_tag=reasoning_parser.prepare_structured_tag( - struct_out.structural_tag, self.tool_server - ), + struct_out = sampling_params.structured_outputs + + if isinstance(struct_out, StructuredOutputsParams): + if struct_out.all_non_structural_tag_constraints_none(): + # No content constraint — just apply reasoning + # channel tags + tool_choice + function tools + sampling_params.structured_outputs = replace( + struct_out, + structural_tag=( + reasoning_parser.prepare_structured_tag( + struct_out.structural_tag, + self.tool_server, + tool_choice=request.tool_choice, + function_tools=function_tools_for_parser, + ) + ), + ) + else: + # Content constraint present (json, regex, + # grammar, choice, json_object). Embed it in the + # final channel tag within the structural tag. + content_fmt = _constraint_to_content_format( + struct_out + ) + if content_fmt is not None: + structural_tag = ( + reasoning_parser.prepare_structured_tag( + None, + self.tool_server, + final_content_format=content_fmt, + tool_choice=request.tool_choice, + function_tools=function_tools_for_parser, + ) + ) + if structural_tag is not None: + # Clear content constraints, set + # structural_tag, but preserve options + # like disable_any_whitespace. 
+ sampling_params.structured_outputs = replace( + struct_out, + json=None, + regex=None, + choice=None, + grammar=None, + json_object=None, + structural_tag=structural_tag, + ) + elif struct_out is None: + # No structured output requested, but still need + # reasoning channel tags + tool_choice + function tools + tag = reasoning_parser.prepare_structured_tag( + None, + self.tool_server, + tool_choice=request.tool_choice, + function_tools=function_tools_for_parser, ) + if tag is not None: + sampling_params.structured_outputs = ( + StructuredOutputsParams( + structural_tag=tag # type: ignore[call-arg] + ) + ) generator = self._generate_with_builtin_tools( request_id=request.request_id, engine_prompt=engine_prompt, @@ -712,11 +829,6 @@ def _make_request_with_harmony( request: ResponsesRequest, prev_response: ResponsesResponse | None, ): - if request.tool_choice != "auto": - raise NotImplementedError( - "Only 'auto' tool_choice is supported in response API with Harmony" - ) - messages = self._construct_input_messages_with_harmony(request, prev_response) prompt_token_ids = render_for_completion(messages) engine_prompt = token_inputs(prompt_token_ids) @@ -1143,9 +1255,24 @@ def _construct_input_messages_with_harmony( request, with_custom_tools, tool_types ) messages.append(sys_msg) - if with_custom_tools: + + # Determine if we need a developer message. + # Per Harmony cookbook: developer message holds instructions, + # function tools, AND response format schemas. 
+ response_format_schema = _extract_response_format_schema(request) + needs_dev_msg = ( + with_custom_tools or response_format_schema is not None + ) + + if needs_dev_msg: + dev_instructions = request.instructions + if response_format_schema is not None: + dev_instructions = inject_response_formats( + dev_instructions, response_format_schema + ) dev_msg = get_developer_message( - instructions=request.instructions, tools=request.tools + instructions=dev_instructions, + tools=request.tools if with_custom_tools else None, ) messages.append(dev_msg) messages += construct_harmony_previous_input_messages(request) @@ -1985,7 +2112,7 @@ def _increment_sequence_number_and_return( output=[], status="in_progress", usage=None, - ).model_dump() + ) yield _increment_sequence_number_and_return( ResponseCreatedEvent( type="response.created", diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 5271a307075e..ee3acd92b675 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -154,10 +154,24 @@ def prepare_structured_tag( self, original_tag: str | None, tool_server: ToolServer | None, + final_content_format: dict | None = None, + tool_choice: str | dict | None = None, + function_tools: list[dict] | None = None, ) -> str | None: """ - Instance method that is implemented for preparing the structured tag - Otherwise, None is returned + Instance method that is implemented for preparing the structured tag. + Otherwise, None is returned. + + Args: + original_tag: An existing structural tag string, if any. + tool_server: The tool server for builtin tool support. + final_content_format: Optional xgrammar content format dict + (e.g. json_schema, regex) to embed in the <|channel|>final + tag for constraining the model's final output region. + tool_choice: The tool_choice setting from the request + ("auto", "required", "none", or a named tool dict). 
+ function_tools: List of function tool dicts with "name" and + optional "parameters" keys. """ return None diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 89299d4b12b8..016864ac6361 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import json from collections.abc import Sequence from typing import TYPE_CHECKING @@ -52,8 +53,6 @@ def from_builtin_tool_to_tag(tool: str) -> list[dict]: def tag_with_builtin_funcs(no_func_reasoning_tag, builtin_tool_list: list[str]) -> dict: - import copy - new_tag = copy.deepcopy(no_func_reasoning_tag) new_tag["format"]["triggers"].append("<|channel|>commentary to=") @@ -62,6 +61,45 @@ def tag_with_builtin_funcs(no_func_reasoning_tag, builtin_tool_list: list[str]) return new_tag +def from_function_tool_to_tag(name: str, parameters: dict | None) -> list[dict]: + content = ( + {"type": "json_schema", "json_schema": parameters} + if parameters + else {"type": "any_text"} + ) + return [ + { + "begin": f"<|channel|>commentary to=functions.{name}<|message|>", + "content": content, + "end": "<|end|>", + }, + { + "begin": f"<|channel|>analysis to=functions.{name}<|message|>", + "content": content, + "end": "<|end|>", + }, + ] + + +def tag_with_function_tools( + base_tag: dict, function_tools: list[dict] +) -> dict: + new_tag = copy.deepcopy(base_tag) + + # Add commentary trigger for function tools if not already covered + # by the general commentary trigger (added by builtin tools). + if "<|channel|>commentary to=" not in new_tag["format"]["triggers"]: + new_tag["format"]["triggers"].append( + "<|channel|>commentary to=functions." 
+ ) + + for tool in function_tools: + new_tag["format"]["tags"].extend( + from_function_tool_to_tag(tool["name"], tool.get("parameters")) + ) + return new_tag + + class GptOssReasoningParser(ReasoningParser): """ Reasoning parser for GptOss model. @@ -158,30 +196,65 @@ def extract_reasoning( # This function prepares the structural tag to format reasoning output def prepare_structured_tag( - self, original_tag: str | None, tool_server: ToolServer | None + self, + original_tag: str | None, + tool_server: ToolServer | None, + final_content_format: dict | None = None, + tool_choice: str | dict | None = None, + function_tools: list[dict] | None = None, ) -> str | None: - if original_tag is None: - if tool_server is None: - return json.dumps(no_func_reasoning_tag) - else: - builtin_tool_list: list[str] = [] - if tool_server.has_tool("browser"): - builtin_tool_list.append("browser") - if tool_server.has_tool("python"): - builtin_tool_list.append("python") - if tool_server.has_tool("container"): - builtin_tool_list.append("container") - - if len(builtin_tool_list) > 0: - logger.info("Builtin_tool_list: %s", builtin_tool_list) - func_tag = json.dumps( - tag_with_builtin_funcs(no_func_reasoning_tag, builtin_tool_list) - ) - else: - logger.info("Builtin_tool_list is empty") - func_tag = json.dumps(no_func_reasoning_tag) - - return func_tag - else: + if original_tag is not None: # There is potential risk for appending the tag to the original tag return original_tag + + # Build base tag with analysis channel + base_tag = copy.deepcopy(no_func_reasoning_tag) + + # Add builtin tool tags (unless tool_choice is "none") + if tool_choice != "none" and tool_server is not None: + builtin_tool_list: list[str] = [] + if tool_server.has_tool("browser"): + builtin_tool_list.append("browser") + if tool_server.has_tool("python"): + builtin_tool_list.append("python") + if tool_server.has_tool("container"): + builtin_tool_list.append("container") + + if builtin_tool_list: + 
logger.info("Builtin_tool_list: %s", builtin_tool_list) + base_tag = tag_with_builtin_funcs(base_tag, builtin_tool_list) + else: + logger.info("Builtin_tool_list is empty") + + # Add function tool tags (unless tool_choice is "none") + effective_function_tools = None + if tool_choice != "none" and function_tools: + effective_function_tools = function_tools + # If named tool choice, filter to only the named tool + if isinstance(tool_choice, dict): + named = tool_choice.get("name") + effective_function_tools = [ + t for t in function_tools if t["name"] == named + ] + if effective_function_tools: + base_tag = tag_with_function_tools( + base_tag, effective_function_tools + ) + + # Add final channel tag unless tool_choice blocks it + if tool_choice != "required" and not isinstance(tool_choice, dict): + has_function_tools = bool(effective_function_tools) + if has_function_tools or final_content_format: + final_content = ( + final_content_format + if final_content_format + else {"type": "any_text"} + ) + base_tag["format"]["tags"].append({ + "begin": "<|channel|>final<|message|>", + "content": final_content, + "end": "<|end|>", + }) + base_tag["format"]["triggers"].append("<|channel|>final") + + return json.dumps(base_tag) From b1a04b3037f3525dc34b059e7081d8c3b9588dfb Mon Sep 17 00:00:00 2001 From: Will Deines Date: Tue, 17 Mar 2026 15:23:45 -0400 Subject: [PATCH 02/10] fix(responses): suppress tool descriptions in prompt when tool_choice=none When tool_choice=none, the structural tag grammar correctly blocks tool-calling channels, but the system/developer messages still described the tools to the model. The model would then attempt tool calls that leaked through the output parser. Introduce a tools_visible flag that is false when tool_choice=none, suppressing both the commentary channel in the system message and tool descriptions in the developer message. 
Signed-off-by: Will Deines --- vllm/entrypoints/openai/responses/serving.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 48d08a8784d8..672f4028b00a 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -1251,8 +1251,17 @@ def _construct_input_messages_with_harmony( tool_types = extract_tool_types(request.tools) with_custom_tools = has_custom_tools(tool_types) + # When tool_choice=none, suppress tool awareness in the + # prompt so the model doesn't attempt tool calls. The + # structural tag grammar already blocks tool channels, but + # omitting tools from the system/developer messages + # prevents the model from even reasoning about calling them. + tools_visible = ( + with_custom_tools and request.tool_choice != "none" + ) + sys_msg = self._construct_harmony_system_input_message( - request, with_custom_tools, tool_types + request, tools_visible, tool_types ) messages.append(sys_msg) @@ -1261,7 +1270,7 @@ def _construct_input_messages_with_harmony( # function tools, AND response format schemas. 
response_format_schema = _extract_response_format_schema(request) needs_dev_msg = ( - with_custom_tools or response_format_schema is not None + tools_visible or response_format_schema is not None ) if needs_dev_msg: @@ -1272,7 +1281,7 @@ def _construct_input_messages_with_harmony( ) dev_msg = get_developer_message( instructions=dev_instructions, - tools=request.tools if with_custom_tools else None, + tools=request.tools if tools_visible else None, ) messages.append(dev_msg) messages += construct_harmony_previous_input_messages(request) From 6fe76535744e1109f6ae3ecff2634f4bbeaad05d Mon Sep 17 00:00:00 2001 From: Will Deines Date: Tue, 17 Mar 2026 16:12:22 -0400 Subject: [PATCH 03/10] fix(responses): enforce tool_choice=required via at_least_one grammar constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit triggered_tags allows free text between triggers, so omitting <|channel|>final from the tag list was not sufficient to prevent the model from using it. xgrammar's at_least_one=True forces the grammar to begin with a triggered channel immediately, blocking <|channel|>final and EOS at the token level. With this flag set on tool_choice=required or a named tool, only 2 tokens are allowed at generation start (the <|channel|> special token), and after that only analysis/commentary continuations — not "final" — are valid. Signed-off-by: Will Deines --- vllm/reasoning/gptoss_reasoning_parser.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 016864ac6361..d7cea68d13b7 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -257,4 +257,10 @@ def prepare_structured_tag( }) base_tag["format"]["triggers"].append("<|channel|>final") + # For tool_choice=required or named tool, force at least one triggered + # tag. 
This blocks <|channel|>final and EOS at the grammar level until + # the model has emitted at least one tool-call channel. + if tool_choice == "required" or isinstance(tool_choice, dict): + base_tag["format"]["at_least_one"] = True + return json.dumps(base_tag) From d2ecd4abfd0e6b0dfb9b4514fde3980bc16878db Mon Sep 17 00:00:00 2001 From: Will Deines Date: Wed, 18 Mar 2026 07:16:53 -0400 Subject: [PATCH 04/10] =?UTF-8?q?fix(responses):=20fix=20tool=5Fchoice=3Dr?= =?UTF-8?q?equired=20grammar=20=E2=80=94=20remove=20pure=20analysis=20from?= =?UTF-8?q?=20at=5Fleast=5Fone=20scope?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous approach (empty base with pre-set triggers) caused duplicate triggers after tag_with_function_tools ran, crashing xgrammar with 500. New approach: use the normal base tag, then filter out the pure analysis tag (<|channel|>analysis<|message|>) for required/named tool_choice before setting at_least_one=True. The analysis trigger is kept so analysis-to-functions tags remain reachable in triggered_tags_sub. Verified locally: grammar compiles, triggered_tags_first has only tool call options, after <|channel|> only analysis/commentary continuations allowed (11 tokens), final channel blocked. Signed-off-by: Will Deines --- vllm/reasoning/gptoss_reasoning_parser.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index d7cea68d13b7..2e1f522aae52 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -207,7 +207,6 @@ def prepare_structured_tag( # There is potential risk for appending the tag to the original tag return original_tag - # Build base tag with analysis channel base_tag = copy.deepcopy(no_func_reasoning_tag) # Add builtin tool tags (unless tool_choice is "none") @@ -261,6 +260,16 @@ def prepare_structured_tag( # tag. 
This blocks <|channel|>final and EOS at the grammar level until # the model has emitted at least one tool-call channel. if tool_choice == "required" or isinstance(tool_choice, dict): + # Remove the pure analysis tag (no recipient) from the tag list so + # that triggered_tags_first only contains function-call tags. The + # analysis trigger is kept so analysis-to-functions tags remain + # reachable in triggered_tags_sub. This prevents the model from + # satisfying at_least_one with a pure reasoning channel instead of + # an actual tool call. + base_tag["format"]["tags"] = [ + t for t in base_tag["format"]["tags"] + if t.get("begin") != "<|channel|>analysis<|message|>" + ] base_tag["format"]["at_least_one"] = True return json.dumps(base_tag) From a933921864dfd826ad5f38e8b649a71fc8090ea2 Mon Sep 17 00:00:00 2001 From: Will Deines Date: Wed, 18 Mar 2026 09:19:41 -0400 Subject: [PATCH 05/10] =?UTF-8?q?style:=20fix=20pre-commit=20lint=20?= =?UTF-8?q?=E2=80=94=20ruff=20formatting=20and=20mypy=20type=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ruff: reformat long lines in tests and serving.py - mypy: annotate base_tag as dict[str, Any] in gptoss_reasoning_parser to allow nested dict indexing through copy.deepcopy result - mypy: suppress arg-type on list(name) in abs_reasoning_parsers where TypeIs narrowing is not respected under --python-version 3.10 Signed-off-by: Will Deines --- .../openai/parser/test_harmony_utils.py | 17 ++---- .../responses/test_structured_output.py | 4 +- .../reasoning/test_gptoss_reasoning_parser.py | 53 ++++--------------- vllm/entrypoints/openai/responses/serving.py | 42 +++++---------- vllm/reasoning/abs_reasoning_parsers.py | 3 +- vllm/reasoning/gptoss_reasoning_parser.py | 31 +++++------ 6 files changed, 45 insertions(+), 105 deletions(-) diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index 
01ac12bd9408..f82eb1fdf47b 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -944,9 +944,7 @@ def test_none_instructions_creates_section(self): assert '{"type":"object"}' in result def test_custom_format_name(self): - result = inject_response_formats( - None, {"type": "object"}, format_name="order" - ) + result = inject_response_formats(None, {"type": "object"}, format_name="order") assert "## order" in result def test_compact_json_no_spaces(self): @@ -955,15 +953,8 @@ def test_compact_json_no_spaces(self): "properties": {"name": {"type": "string"}}, } result = inject_response_formats(None, schema) - assert ( - '{"type":"object","properties":{"name":{"type":"string"}}}' - in result - ) + assert '{"type":"object","properties":{"name":{"type":"string"}}}' in result def test_section_separated_by_blank_lines(self): - result = inject_response_formats( - "Instructions here.", {"type": "object"} - ) - assert ( - "\n\n# Response Formats\n\n## structured_output\n\n" in result - ) + result = inject_response_formats("Instructions here.", {"type": "object"}) + assert "\n\n# Response Formats\n\n## structured_output\n\n" in result diff --git a/tests/entrypoints/openai/responses/test_structured_output.py b/tests/entrypoints/openai/responses/test_structured_output.py index 16742708041c..1f155e15456e 100644 --- a/tests/entrypoints/openai/responses/test_structured_output.py +++ b/tests/entrypoints/openai/responses/test_structured_output.py @@ -141,9 +141,7 @@ def test_choice(self): def test_structural_tag_only_returns_none(self): """structural_tag is not a content constraint -- should return None.""" - params = StructuredOutputsParams( - structural_tag='{"type": "structural_tag"}' - ) + params = StructuredOutputsParams(structural_tag='{"type": "structural_tag"}') result = _constraint_to_content_format(params) assert result is None diff --git a/tests/reasoning/test_gptoss_reasoning_parser.py 
b/tests/reasoning/test_gptoss_reasoning_parser.py index 1329e05e269f..500b03861cea 100644 --- a/tests/reasoning/test_gptoss_reasoning_parser.py +++ b/tests/reasoning/test_gptoss_reasoning_parser.py @@ -15,7 +15,6 @@ from_function_tool_to_tag, no_func_reasoning_tag, tag_with_builtin_funcs, - tag_with_function_tools, ) REASONING_MODEL_NAME = "openai/gpt-oss-120b" @@ -410,10 +409,7 @@ def test_prepare_structured_tag_with_json_schema(self, reasoning_parser): assert len(parsed["format"]["tags"]) == 2 # Verify analysis tag is unchanged - assert ( - parsed["format"]["tags"][0]["begin"] - == "<|channel|>analysis<|message|>" - ) + assert parsed["format"]["tags"][0]["begin"] == "<|channel|>analysis<|message|>" assert parsed["format"]["tags"][0]["content"]["type"] == "any_text" # Verify final channel tag has the json_schema content constraint @@ -531,21 +527,13 @@ def test_tool_choice_required_with_function_tools(self, reasoning_parser): tag_begins = [t["begin"] for t in parsed["format"]["tags"]] # Function tool tags present - assert ( - "<|channel|>commentary to=functions.get_weather<|message|>" - in tag_begins - ) - assert ( - "<|channel|>analysis to=functions.get_weather<|message|>" - in tag_begins - ) + assert "<|channel|>commentary to=functions.get_weather<|message|>" in tag_begins + assert "<|channel|>analysis to=functions.get_weather<|message|>" in tag_begins # No final assert not any("final" in b for b in tag_begins) assert "<|channel|>final" not in parsed["format"]["triggers"] - def test_tool_choice_required_ignores_final_content_format( - self, reasoning_parser - ): + def test_tool_choice_required_ignores_final_content_format(self, reasoning_parser): """Final is blocked even when final_content_format is provided.""" content_fmt = { "type": "json_schema", @@ -564,9 +552,7 @@ def test_tool_choice_required_ignores_final_content_format( tag_begins = [t["begin"] for t in parsed["format"]["tags"]] assert not any("final" in b for b in tag_begins) - def 
test_tool_choice_auto_with_tools_and_content_format( - self, reasoning_parser - ): + def test_tool_choice_auto_with_tools_and_content_format(self, reasoning_parser): """Tool tags + final with content constraint for auto.""" schema = {"type": "object", "properties": {"x": {"type": "integer"}}} content_fmt = {"type": "json_schema", "json_schema": schema} @@ -583,10 +569,7 @@ def test_tool_choice_auto_with_tools_and_content_format( tag_begins = [t["begin"] for t in parsed["format"]["tags"]] # Function tool tags - assert ( - "<|channel|>commentary to=functions.compute<|message|>" - in tag_begins - ) + assert "<|channel|>commentary to=functions.compute<|message|>" in tag_begins # Final tag with content constraint assert "<|channel|>final<|message|>" in tag_begins assert "<|channel|>final" in parsed["format"]["triggers"] @@ -598,9 +581,7 @@ def test_tool_choice_auto_with_tools_and_content_format( ) assert final_tag["content"] == content_fmt - def test_tool_choice_auto_with_tools_final_is_any_text( - self, reasoning_parser - ): + def test_tool_choice_auto_with_tools_final_is_any_text(self, reasoning_parser): """auto + function tools but no content format -> final allows free text.""" fn_tools = [{"name": "get_weather", "parameters": {"type": "object"}}] result = reasoning_parser.prepare_structured_tag( @@ -655,14 +636,8 @@ def test_mixed_builtin_and_function_tools( assert "<|channel|>commentary to=browser" in tag_begins assert "<|channel|>analysis to=browser" in tag_begins # Function tool tags - assert ( - "<|channel|>commentary to=functions.get_weather<|message|>" - in tag_begins - ) - assert ( - "<|channel|>analysis to=functions.get_weather<|message|>" - in tag_begins - ) + assert "<|channel|>commentary to=functions.get_weather<|message|>" in tag_begins + assert "<|channel|>analysis to=functions.get_weather<|message|>" in tag_begins # Final tag (auto + function tools) assert "<|channel|>final<|message|>" in tag_begins # General commentary trigger covers both builtin and 
function @@ -684,14 +659,8 @@ def test_named_tool_choice(self, reasoning_parser): tag_begins = [t["begin"] for t in parsed["format"]["tags"]] # Only get_weather tags, not get_stock - assert ( - "<|channel|>commentary to=functions.get_weather<|message|>" - in tag_begins - ) - assert ( - "<|channel|>analysis to=functions.get_weather<|message|>" - in tag_begins - ) + assert "<|channel|>commentary to=functions.get_weather<|message|>" in tag_begins + assert "<|channel|>analysis to=functions.get_weather<|message|>" in tag_begins assert not any("get_stock" in b for b in tag_begins) # No final (named tool choice blocks final) assert not any("final" in b for b in tag_begins) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 672f4028b00a..6297cb6cbc33 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -168,9 +168,7 @@ def _constraint_to_content_format( if params.choice is not None: return { "type": "or", - "elements": [ - {"type": "const_string", "value": c} for c in params.choice - ], + "elements": [{"type": "const_string", "value": c} for c in params.choice], } return None @@ -526,11 +524,7 @@ async def create_responses( ft = [ { "name": t.name, - **( - {"parameters": t.parameters} - if t.parameters - else {} - ), + **({"parameters": t.parameters} if t.parameters else {}), } for t in request.tools if getattr(t, "type", None) == "function" @@ -561,18 +555,14 @@ async def create_responses( # Content constraint present (json, regex, # grammar, choice, json_object). Embed it in the # final channel tag within the structural tag. 
- content_fmt = _constraint_to_content_format( - struct_out - ) + content_fmt = _constraint_to_content_format(struct_out) if content_fmt is not None: - structural_tag = ( - reasoning_parser.prepare_structured_tag( - None, - self.tool_server, - final_content_format=content_fmt, - tool_choice=request.tool_choice, - function_tools=function_tools_for_parser, - ) + structural_tag = reasoning_parser.prepare_structured_tag( + None, + self.tool_server, + final_content_format=content_fmt, + tool_choice=request.tool_choice, + function_tools=function_tools_for_parser, ) if structural_tag is not None: # Clear content constraints, set @@ -597,10 +587,8 @@ async def create_responses( function_tools=function_tools_for_parser, ) if tag is not None: - sampling_params.structured_outputs = ( - StructuredOutputsParams( - structural_tag=tag # type: ignore[call-arg] - ) + sampling_params.structured_outputs = StructuredOutputsParams( + structural_tag=tag # type: ignore[call-arg] ) generator = self._generate_with_builtin_tools( request_id=request.request_id, @@ -1256,9 +1244,7 @@ def _construct_input_messages_with_harmony( # structural tag grammar already blocks tool channels, but # omitting tools from the system/developer messages # prevents the model from even reasoning about calling them. - tools_visible = ( - with_custom_tools and request.tool_choice != "none" - ) + tools_visible = with_custom_tools and request.tool_choice != "none" sys_msg = self._construct_harmony_system_input_message( request, tools_visible, tool_types @@ -1269,9 +1255,7 @@ def _construct_input_messages_with_harmony( # Per Harmony cookbook: developer message holds instructions, # function tools, AND response format schemas. 
response_format_schema = _extract_response_format_schema(request) - needs_dev_msg = ( - tools_visible or response_format_schema is not None - ) + needs_dev_msg = tools_visible or response_format_schema is not None if needs_dev_msg: dev_instructions = request.instructions diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index ee3acd92b675..8edc45bbb6d9 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -312,7 +312,8 @@ def _decorator(obj: type[ReasoningParser]) -> type[ReasoningParser]: if isinstance(name, str): names = [name] elif is_list_of(name, str): - names = name + assert name is not None + names = list(name) else: names = [class_name] diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 2e1f522aae52..9b117501b453 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -3,7 +3,7 @@ import copy import json from collections.abc import Sequence -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from transformers import PreTrainedTokenizerBase @@ -81,17 +81,13 @@ def from_function_tool_to_tag(name: str, parameters: dict | None) -> list[dict]: ] -def tag_with_function_tools( - base_tag: dict, function_tools: list[dict] -) -> dict: +def tag_with_function_tools(base_tag: dict, function_tools: list[dict]) -> dict: new_tag = copy.deepcopy(base_tag) # Add commentary trigger for function tools if not already covered # by the general commentary trigger (added by builtin tools). if "<|channel|>commentary to=" not in new_tag["format"]["triggers"]: - new_tag["format"]["triggers"].append( - "<|channel|>commentary to=functions." 
- ) + new_tag["format"]["triggers"].append("<|channel|>commentary to=functions.") for tool in function_tools: new_tag["format"]["tags"].extend( @@ -207,7 +203,7 @@ def prepare_structured_tag( # There is potential risk for appending the tag to the original tag return original_tag - base_tag = copy.deepcopy(no_func_reasoning_tag) + base_tag: dict[str, Any] = copy.deepcopy(no_func_reasoning_tag) # Add builtin tool tags (unless tool_choice is "none") if tool_choice != "none" and tool_server is not None: @@ -236,9 +232,7 @@ def prepare_structured_tag( t for t in function_tools if t["name"] == named ] if effective_function_tools: - base_tag = tag_with_function_tools( - base_tag, effective_function_tools - ) + base_tag = tag_with_function_tools(base_tag, effective_function_tools) # Add final channel tag unless tool_choice blocks it if tool_choice != "required" and not isinstance(tool_choice, dict): @@ -249,11 +243,13 @@ def prepare_structured_tag( if final_content_format else {"type": "any_text"} ) - base_tag["format"]["tags"].append({ - "begin": "<|channel|>final<|message|>", - "content": final_content, - "end": "<|end|>", - }) + base_tag["format"]["tags"].append( + { + "begin": "<|channel|>final<|message|>", + "content": final_content, + "end": "<|end|>", + } + ) base_tag["format"]["triggers"].append("<|channel|>final") # For tool_choice=required or named tool, force at least one triggered @@ -267,7 +263,8 @@ def prepare_structured_tag( # satisfying at_least_one with a pure reasoning channel instead of # an actual tool call. 
base_tag["format"]["tags"] = [ - t for t in base_tag["format"]["tags"] + t + for t in base_tag["format"]["tags"] if t.get("begin") != "<|channel|>analysis<|message|>" ] base_tag["format"]["at_least_one"] = True From d23f78ff4c53d4544961ad5be2be52f8e19316c3 Mon Sep 17 00:00:00 2001 From: Will Deines Date: Wed, 18 Mar 2026 14:20:45 -0400 Subject: [PATCH 06/10] =?UTF-8?q?fix:=20address=20P1=20review=20comments?= =?UTF-8?q?=20=E2=80=94=20three=20tool=5Fchoice=20bugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Preserve developer instructions when tool_choice="none": add `request.instructions is not None` to `needs_dev_msg` condition so instructions flow through even when tools are hidden. 2. Suppress builtin tool descriptions for tool_choice="none": rename `with_custom_tools` param to `tools_visible` in `_construct_harmony_system_input_message` and gate browser/python/ container descriptions on it. 3. Exclude builtin channels for named function tool_choice: skip builtin tool tags when `isinstance(tool_choice, dict)` so they cannot satisfy the `at_least_one` grammar constraint instead of the named function. 
Signed-off-by: Will Deines --- .../responses/test_tool_choice_harmony.py | 110 ++++++++++++++++++ .../reasoning/test_gptoss_reasoning_parser.py | 58 +++++++++ vllm/entrypoints/openai/responses/serving.py | 24 +++- vllm/reasoning/gptoss_reasoning_parser.py | 11 +- 4 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 tests/entrypoints/openai/responses/test_tool_choice_harmony.py diff --git a/tests/entrypoints/openai/responses/test_tool_choice_harmony.py b/tests/entrypoints/openai/responses/test_tool_choice_harmony.py new file mode 100644 index 000000000000..f873aeb5db24 --- /dev/null +++ b/tests/entrypoints/openai/responses/test_tool_choice_harmony.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for tool_choice handling in the Harmony-based Responses API. + +These tests verify that: +- Developer instructions are preserved when tool_choice="none" (Bug 1) +- Builtin tool descriptions are suppressed when tool_choice="none" (Bug 2) +""" + +from __future__ import annotations + +from unittest.mock import Mock + +from openai_harmony import Role, ToolNamespaceConfig + +from vllm.entrypoints.openai.parser.harmony_utils import ( + get_developer_message, + get_system_message, +) + + +class TestToolChoiceNoneInstructions: + """Bug 1: Developer instructions must not be dropped when + tool_choice='none' causes tools to be hidden.""" + + def test_developer_message_with_instructions_no_tools(self): + """get_developer_message must include instructions even when + tools=None (the condition that arises from tool_choice='none' + with no custom tools).""" + dev_msg = get_developer_message( + instructions="Be helpful and concise", tools=None + ) + assert dev_msg.author.role == Role.DEVELOPER + rendered = str(dev_msg) + assert "Be helpful and concise" in rendered + + def test_developer_message_with_instructions_and_tools(self): + """Baseline: instructions + tools both appear in 
the developer + message when tools are visible.""" + tool = Mock() + tool.type = "function" + tool.name = "get_weather" + tool.description = "Get weather" + tool.parameters = {"type": "object", "properties": {}} + + dev_msg = get_developer_message(instructions="Be helpful", tools=[tool]) + rendered = str(dev_msg) + assert "Be helpful" in rendered + assert "get_weather" in rendered + + def test_developer_message_no_instructions_no_tools(self): + """When neither instructions nor tools are provided, the + developer message is still valid (just empty content).""" + dev_msg = get_developer_message(instructions=None, tools=None) + assert dev_msg.author.role == Role.DEVELOPER + + +class TestToolChoiceNoneSystemMessage: + """Bug 2: Builtin tool descriptions in the system message must be + suppressed when tool_choice='none'.""" + + def test_system_message_no_tool_descriptions(self): + """When all tool descriptions are None (as happens when + tools_visible=False), the system message must not contain + tool descriptions.""" + sys_msg = get_system_message( + browser_description=None, + python_description=None, + container_description=None, + with_custom_tools=False, + ) + assert sys_msg.author.role == Role.SYSTEM + # tools should be None or empty when no descriptions are provided + assert not sys_msg.content[0].tools + + def test_system_message_with_browser_description(self): + """Baseline: when a ToolNamespaceConfig is provided, it appears + in the system message tools.""" + browser_ns = ToolNamespaceConfig.browser() + sys_msg = get_system_message( + browser_description=browser_ns, + python_description=None, + container_description=None, + with_custom_tools=False, + ) + assert sys_msg.author.role == Role.SYSTEM + assert "browser" in sys_msg.content[0].tools + + def test_system_message_with_python_description(self): + """Python tool description appears in system message when provided.""" + python_ns = ToolNamespaceConfig.python() + sys_msg = get_system_message( + 
browser_description=None, + python_description=python_ns, + container_description=None, + with_custom_tools=False, + ) + assert sys_msg.author.role == Role.SYSTEM + assert "python" in sys_msg.content[0].tools + + def test_none_descriptions_mean_no_tools(self): + """Passing None for all tool descriptions (as happens when + tools_visible=False) must result in no tools in the system msg.""" + sys_msg = get_system_message( + browser_description=None, + python_description=None, + container_description=None, + with_custom_tools=False, + ) + assert not sys_msg.content[0].tools diff --git a/tests/reasoning/test_gptoss_reasoning_parser.py b/tests/reasoning/test_gptoss_reasoning_parser.py index 500b03861cea..3b3671b71795 100644 --- a/tests/reasoning/test_gptoss_reasoning_parser.py +++ b/tests/reasoning/test_gptoss_reasoning_parser.py @@ -664,3 +664,61 @@ def test_named_tool_choice(self, reasoning_parser): assert not any("get_stock" in b for b in tag_begins) # No final (named tool choice blocks final) assert not any("final" in b for b in tag_begins) + + def test_named_tool_choice_excludes_builtins( + self, reasoning_parser, mock_tool_server_with_all_tools + ): + """Named function tool_choice must exclude builtin tool tags. 
+ + With at_least_one=True, builtin channels (browser/python/container) + could satisfy the grammar constraint instead of the named function.""" + fn_tools = [{"name": "get_weather", "parameters": {"type": "object"}}] + result = reasoning_parser.prepare_structured_tag( + None, + mock_tool_server_with_all_tools, + tool_choice={"type": "function", "name": "get_weather"}, + function_tools=fn_tools, + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + # Named function tags present + assert "<|channel|>commentary to=functions.get_weather<|message|>" in tag_begins + # No builtin tags + assert not any("to=browser" in b for b in tag_begins) + assert not any("to=python" in b for b in tag_begins) + assert not any("to=container" in b for b in tag_begins) + + def test_tool_choice_none_excludes_builtins( + self, reasoning_parser, mock_tool_server_with_all_tools + ): + """tool_choice='none' must suppress builtin tool tags even when + a tool_server with builtins is present.""" + result = reasoning_parser.prepare_structured_tag( + None, + mock_tool_server_with_all_tools, + tool_choice="none", + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + # Only the base analysis tag — no builtin channels + assert tag_begins == ["<|channel|>analysis<|message|>"] + + @pytest.mark.parametrize("tool_choice", ["auto", "required", None]) + def test_tool_choice_auto_required_include_builtins( + self, reasoning_parser, mock_tool_server_with_all_tools, tool_choice + ): + """tool_choice='auto'/'required'/None should include builtin tool + tags when a tool_server has builtins.""" + result = reasoning_parser.prepare_structured_tag( + None, + mock_tool_server_with_all_tools, + tool_choice=tool_choice, + ) + parsed = json.loads(result) + + tag_begins = [t["begin"] for t in parsed["format"]["tags"]] + assert "<|channel|>commentary to=browser" in tag_begins + assert "<|channel|>commentary to=python" in tag_begins 
+ assert "<|channel|>commentary to=container" in tag_begins diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 6297cb6cbc33..7f5729977392 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -1178,7 +1178,10 @@ def _extract_system_message_from_request( return system_msg def _construct_harmony_system_input_message( - self, request: ResponsesRequest, with_custom_tools: bool, tool_types: set[str] + self, + request: ResponsesRequest, + tools_visible: bool, + tool_types: set[str], ) -> OpenAIHarmonyMessage: model_identity = self._extract_system_message_from_request(request) @@ -1189,11 +1192,14 @@ def _construct_harmony_system_input_message( # Get filtered tool descriptions first. # If get_tool_description returns None (due to filtering), the tool is disabled. + # When tools_visible is False (e.g. tool_choice="none"), suppress all + # builtin tool descriptions so the model doesn't see them. 
browser_description = ( self.tool_server.get_tool_description( "browser", allowed_tools_map.get("web_search_preview") ) - if "web_search_preview" in tool_types + if tools_visible + and "web_search_preview" in tool_types and self.tool_server is not None and self.tool_server.has_tool("browser") else None @@ -1202,7 +1208,8 @@ def _construct_harmony_system_input_message( self.tool_server.get_tool_description( "python", allowed_tools_map.get("code_interpreter") ) - if "code_interpreter" in tool_types + if tools_visible + and "code_interpreter" in tool_types and self.tool_server is not None and self.tool_server.has_tool("python") else None @@ -1211,7 +1218,8 @@ def _construct_harmony_system_input_message( self.tool_server.get_tool_description( "container", allowed_tools_map.get("container") ) - if "container" in tool_types + if tools_visible + and "container" in tool_types and self.tool_server is not None and self.tool_server.has_tool("container") else None @@ -1224,7 +1232,7 @@ def _construct_harmony_system_input_message( python_description=python_description, container_description=container_description, instructions=request.instructions, - with_custom_tools=with_custom_tools, + with_custom_tools=tools_visible, ) return sys_msg @@ -1255,7 +1263,11 @@ def _construct_input_messages_with_harmony( # Per Harmony cookbook: developer message holds instructions, # function tools, AND response format schemas. 
response_format_schema = _extract_response_format_schema(request) - needs_dev_msg = tools_visible or response_format_schema is not None + needs_dev_msg = ( + tools_visible + or response_format_schema is not None + or request.instructions is not None + ) if needs_dev_msg: dev_instructions = request.instructions diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 9b117501b453..2d72930441bc 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -205,8 +205,15 @@ def prepare_structured_tag( base_tag: dict[str, Any] = copy.deepcopy(no_func_reasoning_tag) - # Add builtin tool tags (unless tool_choice is "none") - if tool_choice != "none" and tool_server is not None: + # Add builtin tool tags unless tool_choice is "none" or a named + # function dict — named forcing should only allow the specific + # function, not builtin channels that could satisfy at_least_one. + is_named_function_choice = isinstance(tool_choice, dict) + if ( + tool_choice != "none" + and not is_named_function_choice + and tool_server is not None + ): builtin_tool_list: list[str] = [] if tool_server.has_tool("browser"): builtin_tool_list.append("browser") From 0a5c9f9b534c99bf1d846c26f1bfebe8c595e3fa Mon Sep 17 00:00:00 2001 From: Will Deines Date: Thu, 19 Mar 2026 14:03:21 -0400 Subject: [PATCH 07/10] fix(responses): detect tool_choice violation when grammar enforcement fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When tool_choice="required" or a named tool is specified, the grammar layer (at_least_one + final channel removal) is the primary enforcement. But if the parser discards all tokens or the model produces text instead of a tool call, the response silently returns status="completed" with no function call — violating the contract. 
Add _check_tool_choice_violation() as a post-generation safety net that detects when no ResponseFunctionToolCall is present in output and forces status="incomplete" with a diagnostic warning log. Signed-off-by: Will Deines --- vllm/entrypoints/openai/responses/serving.py | 32 ++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 7f5729977392..f9e306777c8f 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -928,6 +928,7 @@ async def responses_full_generator( num_tool_output_tokens = 0 assert isinstance(context, (SimpleContext, HarmonyContext, ParsableContext)) + status = self._check_tool_choice_violation(request, output, status, context) num_prompt_tokens = context.num_prompt_tokens num_generated_tokens = context.num_output_tokens num_cached_tokens = context.num_cached_tokens @@ -1139,6 +1140,37 @@ def _make_response_output_items( ) ] + def _check_tool_choice_violation( + self, + request: ResponsesRequest, + output: list[ResponseOutputItem], + status: ResponseStatus, + context: ConversationContext, + ) -> ResponseStatus: + """Detect when tool_choice requires a function call but none was + produced. Returns ``"incomplete"`` if the constraint is violated, + otherwise returns *status* unchanged.""" + if request.tool_choice != "required" and not isinstance( + request.tool_choice, dict + ): + return status + has_function_call = any( + isinstance(item, ResponseFunctionToolCall) for item in output + ) + if not has_function_call: + logger.warning( + "tool_choice=%r but no function tool call in output " + "(output_items=%d, status=%s, finish_reason=%s, " + "output_tokens=%d). 
Grammar enforcement may have failed.", + request.tool_choice, + len(output), + status, + getattr(context, "finish_reason", None), + context.num_output_tokens, + ) + return "incomplete" + return status + def _make_response_output_items_with_harmony( self, context: HarmonyContext, From 19e4867308328f99bac0b6f9c3b252c8badc4443 Mon Sep 17 00:00:00 2001 From: Will Deines Date: Thu, 19 Mar 2026 23:13:42 -0400 Subject: [PATCH 08/10] fix: resolve mypy error for num_output_tokens on ConversationContext ABC Signed-off-by: Will Deines --- vllm/entrypoints/openai/responses/serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 4a15b65350b0..3c061ed60899 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -1159,7 +1159,7 @@ def _check_tool_choice_violation( len(output), status, getattr(context, "finish_reason", None), - context.num_output_tokens, + getattr(context, "num_output_tokens", -1), ) return "incomplete" return status From 16b2394f65c589d98ad9134b71bf29d02fe83ed4 Mon Sep 17 00:00:00 2001 From: Will Deines Date: Fri, 20 Mar 2026 13:20:32 -0400 Subject: [PATCH 09/10] feat: add structural tag support to Chat Completions for Harmony models Inject structural tags into sampling_params for the Chat Completions path, mirroring the existing Responses API implementation. Without this, tool_choice="auto" fails for Harmony models because the model stops after analysis reasoning and never emits the actual tool call. Also suppress tools in the prompt when tool_choice="none" for Harmony models, matching the Responses API behavior. 
Signed-off-by: Will Deines --- .../openai/chat_completion/serving.py | 87 ++++++++++++++++++- vllm/entrypoints/serve/render/serving.py | 6 +- 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 62a0192e7b7a..edd3f6d90362 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -14,6 +14,7 @@ from fastapi import Request from partial_json_parser.core.options import Allow +from vllm.config.utils import replace from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, @@ -61,6 +62,7 @@ get_streamable_parser_for_assistant, parse_chat_output, ) +from vllm.entrypoints.openai.responses.serving import _constraint_to_content_format from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import get_max_tokens, should_include_usage from vllm.inputs.data import ProcessorInputs @@ -70,7 +72,11 @@ from vllm.parser import ParserManager from vllm.reasoning import ReasoningParser from vllm.renderers import ChatParams -from vllm.sampling_params import BeamSearchParams, SamplingParams +from vllm.sampling_params import ( + BeamSearchParams, + SamplingParams, + StructuredOutputsParams, +) from vllm.tokenizers import TokenizerLike from vllm.tool_parsers import ToolParser from vllm.tool_parsers.mistral_tool_parser import MistralToolCall @@ -227,6 +233,33 @@ async def create_chat_completion( tokenizer, chat_template_kwargs=chat_template_kwargs, # type: ignore[call-arg] ) + + # Pre-compute function tools and tool_choice for structural tags + function_tools_for_parser: list[dict] | None = None + tool_choice_for_parser: str | dict | None = None + if self.use_harmony and reasoning_parser is not None: + if request.tools: + ft = [ + { + "name": t.function.name, + **( + {"parameters": 
t.function.parameters} + if t.function.parameters + else {} + ), + } + for t in request.tools + ] + if ft: + function_tools_for_parser = ft + + # Convert ChatCompletionNamedToolChoiceParam to dict format + tc = request.tool_choice + if isinstance(tc, ChatCompletionNamedToolChoiceParam): + tool_choice_for_parser = {"name": tc.function.name} + else: + tool_choice_for_parser = tc + result = await self.render_chat_request(request) if isinstance(result, ErrorResponse): return result @@ -281,6 +314,58 @@ async def create_chat_completion( self.default_sampling_params, ) + # Inject structural tags for Harmony models + if ( + self.use_harmony + and reasoning_parser is not None + and isinstance(sampling_params, SamplingParams) + ): + struct_out = sampling_params.structured_outputs + if isinstance(struct_out, StructuredOutputsParams): + if struct_out.all_non_structural_tag_constraints_none(): + sampling_params.structured_outputs = replace( + struct_out, + structural_tag=( + reasoning_parser.prepare_structured_tag( + struct_out.structural_tag, + None, # tool_server + tool_choice=tool_choice_for_parser, + function_tools=function_tools_for_parser, + ) + ), + ) + else: + content_fmt = _constraint_to_content_format(struct_out) + if content_fmt is not None: + structural_tag = reasoning_parser.prepare_structured_tag( + None, + None, # tool_server + final_content_format=content_fmt, + tool_choice=tool_choice_for_parser, + function_tools=function_tools_for_parser, + ) + if structural_tag is not None: + sampling_params.structured_outputs = replace( + struct_out, + json=None, + regex=None, + choice=None, + grammar=None, + json_object=None, + structural_tag=structural_tag, + ) + elif struct_out is None: + tag = reasoning_parser.prepare_structured_tag( + None, + None, # tool_server + tool_choice=tool_choice_for_parser, + function_tools=function_tools_for_parser, + ) + if tag is not None: + sampling_params.structured_outputs = StructuredOutputsParams( + structural_tag=tag # type: 
ignore[call-arg] + ) + self._log_inputs( sub_request_id, engine_prompt, diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index d1c5acad8c72..26f437e6485c 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -245,8 +245,10 @@ async def render_chat( tool_parser=tool_parser, ) else: - # For GPT-OSS. - should_include_tools = tool_dicts is not None + # For GPT-OSS: always suppress tools when tool_choice="none" + should_include_tools = ( + tool_dicts is not None and request.tool_choice != "none" + ) conversation, engine_prompts = self._make_request_with_harmony( request, should_include_tools ) From 1b63a00f39cd0942be584bb9069c9e2d94611091 Mon Sep 17 00:00:00 2001 From: Will Deines Date: Fri, 20 Mar 2026 13:38:12 -0400 Subject: [PATCH 10/10] fix(test): update test_function_calling_required for tool_choice support tool_choice="required" now works (added in feat/responses-tool-choice-required), so the test should expect a successful function call instead of InternalServerError. 
Signed-off-by: Will Deines --- .../openai/responses/test_harmony.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/entrypoints/openai/responses/test_harmony.py b/tests/entrypoints/openai/responses/test_harmony.py index 74f3360df45f..b6f1ab71f4be 100644 --- a/tests/entrypoints/openai/responses/test_harmony.py +++ b/tests/entrypoints/openai/responses/test_harmony.py @@ -13,7 +13,7 @@ import pytest import pytest_asyncio import requests -from openai import InternalServerError, NotFoundError, OpenAI +from openai import NotFoundError, OpenAI from openai_harmony import Message from tests.utils import RemoteOpenAIServer @@ -697,15 +697,22 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling_required(client: OpenAI, model_name: str): + """tool_choice='required' must force at least one function call.""" tools = [GET_WEATHER_SCHEMA] - with pytest.raises(InternalServerError): - await client.responses.create( - model=model_name, - input="What's the weather like in Paris today?", - tools=tools, - tool_choice="required", - ) + response = await retry_for_tool_call( + client, + model=model_name, + expected_tool_type="function_call", + input="What's the weather like in Paris today?", + tools=tools, + tool_choice="required", + ) + tool_calls = [item for item in response.output if item.type == "function_call"] + assert tool_calls, ( + f"tool_choice='required' should force a function call, " + f"got: {[item.type for item in response.output]}" + ) @pytest.mark.asyncio