diff --git a/.gitignore b/.gitignore index d62536cfb91d..8b38903cd999 100644 --- a/.gitignore +++ b/.gitignore @@ -239,5 +239,5 @@ vllm/grpc/vllm_engine_pb2.py vllm/grpc/vllm_engine_pb2_grpc.py vllm/grpc/vllm_engine_pb2.pyi -# Ignore generated cpu headers +# Ignore generated cpu headers csrc/cpu/cpu_attn_dispatch_generated.h diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index 21b53dff1507..6b8e9c26e365 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -1,15 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import patch + import pytest from openai_harmony import Message, Role from tests.entrypoints.openai.utils import verify_harmony_messages from vllm.entrypoints.openai.parser.harmony_utils import ( auto_drop_analysis_messages, + get_developer_message, get_encoding, get_system_message, has_custom_tools, + inject_response_formats, parse_chat_input_to_harmony_message, parse_chat_output, ) @@ -928,3 +932,99 @@ def test_reasoning_with_empty_content_returns_none(self): msg = response_input_to_harmony(item, prev_responses=[]) assert msg is None + + +class TestInjectResponseFormats: + def test_appends_to_existing_instructions(self): + result = inject_response_formats("You are helpful.", {"type": "object"}) + assert result.startswith("You are helpful.") + assert "# Response Formats" in result + assert '{"type":"object"}' in result + + def test_none_instructions_creates_section(self): + result = inject_response_formats(None, {"type": "object"}) + assert result.startswith("# Response Formats") + assert '{"type":"object"}' in result + + def test_custom_format_name(self): + result = inject_response_formats(None, {"type": "object"}, format_name="order") + assert "## order" in result + + def test_compact_json_no_spaces(self): + schema = {"type": "object", "properties": {"name": {"type": "string"}}} + result = inject_response_formats(None, schema) + assert '{"type":"object","properties":{"name":{"type":"string"}}}' in result + + def test_section_separated_by_blank_lines(self): + result = inject_response_formats("Instructions here.", {"type": "object"}) + assert "\n\n# Response Formats\n\n## structured_output\n\n" in result + + +class TestGetDeveloperMessageResponseFormats: + """Tests for response_format_section parameter in get_developer_message.""" + + ENV_VAR = ( + "vllm.entrypoints.openai.parser.harmony_utils" + ".envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS" + ) + + def _extract_instructions_text(self, dev_msg: Message) -> str | None: + """Extract the raw text from a developer message's instructions.""" + for content_item in dev_msg.content: + instructions = getattr(content_item, "instructions", None) + if instructions is not None: + return instructions + return None + + def test_response_format_preserved_with_system_instructions(self): + """When VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS is True, + user instructions should be dropped but response format schema + should still appear in the developer message.""" + schema_section = "# Response Formats\n\n## structured_output\n\n{}" + with patch(self.ENV_VAR, True): + dev_msg = get_developer_message( + instructions="Be concise.", + response_format_section=schema_section, + ) + text = self._extract_instructions_text(dev_msg) + assert text is not None + assert "# Response Formats" in text + # User instructions should NOT be present + assert "Be concise." not in text + + def test_response_format_and_instructions_without_system_instructions(self): + """When VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS is False, + both instructions and response format schema should appear.""" + schema_section = "# Response Formats\n\n## structured_output\n\n{}" + with patch(self.ENV_VAR, False): + dev_msg = get_developer_message( + instructions="Be concise.", + response_format_section=schema_section, + ) + text = self._extract_instructions_text(dev_msg) + assert text is not None + assert "Be concise." in text + assert "# Response Formats" in text + + def test_response_format_only_no_instructions(self): + """With instructions=None, only the response format section appears.""" + schema_section = "# Response Formats\n\n## structured_output\n\n{}" + with patch(self.ENV_VAR, False): + dev_msg = get_developer_message( + instructions=None, + response_format_section=schema_section, + ) + text = self._extract_instructions_text(dev_msg) + assert text is not None + assert "# Response Formats" in text + + def test_backward_compat_no_response_format(self): + """Without response_format_section, behavior matches the original.""" + with patch(self.ENV_VAR, False): + dev_msg = get_developer_message( + instructions="Be concise.", + ) + text = self._extract_instructions_text(dev_msg) + assert text is not None + assert "Be concise." in text + assert "# Response Formats" not in text diff --git a/tests/entrypoints/openai/responses/test_response_formats.py b/tests/entrypoints/openai/responses/test_response_formats.py new file mode 100644 index 000000000000..9c688f9bbd4b --- /dev/null +++ b/tests/entrypoints/openai/responses/test_response_formats.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Tests for response format schema extraction and developer message injection. + +These tests verify that structured output schemas are correctly extracted from +ResponsesRequest and injected into the Harmony developer message per the +Harmony cookbook specification. +""" + +from openai.types.responses.response_format_text_json_schema_config import ( + ResponseFormatTextJSONSchemaConfig, +) + +from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponseTextConfig, +) +from vllm.entrypoints.openai.responses.serving import ( + _extract_response_format_schema, +) +from vllm.sampling_params import StructuredOutputsParams + + +def _make_json_schema_text_config(schema: dict) -> ResponseTextConfig: + text_config = ResponseTextConfig() + text_config.format = ResponseFormatTextJSONSchemaConfig( + type="json_schema", + name="test_schema", + schema=schema, + ) + return text_config + + +class TestExtractResponseFormatSchema: + def test_extracts_from_text_format_json_schema(self): + schema = {"type": "object", "properties": {"name": {"type": "string"}}} + request = ResponsesRequest( + model="test-model", + input="test", + text=_make_json_schema_text_config(schema), + ) + result = _extract_response_format_schema(request) + assert result == schema + + def test_extracts_from_structured_outputs_json(self): + schema = {"type": "object", "properties": {"id": {"type": "integer"}}} + request = ResponsesRequest( + model="test-model", + input="test", + structured_outputs=StructuredOutputsParams(json=schema), + ) + result = _extract_response_format_schema(request) + assert result == schema + + def test_returns_none_for_text_format(self): + request = ResponsesRequest( + model="test-model", + input="test", + text=ResponseTextConfig(format={"type": "text"}), + ) + result = _extract_response_format_schema(request) + assert result is None + + def test_returns_none_for_no_format(self): + request = ResponsesRequest( + model="test-model", + input="test", + ) + result = _extract_response_format_schema(request) + assert result is None + + def test_text_format_takes_precedence(self): + """text.format.json_schema is checked before structured_outputs.""" + text_schema = { + "type": "object", + "properties": {"a": {"type": "string"}}, + } + so_schema = { + "type": "object", + "properties": {"b": {"type": "string"}}, + } + request = ResponsesRequest( + model="test-model", + input="test", + text=_make_json_schema_text_config(text_schema), + structured_outputs=StructuredOutputsParams(json=so_schema), + ) + result = _extract_response_format_schema(request) + assert result == text_schema diff --git a/tests/entrypoints/openai/responses/test_sampling_params.py b/tests/entrypoints/openai/responses/test_sampling_params.py index 87910271dd75..7509489ca3c4 100644 --- a/tests/entrypoints/openai/responses/test_sampling_params.py +++ b/tests/entrypoints/openai/responses/test_sampling_params.py @@ -132,6 +132,25 @@ def test_structured_outputs_passed_through(self): assert sampling_params.structured_outputs is not None assert sampling_params.structured_outputs.grammar == "root ::= 'hello'" + def test_json_object_format_produces_structured_outputs(self): + """Test that text.format.type=json_object creates StructuredOutputsParams.""" + from openai.types.shared.response_format_json_object import ( + ResponseFormatJSONObject, + ) + + text_config = ResponseTextConfig() + text_config.format = ResponseFormatJSONObject(type="json_object") + request = ResponsesRequest( + model="test-model", + input="test input", + text=text_config, + ) + + sampling_params = request.to_sampling_params(default_max_tokens=1000) + + assert sampling_params.structured_outputs is not None + assert sampling_params.structured_outputs.json_object is True + def test_structured_outputs_and_json_schema_conflict(self): """Test that specifying both structured_outputs and json_schema raises.""" structured_outputs = StructuredOutputsParams(grammar="root ::= 'hello'") diff --git a/tests/entrypoints/openai/responses/test_structured_output.py b/tests/entrypoints/openai/responses/test_structured_output.py index db8b87768e44..4d092b7e7154 100644 --- a/tests/entrypoints/openai/responses/test_structured_output.py +++ b/tests/entrypoints/openai/responses/test_structured_output.py @@ -1,11 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for structured output helpers in the Responses API.""" + import json import openai import pytest from pydantic import BaseModel +from vllm.entrypoints.openai.responses.serving import ( + _constraint_to_content_format, +) +from vllm.sampling_params import StructuredOutputsParams + @pytest.mark.asyncio async def test_structured_output(client: openai.AsyncOpenAI): @@ -76,3 +83,22 @@ class CalendarEvent(BaseModel): assert len(participants) == 2 assert participants[0] == "Alice" assert participants[1] == "Bob" + + +class TestConstraintToContentFormat: + """Test _constraint_to_content_format helper.""" + + def test_json_schema_string_is_parsed(self): + """JSON schema passed as a string gets json.loads'd into a dict.""" + schema = {"type": "object", "properties": {"age": {"type": "integer"}}} + params = StructuredOutputsParams(json=json.dumps(schema)) + result = _constraint_to_content_format(params) + + assert result == {"type": "json_schema", "json_schema": schema} + + def test_structural_tag_only_returns_none(self): + """structural_tag is not a content constraint — should return None.""" + params = StructuredOutputsParams(structural_tag='{"type": "structural_tag"}') + result = _constraint_to_content_format(params) + + assert result is None diff --git a/tests/v1/structured_output/test_gptoss_structural_tags.py b/tests/v1/structured_output/test_gptoss_structural_tags.py new file mode 100644 index 000000000000..aa0fbe15f119 --- /dev/null +++ b/tests/v1/structured_output/test_gptoss_structural_tags.py @@ -0,0 +1,245 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for GPT-OSS structural tag support in reasoning (PR #25515).""" + +import json +from unittest.mock import Mock + +import pytest + +from vllm.entrypoints.mcp.tool_server import ToolServer +from vllm.reasoning.gptoss_reasoning_parser import ( + GptOssReasoningParser, + from_builtin_tool_to_tag, + no_func_reasoning_tag, + tag_with_builtin_funcs, +) + + +class TestGptOssReasoningParser: + """Test cases for GptOssReasoningParser structural tag functionality.""" + + @pytest.fixture + def mock_tokenizer(self): + """Create a mock tokenizer for testing.""" + tokenizer = Mock() + tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5]) + tokenizer.vocab = {"<|end|>": 6} + tokenizer.get_vocab = Mock(return_value={"<|end|>": 6}) + return tokenizer + + @pytest.fixture + def reasoning_parser(self, mock_tokenizer): + """Create a GptOssReasoningParser instance.""" + return GptOssReasoningParser(mock_tokenizer) + + @pytest.fixture + def mock_tool_server_empty(self): + """Create a mock ToolServer with no tools.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock(return_value=False) + return tool_server + + @pytest.fixture + def mock_tool_server_with_browser(self): + """Create a mock ToolServer with browser tool.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock(side_effect=lambda tool: tool == "browser") + return tool_server + + @pytest.fixture + def mock_tool_server_with_all_tools(self): + """Create a mock ToolServer with all builtin tools.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock( + side_effect=lambda tool: tool in ["browser", "python", "container"] + ) + return tool_server + + def test_prepare_structured_tag_no_tool_server(self, reasoning_parser): + """Test prepare_structured_tag with no tool server.""" + result = reasoning_parser.prepare_structured_tag(None, None) + expected = json.dumps(no_func_reasoning_tag) + + assert result == expected + + # Verify the structure is correct + parsed = json.loads(result) + assert parsed["type"] == "structural_tag" + assert parsed["format"]["type"] == "triggered_tags" + assert len(parsed["format"]["tags"]) == 1 + assert parsed["format"]["tags"][0]["begin"] == "<|channel|>analysis<|message|>" + assert parsed["format"]["triggers"] == ["<|channel|>analysis"] + + def test_prepare_structured_tag_with_all_tools( + self, reasoning_parser, mock_tool_server_with_all_tools + ): + """Test prepare_structured_tag with all builtin tools.""" + result = reasoning_parser.prepare_structured_tag( + None, mock_tool_server_with_all_tools + ) + parsed = json.loads(result) + + # Should have analysis tag + tags for all 3 tools (2 tags each) + assert len(parsed["format"]["tags"]) == 7 # 1 analysis + 6 tool tags + + # Check all tool tags are present + tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]] + for tool in ["browser", "python", "container"]: + assert f"<|channel|>commentary to={tool}" in tag_begins + assert f"<|channel|>analysis to={tool}" in tag_begins + + def test_prepare_structured_tag_with_original_tag(self, reasoning_parser): + """Test prepare_structured_tag when original_tag is provided.""" + original_tag = '{"custom": "tag"}' + result = reasoning_parser.prepare_structured_tag(original_tag, None) + + # Should return the original tag unchanged + assert result == original_tag + + def test_from_builtin_tool_to_tag(self): + """Test from_builtin_tool_to_tag function.""" + tags = from_builtin_tool_to_tag("python") + + assert len(tags) == 2 + assert tags[0]["begin"] == "<|channel|>commentary to=python" + assert tags[0]["content"]["type"] == "any_text" + assert tags[0]["end"] == "<|end|>" + + assert tags[1]["begin"] == "<|channel|>analysis to=python" + assert tags[1]["content"]["type"] == "any_text" + assert tags[1]["end"] == "<|end|>" + + def test_tag_with_builtin_funcs(self): + """Test tag_with_builtin_funcs function.""" + builtin_tools = ["browser", "python"] + result = tag_with_builtin_funcs(no_func_reasoning_tag, builtin_tools) + + assert result["type"] == "structural_tag" + # Should have original analysis tag + 2 tags per tool + assert len(result["format"]["tags"]) == 5 # 1 + 2*2 + + # Should have added commentary trigger + assert "<|channel|>commentary to=" in result["format"]["triggers"] + assert "<|channel|>analysis" in result["format"]["triggers"] + + def test_tag_structure_invariants(self): + """Test that the basic tag structure follows expected format.""" + # Test the base no_func_reasoning_tag structure + assert no_func_reasoning_tag["type"] == "structural_tag" + assert no_func_reasoning_tag["format"]["type"] == "triggered_tags" + assert no_func_reasoning_tag["format"]["stop_after_first"] is False + + # Verify analysis tag structure + analysis_tag = no_func_reasoning_tag["format"]["tags"][0] + assert analysis_tag["begin"] == "<|channel|>analysis<|message|>" + assert analysis_tag["content"]["type"] == "any_text" + assert analysis_tag["end"] == "<|end|>" + + def test_json_serialization_valid( + self, reasoning_parser, mock_tool_server_with_all_tools + ): + """Test that all generated tags produce valid JSON.""" + # Test with no tool server + result1 = reasoning_parser.prepare_structured_tag(None, None) + json.loads(result1) # Should not raise + + # Test with empty tool server + empty_server = Mock(spec=ToolServer) + empty_server.has_tool = Mock(return_value=False) + result2 = reasoning_parser.prepare_structured_tag(None, empty_server) + json.loads(result2) # Should not raise + + # Test with tools + result3 = reasoning_parser.prepare_structured_tag( + None, mock_tool_server_with_all_tools + ) + json.loads(result3) # Should not raise + + @pytest.mark.parametrize("tool_name", ["browser", "python", "container"]) + def test_single_tool_integration(self, reasoning_parser, tool_name): + """Test integration with individual tools.""" + tool_server = Mock(spec=ToolServer) + tool_server.has_tool = Mock(side_effect=lambda tool: tool == tool_name) + + result = reasoning_parser.prepare_structured_tag(None, tool_server) + parsed = json.loads(result) + + # Should have 1 analysis + 2 tool-specific tags + assert len(parsed["format"]["tags"]) == 3 + + tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]] + assert f"<|channel|>commentary to={tool_name}" in tag_begins + assert f"<|channel|>analysis to={tool_name}" in tag_begins + + def test_prepare_structured_tag_with_json_schema(self, reasoning_parser): + """Test that final channel tag has json_schema content constraint.""" + content_format = { + "type": "json_schema", + "json_schema": { + "type": "object", + "properties": {"name": {"type": "string"}}, + }, + } + result = reasoning_parser.prepare_structured_tag( + None, None, final_content_format=content_format + ) + parsed = json.loads(result) + + # Should have analysis tag + final channel tag + assert len(parsed["format"]["tags"]) == 2 + + # Verify analysis tag is unchanged + assert parsed["format"]["tags"][0]["begin"] == "<|channel|>analysis<|message|>" + assert parsed["format"]["tags"][0]["content"]["type"] == "any_text" + + # Verify final channel tag has the json_schema content constraint + final_tag = parsed["format"]["tags"][1] + assert final_tag["begin"] == "<|channel|>final<|message|>" + assert final_tag["end"] == "<|end|>" + assert final_tag["content"] == content_format + + # Verify triggers include both analysis and final + assert "<|channel|>analysis" in parsed["format"]["triggers"] + assert "<|channel|>final" in parsed["format"]["triggers"] + + def test_prepare_structured_tag_original_tag_ignores_constraint( + self, reasoning_parser + ): + """When original_tag is provided, final_content_format is ignored.""" + original_tag = '{"custom": "tag"}' + content_format = {"type": "json_schema", "json_schema": {"type": "object"}} + result = reasoning_parser.prepare_structured_tag( + original_tag, None, final_content_format=content_format + ) + + # Should return the original tag unchanged + assert result == original_tag + + def test_prepare_structured_tag_with_tools_and_constraint( + self, reasoning_parser, mock_tool_server_with_browser + ): + """Test that tools and content constraint coexist in the tag.""" + content_format = {"type": "json_schema", "json_schema": {"type": "object"}} + result = reasoning_parser.prepare_structured_tag( + None, mock_tool_server_with_browser, final_content_format=content_format + ) + parsed = json.loads(result) + + # Should have analysis + 2 browser tags + final channel tag = 4 + assert len(parsed["format"]["tags"]) == 4 + + tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]] + assert "<|channel|>analysis<|message|>" in tag_begins + assert "<|channel|>commentary to=browser" in tag_begins + assert "<|channel|>analysis to=browser" in tag_begins + assert "<|channel|>final<|message|>" in tag_begins + + # Verify final tag has the constraint + final_tag = next( + t + for t in parsed["format"]["tags"] + if t["begin"] == "<|channel|>final<|message|>" + ) + assert final_tag["content"] == content_format diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 9b4264456c51..9004045f30e7 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime +import json from collections.abc import Iterable, Sequence from typing import Literal @@ -121,10 +122,16 @@ def create_tool_definition(tool: ChatCompletionToolsParam | Tool): def get_developer_message( instructions: str | None = None, tools: list[Tool | ChatCompletionToolsParam] | None = None, + response_format_section: str | None = None, ) -> Message: dev_msg_content = DeveloperContent.new() + parts: list[str] = [] if instructions is not None and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: - dev_msg_content = dev_msg_content.with_instructions(instructions) + parts.append(instructions) + if response_format_section is not None: + parts.append(response_format_section) + if parts: + dev_msg_content = dev_msg_content.with_instructions("\n\n".join(parts)) if tools is not None: function_tools: list[Tool | ChatCompletionToolsParam] = [] for tool in tools: @@ -150,6 +157,25 @@ def get_developer_message( return dev_msg +def inject_response_formats( + instructions: str | None, + schema: dict, + format_name: str = "structured_output", +) -> str: + """Append a Harmony cookbook ``# Response Formats`` section. + + Per the cookbook, structured output schemas should appear in the + developer message under a ``# Response Formats`` heading so the + model knows what format to produce. This complements grammar + enforcement via structural tags. + """ + schema_json = json.dumps(schema, separators=(",", ":")) + section = f"\n\n# Response Formats\n\n## {format_name}\n\n{schema_json}" + if instructions: + return instructions + section + return section.lstrip("\n") + + def get_user_message(content: str) -> Message: return Message.from_role_and_content(Role.USER, content) diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py index a5f62bdd8c39..831fb1077243 100644 --- a/vllm/entrypoints/openai/responses/protocol.py +++ b/vllm/entrypoints/openai/responses/protocol.py @@ -346,6 +346,10 @@ def to_sampling_params( # --follow-imports skip hides the class definition but also hides # multiple third party conflicts, so best of both evils ) + elif response_format.type == "json_object": + structured_outputs = StructuredOutputsParams( + json_object=True # type: ignore[call-arg] + ) stop = self.stop if self.stop else [] if isinstance(stop, str): diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 574282c4cdc6..064ec5ae74c1 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import json as json_mod import time import uuid from collections import deque @@ -66,6 +67,7 @@ get_system_message, get_user_message, has_custom_tools, + inject_response_formats, render_for_completion, ) from vllm.entrypoints.openai.responses.context import ( @@ -126,6 +128,26 @@ logger = init_logger(__name__) +def _extract_response_format_schema(request: ResponsesRequest) -> dict | None: + """Extract JSON schema from the request's structured output config.""" + if ( + request.text is not None + and request.text.format is not None + and request.text.format.type == "json_schema" + and request.text.format.schema_ is not None + ): + return request.text.format.schema_ + if ( + request.structured_outputs is not None + and request.structured_outputs.json is not None + ): + val = request.structured_outputs.json + if isinstance(val, str): + return json_mod.loads(val) + return val + return None + + def _extract_allowed_tools_from_mcp_requests( tools: list[Tool], ) -> dict[str, list[str] | None]: @@ -165,6 +187,32 @@ def _extract_allowed_tools_from_mcp_requests( return allowed_tools_map +def _constraint_to_content_format( + params: StructuredOutputsParams, +) -> dict | None: + """Convert a StructuredOutputsParams constraint into an xgrammar + content format dict suitable for embedding in a structural tag.""" + if params.json is not None: + schema = ( + params.json + if isinstance(params.json, dict) + else json_mod.loads(params.json) + ) + return {"type": "json_schema", "json_schema": schema} + if params.json_object: + return {"type": "json_schema", "json_schema": {"type": "object"}} + if params.regex is not None: + return {"type": "regex", "pattern": params.regex} + if params.grammar is not None: + return {"type": "grammar", "grammar": params.grammar} + if params.choice is not None: + return { + "type": "or", + "elements": [{"type": "const_string", "value": c} for c in params.choice], + } + return None + + class OpenAIServingResponses(OpenAIServing): def __init__( self, @@ -411,83 +459,126 @@ async def create_responses( else: assert len(builtin_tool_list) == 0 available_tools = [] - tokenizer = self.renderer.get_tokenizer() - - for engine_prompt in engine_prompts: - maybe_error = self._validate_generator_input(engine_prompt) - if maybe_error is not None: - return maybe_error - - default_max_tokens = get_max_tokens( - max_model_len, - request.max_output_tokens, - self._extract_prompt_len(engine_prompt), - self.default_sampling_params, - self.override_max_tokens, - ) + try: + tokenizer = self.renderer.get_tokenizer() - sampling_params = request.to_sampling_params( - default_max_tokens, self.default_sampling_params - ) + for engine_prompt in engine_prompts: + maybe_error = self._validate_generator_input(engine_prompt) + if maybe_error is not None: + return maybe_error - trace_headers = ( - None - if raw_request is None - else await self._get_trace_headers(raw_request.headers) - ) + default_max_tokens = get_max_tokens( + max_model_len, + request.max_output_tokens, + self._extract_prompt_len(engine_prompt), + self.default_sampling_params, + self.override_max_tokens, + ) - context: ConversationContext - if self.use_harmony: - if request.stream: - context = StreamingHarmonyContext(messages, available_tools) - else: - context = HarmonyContext(messages, available_tools) - else: - if envs.VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: - # This is a feature in development for parsing - # tokens during generation instead of at the end - context = ParsableContext( - response_messages=messages, - tokenizer=tokenizer, - reasoning_parser_cls=self.parser.reasoning_parser_cls - if self.parser - else None, - request=request, - tool_parser_cls=self.parser.tool_parser_cls - if self.parser - else None, - available_tools=available_tools, - chat_template=self.chat_template, - chat_template_content_format=self.chat_template_content_format, - ) - else: - context = SimpleContext() + sampling_params = request.to_sampling_params( + default_max_tokens, self.default_sampling_params + ) - if self.parser and self.parser.reasoning_parser_cls is not None: - reasoning_parser = self.parser.reasoning_parser_cls(tokenizer) - if ( - isinstance( - struct_out := sampling_params.structured_outputs, - StructuredOutputsParams, - ) - and struct_out.all_non_structural_tag_constraints_none() - ): - sampling_params.structured_outputs = replace( - struct_out, - structural_tag=reasoning_parser.prepare_structured_tag( - struct_out.structural_tag, self.tool_server - ), - ) - generator = self._generate_with_builtin_tools( - request_id=request.request_id, - engine_prompt=engine_prompt, - sampling_params=sampling_params, - context=context, - lora_request=lora_request, - priority=request.priority, - trace_headers=trace_headers, - ) - generators.append(generator) + trace_headers = ( + None + if raw_request is None + else await self._get_trace_headers(raw_request.headers) + ) + + context: ConversationContext + if self.use_harmony: + if request.stream: + context = StreamingHarmonyContext(messages, available_tools) + else: + context = HarmonyContext(messages, available_tools) + else: + if envs.VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: + # This is a feature in development for parsing + # tokens during generation instead of at the end + context = ParsableContext( + response_messages=messages, + tokenizer=tokenizer, + reasoning_parser_cls=self.parser.reasoning_parser_cls + if self.parser + else None, + request=request, + tool_parser_cls=self.parser.tool_parser_cls + if self.parser + else None, + available_tools=available_tools, + chat_template=self.chat_template, + chat_template_content_format=self.chat_template_content_format, + ) + else: + context = SimpleContext() + + if self.parser and self.parser.reasoning_parser_cls is not None: + reasoning_parser = self.parser.reasoning_parser_cls(tokenizer) + struct_out = sampling_params.structured_outputs + + if isinstance(struct_out, StructuredOutputsParams): + if struct_out.all_non_structural_tag_constraints_none(): + # No content constraint — just apply reasoning + # channel tags + sampling_params.structured_outputs = replace( + struct_out, + structural_tag=( + reasoning_parser.prepare_structured_tag( + struct_out.structural_tag, + self.tool_server, + ) + ), + ) + else: + # Content constraint present (json, regex, + # grammar, choice, json_object). Embed it in the + # final channel tag within the structural tag. + content_fmt = _constraint_to_content_format(struct_out) + if content_fmt is not None: + structural_tag = ( + reasoning_parser.prepare_structured_tag( + None, + self.tool_server, + final_content_format=content_fmt, + ) + ) + if structural_tag is not None: + # Clear content constraints, set + # structural_tag, but preserve options + # like disable_any_whitespace. + sampling_params.structured_outputs = replace( + struct_out, + json=None, + regex=None, + choice=None, + grammar=None, + json_object=None, + structural_tag=structural_tag, + ) + elif struct_out is None: + # No structured output requested, but still need + # reasoning channel tags + tag = reasoning_parser.prepare_structured_tag( + None, self.tool_server + ) + if tag is not None: + sampling_params.structured_outputs = ( + StructuredOutputsParams( + structural_tag=tag # type: ignore[call-arg] + ) + ) + generator = self._generate_with_builtin_tools( + request_id=request.request_id, + engine_prompt=engine_prompt, + sampling_params=sampling_params, + context=context, + lora_request=lora_request, + priority=request.priority, + trace_headers=trace_headers, + ) + generators.append(generator) + except ValueError as e: + return self.create_error_response(e) assert len(generators) == 1 (result_generator,) = generators @@ -1136,9 +1227,23 @@ def _construct_input_messages_with_harmony( request, with_custom_tools, tool_types ) messages.append(sys_msg) - if with_custom_tools: + + # Determine if we need a developer message. + # Per Harmony cookbook: developer message holds instructions, + # function tools, AND response format schemas. + response_format_schema = _extract_response_format_schema(request) + needs_dev_msg = with_custom_tools or response_format_schema is not None + + if needs_dev_msg: + response_format_text = None + if response_format_schema is not None: + response_format_text = inject_response_formats( + None, response_format_schema + ) dev_msg = get_developer_message( - instructions=request.instructions, tools=request.tools + instructions=request.instructions, + tools=request.tools if with_custom_tools else None, + response_format_section=response_format_text, ) messages.append(dev_msg) messages += construct_harmony_previous_input_messages(request) @@ -1978,7 +2083,7 @@ def _increment_sequence_number_and_return( output=[], status="in_progress", usage=None, - ).model_dump() + ) yield _increment_sequence_number_and_return( ResponseCreatedEvent( type="response.created", diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 5271a307075e..29ee6d33be0c 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -154,10 +154,18 @@ def prepare_structured_tag( self, original_tag: str | None, tool_server: ToolServer | None, + final_content_format: dict | None = None, ) -> str | None: """ - Instance method that is implemented for preparing the structured tag - Otherwise, None is returned + Instance method that is implemented for preparing the structured tag. + Otherwise, None is returned. + + Args: + original_tag: An existing structural tag string, if any. + tool_server: The tool server for builtin tool support. + final_content_format: Optional xgrammar content format dict + (e.g. json_schema, regex) to embed in the <|channel|>final + tag for constraining the model's final output region. """ return None @@ -298,7 +306,8 @@ def _decorator(obj: type[ReasoningParser]) -> type[ReasoningParser]: if isinstance(name, str): names = [name] elif is_list_of(name, str): - names = name + assert name is not None + names = list(name) else: names = [class_name] diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 89299d4b12b8..4dba7eaa3f93 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import json from collections.abc import Sequence -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from transformers import PreTrainedTokenizerBase @@ -158,30 +159,46 @@ def extract_reasoning( # This function prepares the structural tag to format reasoning output def prepare_structured_tag( - self, original_tag: str | None, tool_server: ToolServer | None + self, + original_tag: str | None, + tool_server: ToolServer | None, + final_content_format: dict | None = None, ) -> str | None: - if original_tag is None: - if tool_server is None: - return json.dumps(no_func_reasoning_tag) - else: - builtin_tool_list: list[str] = [] - if tool_server.has_tool("browser"): - builtin_tool_list.append("browser") - if tool_server.has_tool("python"): - builtin_tool_list.append("python") - if tool_server.has_tool("container"): - builtin_tool_list.append("container") - - if len(builtin_tool_list) > 0: - logger.info("Builtin_tool_list: %s", builtin_tool_list) - func_tag = json.dumps( - tag_with_builtin_funcs(no_func_reasoning_tag, builtin_tool_list) - ) - else: - logger.info("Builtin_tool_list is empty") - func_tag = json.dumps(no_func_reasoning_tag) - - return func_tag - else: + if original_tag is not None: # There is potential risk for appending the tag to the original tag return original_tag + + tag: dict[str, Any] + if tool_server is None: + tag = copy.deepcopy(no_func_reasoning_tag) + else: + builtin_tool_list: list[str] = [] + if tool_server.has_tool("browser"): + builtin_tool_list.append("browser") + if tool_server.has_tool("python"): + builtin_tool_list.append("python") + if tool_server.has_tool("container"): + builtin_tool_list.append("container") + + if len(builtin_tool_list) > 0: + logger.info("Builtin_tool_list: %s", builtin_tool_list) + tag = tag_with_builtin_funcs(no_func_reasoning_tag, builtin_tool_list) + else: + logger.info("Builtin_tool_list is empty") + tag = copy.deepcopy(no_func_reasoning_tag) + + # If a content constraint is requested for the final channel, + # add a triggered tag for <|channel|>final with that constraint. + # This ensures grammar enforcement only applies within the final + # output region, not during reasoning. + if final_content_format is not None: + tag["format"]["triggers"].append("<|channel|>final") + tag["format"]["tags"].append( + { + "begin": "<|channel|>final<|message|>", + "content": final_content_format, + "end": "<|end|>", + } + ) + + return json.dumps(tag)