diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 72d468db08f6..d820b86104d2 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -28,6 +28,7 @@ def server(monkeypatch_module: pytest.MonkeyPatch):

     with monkeypatch_module.context() as m:
         m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
+        m.setenv("VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT", "1")
         with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
             yield remote_server

@@ -48,6 +49,11 @@ async def test_basic(client: OpenAI, model_name: str):
     assert response is not None
     print("response: ", response)
     assert response.status == "completed"
+    # For now, just validate that the input and output harmony messages
+    # exist, confirming the env flag passes them through properly. Actual
+    # unit tests will need to be implemented on a model-specific basis.
+    assert len(response.input_harmony_messages) != 0
+    assert len(response.output_harmony_messages) != 0


 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 9d587e866933..2fae013fac80 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -75,6 +75,9 @@ def __init__(
         available_tools: list[str],
     ):
         self._messages = messages
+        # Copy the initial input messages: self._messages is appended to as
+        # output is generated, so a snapshot of the input is needed here.
+        self.input_messages = list(messages)
         self.available_tools = available_tools
         self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 488102232562..e9ff65c45e11 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -33,6 +33,7 @@
 from openai.types.responses.response import ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
+from openai_harmony import Message
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
                       ValidationInfo, field_validator, model_validator)
 from typing_extensions import TypeAlias
@@ -1847,6 +1848,10 @@ class ResponseUsage(OpenAIBaseModel):
 class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
+    # These fields are populated only when the env flag
+    # VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT is set.
+    input_harmony_messages: Optional[list[Message]] = None
+    output_harmony_messages: Optional[list[Message]] = None
     # error: Optional[ResponseError] = None
     # incomplete_details: Optional[IncompleteDetails] = None
     instructions: Optional[str] = None
@@ -1882,12 +1887,16 @@ def from_request(
         created_time: int,
         output: list[ResponseOutputItem],
         status: ResponseStatus,
+        input_harmony_messages: Optional[list[Message]] = None,
+        output_harmony_messages: Optional[list[Message]] = None,
         usage: Optional[ResponseUsage] = None,
     ) -> "ResponsesResponse":
         return cls(
             id=request.request_id,
             created_at=created_time,
             instructions=request.instructions,
+            input_harmony_messages=input_harmony_messages,
+            output_harmony_messages=output_harmony_messages,
             metadata=request.metadata,
             model=model_name,
             output=output,
@@ -2179,7 +2188,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     to_language: Optional[str] = None
    """The language of the output audio we transcribe to.
-    Please note that this is not currently used by supported models at this 
+    Please note that this is not currently used by supported models at this
     time, but it is a placeholder for future use, matching translation api.
     """
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 4c15de303099..74110c51590f 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -439,9 +439,18 @@ async def responses_full_generator(
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))

+        output_harmony_messages = None
+        input_harmony_messages = None
         if self.use_harmony:
             assert isinstance(context, HarmonyContext)
             output = self._make_response_output_items_with_harmony(context)
+            if envs.VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT:
+                # TODO: Handle leftover parser state?
+                input_harmony_messages = context.input_messages
+                # context.messages contains both the input and the output
+                # messages, so slice off the input prefix to keep the output.
+                output_harmony_messages = context.messages[
+                    len(input_harmony_messages):]
             # TODO: these are all 0 for now!
             num_prompt_tokens = context.num_prompt_tokens
             num_generated_tokens = context.num_output_tokens
@@ -479,6 +488,8 @@ async def responses_full_generator(
             model_name=model_name,
             created_time=created_time,
             output=output,
+            input_harmony_messages=input_harmony_messages,
+            output_harmony_messages=output_harmony_messages,
             status="completed",
             usage=usage,
         )
diff --git a/vllm/envs.py b/vllm/envs.py
index 1232bd7bf963..bf5d0736aa9d 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1199,6 +1199,13 @@ def get_vllm_port() -> Optional[int]:

     "VLLM_TUNED_CONFIG_FOLDER":
     lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
+
+    # Whether to include the raw input and output Harmony messages on the
+    # Responses API response object.
+    "VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT":
+    lambda: bool(int(
+        os.getenv("VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT",
+                  "0"))),
 }

 # --8<-- [end:env-vars-definition]
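For reference, a minimal end-to-end sketch of the new flag (not part of the
patch). It assumes a vLLM server launched with
VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT=1 and serving a
Harmony-format model; the base URL, API key, model name, and prompt below are
illustrative assumptions.

from openai import OpenAI

# Point the client at the locally running vLLM OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="openai/gpt-oss-20b",  # illustrative; any Harmony model served here
    input="What is the capital of France?",
)

# The OpenAI SDK preserves response fields it does not recognize, so the two
# extra fields added by this patch are accessible directly on the object,
# exactly as the updated test_basic does.
print(len(response.input_harmony_messages))
print(len(response.output_harmony_messages))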