124 changes: 70 additions & 54 deletions vllm/entrypoints/openai/serving_responses.py
@@ -227,6 +227,29 @@ def _validate_generator_input(
)
return None

def _validate_create_responses_input(
self, request: ResponsesRequest
) -> ErrorResponse | None:
if self.use_harmony and request.is_include_output_logprobs():
return self.create_error_response(
err_type="invalid_request_error",
message="logprobs are not supported with gpt-oss models",
status_code=HTTPStatus.BAD_REQUEST,
)
if request.store and not self.enable_store and request.background:
return self.create_error_response(
err_type="invalid_request_error",
message=(
"This vLLM engine does not support `store=True` and "
"therefore does not support the background mode. To "
"enable these features, set the environment variable "
"`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
"the vLLM server."
),
status_code=HTTPStatus.BAD_REQUEST,
)
return None
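The helper above follows a validate-early pattern: it returns an ErrorResponse when the request must be rejected and None when processing may continue, so the caller only needs a single None check. The sketch below illustrates that pattern in isolation; ExampleRequest, ExampleError, and ExampleServer are hypothetical stand-ins for illustration only, not vLLM's actual classes.

from dataclasses import dataclass

@dataclass
class ExampleRequest:
    store: bool = True
    background: bool = False

@dataclass
class ExampleError:
    message: str

class ExampleServer:
    def __init__(self, enable_store: bool) -> None:
        self.enable_store = enable_store

    def _validate(self, request: ExampleRequest) -> ExampleError | None:
        # Reject only the combination that cannot be silently downgraded:
        # background mode requires stored responses.
        if request.store and not self.enable_store and request.background:
            return ExampleError("background mode requires store support")
        return None

    def handle(self, request: ExampleRequest) -> str:
        error = self._validate(request)
        if error is not None:
            return "400: " + error.message
        return "200: ok"

# Usage: a background request on a server without store support is rejected.
server = ExampleServer(enable_store=False)
print(server.handle(ExampleRequest(store=True, background=True)))   # 400: ...
print(server.handle(ExampleRequest(store=True, background=False)))  # 200: ok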

async def create_responses(
self,
request: ResponsesRequest,
@@ -240,6 +263,9 @@ async def create_responses(
if error_check_ret is not None:
logger.error("Error with model %s", error_check_ret)
return error_check_ret
maybe_validation_error = self._validate_create_responses_input(request)
if maybe_validation_error is not None:
return maybe_validation_error

# If the engine is dead, raise the engine's DEAD_ERROR.
# This is required for the streaming case, where we return a
@@ -248,31 +274,13 @@
raise self.engine_client.dead_error

if request.store and not self.enable_store:
if request.background:
return self.create_error_response(
err_type="invalid_request_error",
message=(
"This vLLM engine does not support `store=True` and "
"therefore does not support the background mode. To "
"enable these features, set the environment variable "
"`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
"the vLLM server."
),
status_code=HTTPStatus.BAD_REQUEST,
)
# Disable the store option.
# NOTE(woosuk): Although returning an error is possible, we opted
# to implicitly disable store and process the request anyway, as
# we assume most users do not intend to actually store the response
# (i.e., their request's `store=True` just because it's the default
# value).
request.store = False
if self.use_harmony and request.is_include_output_logprobs():
return self.create_error_response(
err_type="invalid_request_error",
message="logprobs are not supported with gpt-oss models",
status_code=HTTPStatus.BAD_REQUEST,
)

# Handle the previous response ID.
prev_response_id = request.previous_response_id
@@ -849,6 +857,47 @@ def _construct_input_messages(
messages.extend(request.input) # type: ignore
return messages

def _construct_harmony_system_input_message(
self, request: ResponsesRequest, with_custom_tools: bool, tool_types: list[str]
) -> OpenAIHarmonyMessage:
reasoning_effort = request.reasoning.effort if request.reasoning else None
enable_browser = (
"web_search_preview" in tool_types
and self.tool_server is not None
and self.tool_server.has_tool("browser")
)
enable_code_interpreter = (
"code_interpreter" in tool_types
and self.tool_server is not None
and self.tool_server.has_tool("python")
)
enable_container = (
"container" in tool_types
and self.tool_server is not None
and self.tool_server.has_tool("container")
)
sys_msg = get_system_message(
reasoning_effort=reasoning_effort,
browser_description=(
self.tool_server.get_tool_description("browser")
if enable_browser and self.tool_server is not None
else None
),
python_description=(
self.tool_server.get_tool_description("python")
if enable_code_interpreter and self.tool_server is not None
else None
),
container_description=(
self.tool_server.get_tool_description("container")
if enable_container and self.tool_server is not None
else None
),
instructions=request.instructions,
with_custom_tools=with_custom_tools,
)
return sys_msg
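Each built-in tool above is enabled only when two conditions hold: the request lists the corresponding tool type, and the tool server actually exposes the tool. A minimal sketch of that gating follows, assuming a hypothetical ExampleToolServer whose has_tool/get_tool_description methods mirror the calls used above; it is an illustration, not vLLM's real tool-server interface.

class ExampleToolServer:
    """Hypothetical stand-in that exposes a subset of tools by name."""

    def __init__(self, tools: dict[str, str]) -> None:
        self._tools = tools  # tool name -> description

    def has_tool(self, name: str) -> bool:
        return name in self._tools

    def get_tool_description(self, name: str) -> str:
        return self._tools[name]

def resolve_tool_descriptions(
    tool_types: list[str],
    tool_server: ExampleToolServer | None,
) -> dict[str, str | None]:
    # Map the request-facing tool type to the server-side tool name.
    mapping = {
        "web_search_preview": "browser",
        "code_interpreter": "python",
        "container": "container",
    }
    descriptions: dict[str, str | None] = {}
    for requested_type, server_name in mapping.items():
        enabled = (
            requested_type in tool_types
            and tool_server is not None
            and tool_server.has_tool(server_name)
        )
        descriptions[server_name] = (
            tool_server.get_tool_description(server_name) if enabled else None
        )
    return descriptions

# Usage: only the browser tool is both requested and available.
server = ExampleToolServer({"browser": "Search the web."})
print(resolve_tool_descriptions(["web_search_preview", "code_interpreter"], server))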

def _construct_input_messages_with_harmony(
self,
request: ResponsesRequest,
@@ -857,9 +906,7 @@ def _construct_input_messages_with_harmony(
messages: list[OpenAIHarmonyMessage] = []
if prev_response is None:
# New conversation.
reasoning_effort = request.reasoning.effort if request.reasoning else None
tool_types = [tool.type for tool in request.tools]

# Allow the MCP Tool type to enable built in tools if the
# server_label is allowlisted in
# envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
@@ -870,41 +917,10 @@
and tool.server_label in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
):
tool_types.append(tool.server_label)
enable_browser = (
"web_search_preview" in tool_types
and self.tool_server is not None
and self.tool_server.has_tool("browser")
)
enable_code_interpreter = (
"code_interpreter" in tool_types
and self.tool_server is not None
and self.tool_server.has_tool("python")
)
enable_container = (
"container" in tool_types
and self.tool_server is not None
and self.tool_server.has_tool("container")
)
with_custom_tools = has_custom_tools(tool_types)
sys_msg = get_system_message(
reasoning_effort=reasoning_effort,
browser_description=(
self.tool_server.get_tool_description("browser")
if enable_browser and self.tool_server is not None
else None
),
python_description=(
self.tool_server.get_tool_description("python")
if enable_code_interpreter and self.tool_server is not None
else None
),
container_description=(
self.tool_server.get_tool_description("container")
if enable_container and self.tool_server is not None
else None
),
instructions=request.instructions,
with_custom_tools=with_custom_tools,

sys_msg = self._construct_harmony_system_input_message(
request, with_custom_tools, tool_types
)
messages.append(sys_msg)
if with_custom_tools:
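For context on the allowlist check earlier in _construct_input_messages_with_harmony: an MCP tool contributes a built-in tool type only when its server_label appears in the configured allowlist (envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS in the real code). A small sketch of that promotion, using a hypothetical ExampleMCPTool and a plain set in place of the environment-driven allowlist:

from dataclasses import dataclass

@dataclass
class ExampleMCPTool:
    type: str          # e.g. "mcp"
    server_label: str  # e.g. "browser"

def collect_tool_types(
    tools: list[ExampleMCPTool], allowed_labels: set[str]
) -> list[str]:
    tool_types = [tool.type for tool in tools]
    for tool in tools:
        # Promote an MCP tool to a built-in tool type only when its
        # server_label is explicitly allowlisted.
        if tool.type == "mcp" and tool.server_label in allowed_labels:
            tool_types.append(tool.server_label)
    return tool_types

# Usage: "browser" is allowlisted, "internal_db" is not.
tools = [ExampleMCPTool("mcp", "browser"), ExampleMCPTool("mcp", "internal_db")]
print(collect_tool_types(tools, {"browser"}))  # ['mcp', 'mcp', 'browser']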