Skip to content

Commit 4a62415

Browse files
qandrewrtourgeman
authored and committed
[gpt-oss][1/N] EZ: refactor serving_responses for modularity (vllm-project#26948)
Signed-off-by: Andrew Xia <[email protected]>
1 parent 450a57d commit 4a62415

File tree

1 file changed

+70
-54
lines changed

1 file changed

+70
-54
lines changed

vllm/entrypoints/openai/serving_responses.py

Lines changed: 70 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,29 @@ def _validate_generator_input(
227227
)
228228
return None
229229

230+
def _validate_create_responses_input(
231+
self, request: ResponsesRequest
232+
) -> ErrorResponse | None:
233+
if self.use_harmony and request.is_include_output_logprobs():
234+
return self.create_error_response(
235+
err_type="invalid_request_error",
236+
message="logprobs are not supported with gpt-oss models",
237+
status_code=HTTPStatus.BAD_REQUEST,
238+
)
239+
if request.store and not self.enable_store and request.background:
240+
return self.create_error_response(
241+
err_type="invalid_request_error",
242+
message=(
243+
"This vLLM engine does not support `store=True` and "
244+
"therefore does not support the background mode. To "
245+
"enable these features, set the environment variable "
246+
"`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
247+
"the vLLM server."
248+
),
249+
status_code=HTTPStatus.BAD_REQUEST,
250+
)
251+
return None
252+
230253
async def create_responses(
231254
self,
232255
request: ResponsesRequest,
@@ -240,6 +263,9 @@ async def create_responses(
240263
if error_check_ret is not None:
241264
logger.error("Error with model %s", error_check_ret)
242265
return error_check_ret
266+
maybe_validation_error = self._validate_create_responses_input(request)
267+
if maybe_validation_error is not None:
268+
return maybe_validation_error
243269

244270
# If the engine is dead, raise the engine's DEAD_ERROR.
245271
# This is required for the streaming case, where we return a
@@ -248,31 +274,13 @@ async def create_responses(
248274
raise self.engine_client.dead_error
249275

250276
if request.store and not self.enable_store:
251-
if request.background:
252-
return self.create_error_response(
253-
err_type="invalid_request_error",
254-
message=(
255-
"This vLLM engine does not support `store=True` and "
256-
"therefore does not support the background mode. To "
257-
"enable these features, set the environment variable "
258-
"`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
259-
"the vLLM server."
260-
),
261-
status_code=HTTPStatus.BAD_REQUEST,
262-
)
263277
# Disable the store option.
264278
# NOTE(woosuk): Although returning an error is possible, we opted
265279
# to implicitly disable store and process the request anyway, as
266280
# we assume most users do not intend to actually store the response
267281
# (i.e., their request's `store=True` just because it's the default
268282
# value).
269283
request.store = False
270-
if self.use_harmony and request.is_include_output_logprobs():
271-
return self.create_error_response(
272-
err_type="invalid_request_error",
273-
message="logprobs are not supported with gpt-oss models",
274-
status_code=HTTPStatus.BAD_REQUEST,
275-
)
276284

277285
# Handle the previous response ID.
278286
prev_response_id = request.previous_response_id
@@ -849,6 +857,47 @@ def _construct_input_messages(
849857
messages.extend(request.input) # type: ignore
850858
return messages
851859

860+
def _construct_harmony_system_input_message(
    self, request: ResponsesRequest, with_custom_tools: bool, tool_types: list[str]
) -> OpenAIHarmonyMessage:
    """Build the Harmony system message for a new conversation.

    Wires the requested built-in tools (browser, python, container) into
    the system message when the tool server actually provides them.
    """
    effort = request.reasoning.effort if request.reasoning else None
    server = self.tool_server

    def _description(tool_type: str, tool_name: str):
        # A tool is advertised only when the request asked for it AND the
        # tool server exists AND it actually hosts that tool.
        if tool_type in tool_types and server is not None and server.has_tool(tool_name):
            return server.get_tool_description(tool_name)
        return None

    return get_system_message(
        reasoning_effort=effort,
        browser_description=_description("web_search_preview", "browser"),
        python_description=_description("code_interpreter", "python"),
        container_description=_description("container", "container"),
        instructions=request.instructions,
        with_custom_tools=with_custom_tools,
    )
900+
852901
def _construct_input_messages_with_harmony(
853902
self,
854903
request: ResponsesRequest,
@@ -857,9 +906,7 @@ def _construct_input_messages_with_harmony(
857906
messages: list[OpenAIHarmonyMessage] = []
858907
if prev_response is None:
859908
# New conversation.
860-
reasoning_effort = request.reasoning.effort if request.reasoning else None
861909
tool_types = [tool.type for tool in request.tools]
862-
863910
# Allow the MCP Tool type to enable built in tools if the
864911
# server_label is allowlisted in
865912
# envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
@@ -870,41 +917,10 @@ def _construct_input_messages_with_harmony(
870917
and tool.server_label in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
871918
):
872919
tool_types.append(tool.server_label)
873-
enable_browser = (
874-
"web_search_preview" in tool_types
875-
and self.tool_server is not None
876-
and self.tool_server.has_tool("browser")
877-
)
878-
enable_code_interpreter = (
879-
"code_interpreter" in tool_types
880-
and self.tool_server is not None
881-
and self.tool_server.has_tool("python")
882-
)
883-
enable_container = (
884-
"container" in tool_types
885-
and self.tool_server is not None
886-
and self.tool_server.has_tool("container")
887-
)
888920
with_custom_tools = has_custom_tools(tool_types)
889-
sys_msg = get_system_message(
890-
reasoning_effort=reasoning_effort,
891-
browser_description=(
892-
self.tool_server.get_tool_description("browser")
893-
if enable_browser and self.tool_server is not None
894-
else None
895-
),
896-
python_description=(
897-
self.tool_server.get_tool_description("python")
898-
if enable_code_interpreter and self.tool_server is not None
899-
else None
900-
),
901-
container_description=(
902-
self.tool_server.get_tool_description("container")
903-
if enable_container and self.tool_server is not None
904-
else None
905-
),
906-
instructions=request.instructions,
907-
with_custom_tools=with_custom_tools,
921+
922+
sys_msg = self._construct_harmony_system_input_message(
923+
request, with_custom_tools, tool_types
908924
)
909925
messages.append(sys_msg)
910926
if with_custom_tools:

0 commit comments

Comments
 (0)