diff --git a/tests/entrypoints/openai/responses/test_serving_stateless.py b/tests/entrypoints/openai/responses/test_serving_stateless.py new file mode 100644 index 000000000000..5ae50f775528 --- /dev/null +++ b/tests/entrypoints/openai/responses/test_serving_stateless.py @@ -0,0 +1,649 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Unit tests for stateless multi-turn Responses API (RFC #26934 + @grs). + +Covers: +- Protocol-level validation (pydantic model constraints on ResponsesRequest) +- State carrier injection helpers (_build_state_carrier / _extract_state_from_response) +- Error paths when enable_store=False (retrieve, cancel, previous_response_id) +- Error path when previous_response lacks a state carrier (no include= on prior turn) +- background=True rejected when previous_response is set +- utils.py skipping of state-carrier items +- construct_input_messages contract (prev_msg prepended before new turn) +""" + +from http import HTTPStatus +from unittest.mock import MagicMock + +import pytest + +# --------------------------------------------------------------------------- +# Signing key fixture — deterministic per test run +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def set_signing_key(monkeypatch): + monkeypatch.setenv("VLLM_RESPONSES_STATE_SIGNING_KEY", "cc" * 32) + import vllm.entrypoints.openai.responses.state as state_mod + + state_mod._SIGNING_KEY = None + yield + state_mod._SIGNING_KEY = None + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_minimal_serving(enable_store: bool = False): + """Return an OpenAIServingResponses instance with only the attributes + exercised by the methods under test. We bypass __init__ via __new__ and + set exactly what is needed. + + Attributes touched by retrieve_responses / cancel_responses: + enable_store, response_store, response_store_lock, background_tasks, + log_error_stack (for create_error_response from base class) + Attributes touched by create_responses (up to store check): + models (for _check_model), engine_client (for errored check) + """ + import asyncio + + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + obj = OpenAIServingResponses.__new__(OpenAIServingResponses) + obj.enable_store = enable_store + obj.response_store = {} + obj.response_store_lock = asyncio.Lock() + obj.background_tasks = {} + obj.log_error_stack = False + # _check_model needs models.is_base_model to return True (model found) + obj.models = MagicMock() + obj.models.is_base_model.return_value = True + # _validate_create_responses_input checks this before hitting the store guard + obj.use_harmony = False + return obj + + +# --------------------------------------------------------------------------- +# Protocol validation — pydantic model constraints +# --------------------------------------------------------------------------- + + +class TestProtocolValidation: + def test_previous_response_and_id_mutually_exclusive(self): + from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponsesResponse, + ) + + # model_construct bypasses field validation so we get a typed instance + # without needing all required fields — Pydantic accepts it for type checks. + fake_prev = ResponsesResponse.model_construct(id="resp_fake") + with pytest.raises(Exception, match="Cannot set both"): + ResponsesRequest( + model="test", + input="hello", + previous_response_id="resp_abc", + previous_response=fake_prev, + ) + + def test_previous_response_silently_clears_store(self): + from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponsesResponse, + ) + + fake_prev = ResponsesResponse.model_construct(id="resp_fake") + req = ResponsesRequest( + model="test", + input="hello", + store=True, + previous_response=fake_prev, + ) + assert req.store is False + + def test_background_with_previous_response_raises(self): + """background=True + previous_response must be rejected. + + The mode='before' validator allows background=True when store=True + (the default). Our mode='after' validator must then catch that + previous_response was also set and raise — otherwise the request + would produce an unretrievable background response (store gets + silently cleared to False, but background remains True). + """ + from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponsesResponse, + ) + + fake_prev = ResponsesResponse.model_construct(id="resp_fake") + with pytest.raises(Exception, match="background"): + ResponsesRequest( + model="test", + input="hello", + background=True, + previous_response=fake_prev, + ) + + +# --------------------------------------------------------------------------- +# State carrier round-trip via serving helpers +# (thin wrappers over state.py — verified here to confirm the integration) +# --------------------------------------------------------------------------- + + +class TestStateCarrierHelpers: + def test_build_and_extract_roundtrip(self): + from openai.types.responses.response_reasoning_item import ResponseReasoningItem + + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving() + messages = [ + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": "hello"}, + ] + + carrier = OpenAIServingResponses._build_state_carrier( + serving, messages, "test_req_id_12345" + ) + + assert isinstance(carrier, ResponseReasoningItem) + assert carrier.type == "reasoning" + assert carrier.status == "completed" + assert carrier.encrypted_content.startswith("vllm:1:") + + mock_response = MagicMock() + mock_response.output = [carrier] + + recovered = OpenAIServingResponses._extract_state_from_response( + serving, mock_response + ) + assert recovered == messages + + def test_extract_returns_none_when_no_carrier(self): + from openai.types.responses import ResponseOutputMessage, ResponseOutputText + + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving() + + text = ResponseOutputText(type="output_text", text="hi", annotations=[]) + msg = ResponseOutputMessage( + id="msg_1", + type="message", + role="assistant", + content=[text], + status="completed", + ) + mock_response = MagicMock() + mock_response.output = [msg] + + assert ( + OpenAIServingResponses._extract_state_from_response(serving, mock_response) + is None + ) + + def test_extract_raises_on_tampered_carrier(self): + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving() + carrier = OpenAIServingResponses._build_state_carrier( + serving, [{"role": "user", "content": "hi"}], "req" + ) + + parts = carrier.encrypted_content.split(":", 3) + parts[3] = "0" * 64 + carrier.encrypted_content = ":".join(parts) + + mock_response = MagicMock() + mock_response.output = [carrier] + + with pytest.raises(ValueError, match="HMAC verification failed"): + OpenAIServingResponses._extract_state_from_response(serving, mock_response) + + def test_restore_harmony_state_messages_roundtrips_renderably(self): + from openai_harmony import Author, DeveloperContent, Message, Role, TextContent + + from vllm.entrypoints.openai.parser.harmony_utils import render_for_completion + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + from vllm.entrypoints.openai.responses.state import ( + deserialize_state, + serialize_state, + ) + + original = [ + Message( + author=Author(role=Role.DEVELOPER), + content=[DeveloperContent(instructions="Be concise")], + ), + Message( + author=Author(role=Role.USER), + content=[TextContent(text="Hello")], + ), + ] + + recovered = deserialize_state(serialize_state(original)) + restored = OpenAIServingResponses._restore_harmony_state_messages(recovered) + + assert [message.to_dict() for message in restored] == [ + message.to_dict() for message in original + ] + assert render_for_completion(restored) + + +# --------------------------------------------------------------------------- +# Error paths when enable_store=False +# --------------------------------------------------------------------------- + + +class TestStorelessErrorPaths: + @pytest.mark.asyncio + async def test_retrieve_without_store_returns_501(self): + from vllm.entrypoints.openai.engine.protocol import ErrorResponse + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving(enable_store=False) + result = await OpenAIServingResponses.retrieve_responses( + serving, "resp_123", None, False + ) + + assert isinstance(result, ErrorResponse) + assert result.error.code == HTTPStatus.NOT_IMPLEMENTED + assert "VLLM_ENABLE_RESPONSES_API_STORE" in result.error.message + + @pytest.mark.asyncio + async def test_cancel_without_store_unknown_id_returns_404(self): + """With store off and no matching background task, cancel returns 404.""" + from vllm.entrypoints.openai.engine.protocol import ErrorResponse + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving(enable_store=False) + result = await OpenAIServingResponses.cancel_responses(serving, "resp_123") + + assert isinstance(result, ErrorResponse) + assert result.error.code == HTTPStatus.NOT_FOUND + + @pytest.mark.asyncio + async def test_cancel_without_store_active_task_returns_400(self): + """With store off but a matching in-flight task, cancel cancels it and + returns 400 (not 404, not 501) — the task was cancelled but no stored + response object can be returned in stateless mode. + + This is the success branch of the stateless cancel path and was the + actual behavior added by the review fix. + """ + import asyncio + + from vllm.entrypoints.openai.engine.protocol import ErrorResponse + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving(enable_store=False) + + # Simulate an in-flight background task that blocks until cancelled. + async def _blocking(): + await asyncio.sleep(9999) + + task = asyncio.ensure_future(_blocking()) + # Yield once so the task starts and reaches its first await point; + # otherwise cancel() on an unstarted task may mark it cancelled before + # the coroutine body ever runs, but task.cancelled() is still True. + await asyncio.sleep(0) + + serving.background_tasks["resp_inflight"] = task + + result = await OpenAIServingResponses.cancel_responses(serving, "resp_inflight") + + assert isinstance(result, ErrorResponse) + assert result.error.code == HTTPStatus.BAD_REQUEST + assert "stateless mode" in result.error.message + assert task.cancelled(), "task was not cancelled" + + @pytest.mark.asyncio + async def test_previous_response_without_carrier_returns_400(self): + """previous_response with no state carrier + store disabled → 400. + + Regression test for P1 review finding: if a client passes + previous_response but the prior turn was generated without + include=['reasoning.encrypted_content'], _extract_state_from_response + returns None. With store disabled there is no msg_store fallback, so + we must return a clear 400 rather than letting msg_store[id] raise + KeyError → 500. + """ + from openai.types.responses import ResponseOutputMessage, ResponseOutputText + + from vllm.entrypoints.openai.engine.protocol import ErrorResponse + from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponsesResponse, + ) + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + # Build a previous response whose output has NO state carrier item. + text = ResponseOutputText(type="output_text", text="Hello!", annotations=[]) + msg = ResponseOutputMessage( + id="msg_1", + type="message", + role="assistant", + content=[text], + status="completed", + ) + prev_resp = ResponsesResponse.model_construct(id="resp_old", output=[msg]) + + serving = _make_minimal_serving(enable_store=False) + serving.engine_client = MagicMock() + serving.engine_client.errored = False + + req = ResponsesRequest( + model="test", + input="What is my name?", + store=False, + previous_response=prev_resp, + ) + + result = await OpenAIServingResponses.create_responses(serving, req) + + assert isinstance(result, ErrorResponse) + assert result.error.code == HTTPStatus.BAD_REQUEST + assert "state carrier" in result.error.message + + @pytest.mark.asyncio + async def test_previous_response_without_carrier_store_enabled_returns_400(self): + """previous_response with no state carrier returns 400 even when store is on. + + Regression for Codex P1: the original guard was gated on + `not self.enable_store`, so with store=True and a carrierless + previous_response the code fell through to msg_store[prev_response.id] + which raised KeyError → 500. previous_response always means stateless + path; carrier absence is always a client error. + """ + from openai.types.responses import ResponseOutputMessage, ResponseOutputText + + from vllm.entrypoints.openai.engine.protocol import ErrorResponse + from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponsesResponse, + ) + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + text = ResponseOutputText(type="output_text", text="Hello!", annotations=[]) + msg = ResponseOutputMessage( + id="msg_1", + type="message", + role="assistant", + content=[text], + status="completed", + ) + prev_resp = ResponsesResponse.model_construct(id="resp_old", output=[msg]) + + # store is ENABLED — this is the case the original guard missed + serving = _make_minimal_serving(enable_store=True) + serving.engine_client = MagicMock() + serving.engine_client.errored = False + + req = ResponsesRequest( + model="test", + input="What is my name?", + store=False, + previous_response=prev_resp, + ) + + result = await OpenAIServingResponses.create_responses(serving, req) + + assert isinstance(result, ErrorResponse) + assert result.error.code == HTTPStatus.BAD_REQUEST + assert "state carrier" in result.error.message + + @pytest.mark.asyncio + async def test_previous_response_id_without_store_returns_400(self): + """create_responses should reject previous_response_id when store is off.""" + from vllm.entrypoints.openai.engine.protocol import ErrorResponse + from vllm.entrypoints.openai.responses.protocol import ResponsesRequest + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving(enable_store=False) + # Minimal extra attributes that create_responses reads before the store check + serving.engine_client = MagicMock() + serving.engine_client.errored = False + + req = ResponsesRequest( + model="test", + input="hello", + previous_response_id="resp_old", + store=False, + ) + + result = await OpenAIServingResponses.create_responses(serving, req) + + assert isinstance(result, ErrorResponse) + assert result.error.code == HTTPStatus.BAD_REQUEST + assert "VLLM_ENABLE_RESPONSES_API_STORE" in result.error.message + + +# --------------------------------------------------------------------------- +# utils.py — state-carrier items are transparently skipped +# --------------------------------------------------------------------------- + + +class TestUtilsStateCarrierSkipping: + def test_construct_chat_messages_skips_state_carrier(self): + from openai.types.responses import ResponseOutputMessage, ResponseOutputText + from openai.types.responses.response_reasoning_item import ResponseReasoningItem + + from vllm.entrypoints.openai.responses.state import serialize_state + from vllm.entrypoints.openai.responses.utils import ( + construct_chat_messages_with_tool_call, + ) + + blob = serialize_state([{"role": "user", "content": "prev"}]) + carrier = ResponseReasoningItem( + id="rs_state_abc", + type="reasoning", + summary=[], + status="completed", + encrypted_content=blob, + ) + text = ResponseOutputText(type="output_text", text="Hello!", annotations=[]) + msg = ResponseOutputMessage( + id="msg_1", + type="message", + role="assistant", + content=[text], + status="completed", + ) + + result = construct_chat_messages_with_tool_call([carrier, msg]) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert result[0]["content"] == "Hello!" + + def test_single_message_from_state_carrier_is_none(self): + from openai.types.responses.response_reasoning_item import ResponseReasoningItem + + from vllm.entrypoints.openai.responses.state import serialize_state + from vllm.entrypoints.openai.responses.utils import ( + _construct_single_message_from_response_item, + ) + + blob = serialize_state([{"role": "user", "content": "hi"}]) + carrier = ResponseReasoningItem( + id="rs_state_xyz", + type="reasoning", + summary=[], + status="completed", + encrypted_content=blob, + ) + assert _construct_single_message_from_response_item(carrier) is None + + def test_external_encrypted_content_still_raises(self): + from openai.types.responses.response_reasoning_item import ResponseReasoningItem + + from vllm.entrypoints.openai.responses.utils import ( + _construct_single_message_from_response_item, + ) + + external = ResponseReasoningItem( + id="rs_ext", + type="reasoning", + summary=[], + status="completed", + encrypted_content="opaque-blob-not-from-vllm", + ) + with pytest.raises(ValueError, match="not supported"): + _construct_single_message_from_response_item(external) + + +# --------------------------------------------------------------------------- +# construct_input_messages with prev_messages override +# --------------------------------------------------------------------------- + + +class TestNoDuplicateAssistantTurn: + def test_stateless_path_does_not_duplicate_assistant_output(self): + """Bug 1 regression: when prev_messages (from the state carrier) + already contains the assistant turn, _make_request must NOT also pass + prev_response_output, otherwise construct_input_messages appends the + assistant content twice. + """ + from vllm.entrypoints.openai.responses.utils import construct_input_messages + + # Simulate what the carrier stores: full history including assistant reply. + prev = [ + {"role": "user", "content": "Who are you?"}, + {"role": "assistant", "content": "I am helpful."}, + ] + + # When using the stateless path (prev_messages is not None), + # prev_response_output must be None — otherwise the assistant turn + # appears twice. + result = construct_input_messages( + request_instructions=None, + request_input="What did you say?", + prev_msg=prev, + prev_response_output=None, + ) + + assistant_msgs = [m for m in result if m.get("role") == "assistant"] + assert len(assistant_msgs) == 1, ( + f"Expected 1 assistant message, got {len(assistant_msgs)}: {assistant_msgs}" + ) + + # Verify the full message list order. + assert result[0] == {"role": "user", "content": "Who are you?"} + assert result[1] == {"role": "assistant", "content": "I am helpful."} + assert result[2] == {"role": "user", "content": "What did you say?"} + + def test_passing_both_prev_msg_and_output_causes_duplicate(self): + """Verify the bug scenario: passing both prev_msg AND prev_response_output + produces duplicate assistant messages (this is what we fixed). + """ + from openai.types.responses import ResponseOutputMessage, ResponseOutputText + + from vllm.entrypoints.openai.responses.utils import construct_input_messages + + prev = [ + {"role": "user", "content": "Who are you?"}, + {"role": "assistant", "content": "I am helpful."}, + ] + + text = ResponseOutputText( + type="output_text", text="I am helpful.", annotations=[] + ) + msg = ResponseOutputMessage( + id="msg_1", + type="message", + role="assistant", + content=[text], + status="completed", + ) + + # This is the BROKEN scenario — passing both leads to duplication. + result = construct_input_messages( + request_instructions=None, + request_input="What did you say?", + prev_msg=prev, + prev_response_output=[msg], + ) + + assistant_msgs = [m for m in result if m.get("role") == "assistant"] + assert len(assistant_msgs) == 2, ( + "Expected duplicate assistant messages when both prev_msg and " + "prev_response_output are passed" + ) + + +class TestTamperedCarrierReturns400: + @pytest.mark.asyncio + async def test_tampered_carrier_returns_400_not_500(self): + """Bug 2 regression: a tampered state carrier must return 400, not 500. + + _extract_state_from_response raises ValueError on HMAC mismatch. + Before the fix, this call was outside the try/except block, producing + an unhandled 500. + """ + from vllm.entrypoints.openai.engine.protocol import ErrorResponse + from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponsesResponse, + ) + from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses + + serving = _make_minimal_serving(enable_store=False) + serving.engine_client = MagicMock() + serving.engine_client.errored = False + + # Build a valid carrier then tamper with the HMAC. + carrier = OpenAIServingResponses._build_state_carrier( + serving, [{"role": "user", "content": "hi"}], "req" + ) + parts = carrier.encrypted_content.split(":", 3) + parts[3] = "0" * 64 + carrier.encrypted_content = ":".join(parts) + + prev_resp = ResponsesResponse.model_construct( + id="resp_tampered", output=[carrier] + ) + + req = ResponsesRequest( + model="test", + input="Follow up", + store=False, + previous_response=prev_resp, + ) + + result = await OpenAIServingResponses.create_responses(serving, req) + + assert isinstance(result, ErrorResponse) + assert result.error.code == HTTPStatus.BAD_REQUEST + assert "tampered" in result.error.message + + +class TestPrevMessagesOverride: + def test_construct_input_messages_prepends_prev_msg(self): + """construct_input_messages correctly prepends a deserialized history + list (prev_msg) before the new user turn. + + This is the contract the stateless path relies on: after extracting + message history from the encrypted_content carrier, the history is + passed as prev_msg and must appear in order before the new input. + """ + from vllm.entrypoints.openai.responses.utils import construct_input_messages + + prev = [ + {"role": "user", "content": "Who are you?"}, + {"role": "assistant", "content": "I am helpful."}, + ] + result = construct_input_messages( + request_instructions=None, + request_input="What did you say?", + prev_msg=prev, + prev_response_output=None, + ) + assert result[0] == {"role": "user", "content": "Who are you?"} + assert result[1] == {"role": "assistant", "content": "I am helpful."} + assert result[2] == {"role": "user", "content": "What did you say?"} diff --git a/tests/entrypoints/openai/responses/test_state.py b/tests/entrypoints/openai/responses/test_state.py new file mode 100644 index 000000000000..2abcf7719688 --- /dev/null +++ b/tests/entrypoints/openai/responses/test_state.py @@ -0,0 +1,272 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for the stateless Responses API state carrier (state.py).""" + +import pytest + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _reset_signing_key(): + """Force state.py to re-derive the signing key on next call.""" + import vllm.entrypoints.openai.responses.state as state_mod + + state_mod._SIGNING_KEY = None + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def isolated_key(monkeypatch): + """Each test gets a fresh, deterministic signing key.""" + monkeypatch.setenv( + "VLLM_RESPONSES_STATE_SIGNING_KEY", + "ab" * 32, # 64 hex chars = 32 bytes + ) + _reset_signing_key() + yield + _reset_signing_key() + + +# --------------------------------------------------------------------------- +# Import after env is patched (in case module was already imported) +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def state(): + import vllm.entrypoints.openai.responses.state as m + + return m + + +# --------------------------------------------------------------------------- +# Round-trip tests +# --------------------------------------------------------------------------- + + +def test_roundtrip_plain_dicts(state): + messages = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + ] + blob = state.serialize_state(messages) + recovered = state.deserialize_state(blob) + assert recovered == messages + + +def test_roundtrip_empty_list(state): + blob = state.serialize_state([]) + recovered = state.deserialize_state(blob) + assert recovered == [] + + +def test_roundtrip_nested_structure(state): + messages = [ + { + "role": "user", + "content": [{"type": "text", "text": "What is 2+2?"}], + }, + {"role": "assistant", "content": "4", "extra": {"key": [1, 2, 3]}}, + ] + blob = state.serialize_state(messages) + recovered = state.deserialize_state(blob) + assert recovered == messages + + +def test_roundtrip_pydantic_model(state): + """Objects with model_dump() should serialize transparently.""" + + class FakeModel: + def model_dump(self): + return {"author": {"role": "user"}, "content": "hi"} + + messages = [FakeModel()] + blob = state.serialize_state(messages) + recovered = state.deserialize_state(blob) + # After JSON round-trip, FakeModel becomes a plain dict + assert recovered == [{"author": {"role": "user"}, "content": "hi"}] + + +def test_roundtrip_openai_harmony_messages_uses_wire_format(state): + from openai_harmony import Author, DeveloperContent, Message, Role, TextContent + + messages = [ + Message( + author=Author(role=Role.DEVELOPER), + content=[DeveloperContent(instructions="Be concise")], + ), + Message( + author=Author(role=Role.USER), + content=[TextContent(text="Hello")], + ), + ] + + blob = state.serialize_state(messages) + recovered = state.deserialize_state(blob) + + assert recovered == [message.to_dict() for message in messages] + + +# --------------------------------------------------------------------------- +# is_state_carrier +# --------------------------------------------------------------------------- + + +def test_is_state_carrier_true(state): + blob = state.serialize_state([{"role": "user", "content": "hi"}]) + + class FakeItem: + encrypted_content = blob + + assert state.is_state_carrier(FakeItem()) + + +def test_is_state_carrier_false_external(state): + """Real encrypted_content from external models should not be detected.""" + + class FakeItem: + encrypted_content = "some-opaque-blob-from-openai" + + assert not state.is_state_carrier(FakeItem()) + + +def test_is_state_carrier_false_no_field(state): + class FakeItem: + pass + + assert not state.is_state_carrier(FakeItem()) + + +def test_is_state_carrier_false_none(state): + class FakeItem: + encrypted_content = None + + assert not state.is_state_carrier(FakeItem()) + + +# --------------------------------------------------------------------------- +# deserialize_state — non-carrier inputs +# --------------------------------------------------------------------------- + + +def test_deserialize_returns_none_for_non_carrier(state): + assert state.deserialize_state("some-random-opaque-string") is None + + +def test_deserialize_returns_none_for_empty_string(state): + assert state.deserialize_state("") is None + + +# --------------------------------------------------------------------------- +# HMAC tamper detection +# --------------------------------------------------------------------------- + + +def test_tampered_payload_raises(state): + blob = state.serialize_state([{"role": "user", "content": "original"}]) + # Corrupt the payload part (index 2 when split on ':') + parts = blob.split(":", 3) + assert len(parts) == 4 + parts[2] = parts[2][:-4] + "XXXX" # corrupt the base64 payload + tampered = ":".join(parts) + with pytest.raises(ValueError, match="HMAC verification failed"): + state.deserialize_state(tampered) + + +def test_tampered_sig_raises(state): + blob = state.serialize_state([{"role": "user", "content": "hello"}]) + parts = blob.split(":", 3) + parts[3] = "0" * 64 # replace HMAC with zeros + tampered = ":".join(parts) + with pytest.raises(ValueError, match="HMAC verification failed"): + state.deserialize_state(tampered) + + +def test_malformed_carrier_raises(state): + malformed = "vllm:1:onlythreeparts" + with pytest.raises(ValueError, match="Malformed vLLM state carrier"): + state.deserialize_state(malformed) + + +# --------------------------------------------------------------------------- +# Cross-key incompatibility +# --------------------------------------------------------------------------- + + +def test_different_keys_are_incompatible(monkeypatch): + """A blob signed with key A must not validate with key B.""" + import vllm.entrypoints.openai.responses.state as state_mod + + monkeypatch.setenv("VLLM_RESPONSES_STATE_SIGNING_KEY", "aa" * 32) + state_mod._SIGNING_KEY = None + blob = state_mod.serialize_state([{"role": "user", "content": "secret"}]) + + # Switch to a different key + monkeypatch.setenv("VLLM_RESPONSES_STATE_SIGNING_KEY", "bb" * 32) + state_mod._SIGNING_KEY = None + + with pytest.raises(ValueError, match="HMAC verification failed"): + state_mod.deserialize_state(blob) + + +# --------------------------------------------------------------------------- +# Random-key warning (no env var) +# --------------------------------------------------------------------------- + + +def test_no_env_var_generates_random_key(monkeypatch): + """Without the env var, a random 32-byte key is generated. + + The warning is emitted via vLLM's logger (visible in test output) but is + not capturable via capsys/caplog since vLLM writes to sys.__stdout__ directly. + """ + import vllm.entrypoints.openai.responses.state as state_mod + + monkeypatch.delenv("VLLM_RESPONSES_STATE_SIGNING_KEY", raising=False) + state_mod._SIGNING_KEY = None + + key = state_mod._get_signing_key() + + assert key is not None + assert len(key) == 32 + # A second call returns the same cached key (warning only fires once) + key2 = state_mod._get_signing_key() + assert key == key2 + + +# --------------------------------------------------------------------------- +# Invalid hex key +# --------------------------------------------------------------------------- + + +def test_invalid_hex_key_raises(monkeypatch): + import vllm.entrypoints.openai.responses.state as state_mod + + monkeypatch.setenv("VLLM_RESPONSES_STATE_SIGNING_KEY", "not-valid-hex!!") + state_mod._SIGNING_KEY = None + + with pytest.raises(ValueError, match="valid hex string"): + state_mod._get_signing_key() + + +def test_short_key_raises(monkeypatch): + """A key shorter than 32 bytes (64 hex chars) must be rejected. + + Short HMAC keys weaken tamper protection; enforce minimum length so + misconfigured deployments fail loudly rather than silently degrading + security. 'aa' decodes to 1 byte — well below the 32-byte minimum. + """ + import vllm.entrypoints.openai.responses.state as state_mod + + monkeypatch.setenv("VLLM_RESPONSES_STATE_SIGNING_KEY", "aa" * 4) # 4 bytes + state_mod._SIGNING_KEY = None + + with pytest.raises(ValueError, match="minimum of 32 bytes"): + state_mod._get_signing_key() diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py index a5f62bdd8c39..84e611c13d6f 100644 --- a/vllm/entrypoints/openai/responses/protocol.py +++ b/vllm/entrypoints/openai/responses/protocol.py @@ -236,6 +236,13 @@ class ResponsesRequest(OpenAIBaseModel): # this cannot be used in conjunction with previous_response_id # TODO: consider supporting non harmony messages as well previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None + # Accept full previous response for stateless multi-turn (RFC #26934 + @grs). + # The client stores the full response object and passes it back on the next + # turn instead of a previous_response_id. vLLM extracts the Harmony message + # history from the encrypted_content state carrier embedded in the response + # output, so no server-side store is required. + # Cannot be set together with previous_response_id. + previous_response: "ResponsesResponse | None" = None structured_outputs: StructuredOutputsParams | None = Field( default=None, description="Additional kwargs for structured outputs", @@ -382,6 +389,33 @@ def is_include_output_logprobs(self) -> bool: and "message.output_text.logprobs" in self.include ) + @model_validator(mode="after") + def validate_previous_response_xor_id(self) -> "ResponsesRequest": + if self.previous_response_id and self.previous_response is not None: + raise ValueError( + "Cannot set both 'previous_response_id' and 'previous_response'. " + "Use 'previous_response_id' for store-backed multi-turn, or " + "'previous_response' (with include=['reasoning.encrypted_content'] " + "and store=false) for stateless multi-turn." + ) + if self.previous_response is not None: + if self.background: + # background mode requires store=True to retrieve the response + # later, but the stateless path forces store=False. Raise + # explicitly rather than silently producing an unretrievable + # background response. + raise ValueError( + "'background' mode cannot be used with 'previous_response'. " + "Stateless multi-turn (previous_response + " + "include=['reasoning.encrypted_content']) does not support " + "background responses." + ) + if self.store: + # Stateless path: store is meaningless (and would cause + # confusion). Mirror the silent-disable in create_responses. + self.store = False + return self + @model_validator(mode="before") @classmethod def validate_background(cls, data): @@ -652,3 +686,8 @@ class ResponseInProgressEvent(OpenAIResponseInProgressEvent): | ResponseMcpCallInProgressEvent | ResponseMcpCallCompletedEvent ) + +# Resolve forward reference: ResponsesRequest.previous_response -> ResponsesResponse +# Both classes are defined in this module; model_rebuild() is needed because +# ResponsesResponse did not exist when ResponsesRequest was first evaluated. +ResponsesRequest.model_rebuild() diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 574282c4cdc6..9bcb9c1c437c 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -11,6 +11,7 @@ from http import HTTPStatus from typing import Any, Final +import jinja2 from fastapi import Request from openai.types.responses import ( ResponseContentPartAddedEvent, @@ -321,6 +322,14 @@ def _validate_create_responses_input( status_code=HTTPStatus.BAD_REQUEST, param="previous_response_id", ) + if request.previous_input_messages and request.previous_response is not None: + return self.create_error_response( + err_type="invalid_request_error", + message="Only one of `previous_input_messages` and " + "`previous_response` can be set.", + status_code=HTTPStatus.BAD_REQUEST, + param="previous_response", + ) return None async def create_responses( @@ -355,25 +364,92 @@ async def create_responses( # value). request.store = False - # Handle the previous response ID. - prev_response_id = request.previous_response_id - if prev_response_id is not None: + # Resolve the previous response (store-backed or stateless). + prev_response: ResponsesResponse | None = None + if request.previous_response_id is not None: + if not self.enable_store: + return self.create_error_response( + err_type="invalid_request_error", + message=( + "`previous_response_id` requires the response store to be " + "enabled (VLLM_ENABLE_RESPONSES_API_STORE=1). " + "For stateless multi-turn without a persistent store, pass " + "the full previous response object via `previous_response` " + "with `include=['reasoning.encrypted_content']` and " + "`store=false`." + ), + status_code=HTTPStatus.BAD_REQUEST, + ) async with self.response_store_lock: - prev_response = self.response_store.get(prev_response_id) + prev_response = self.response_store.get(request.previous_response_id) if prev_response is None: - return self._make_not_found_error(prev_response_id) - else: - prev_response = None + return self._make_not_found_error(request.previous_response_id) + elif request.previous_response is not None: + prev_response = request.previous_response + + # For stateless multi-turn: extract Harmony/chat messages from the + # encrypted_content state carrier embedded in the previous response. + # This replaces the msg_store lookup for subsequent turns. + prev_messages_from_state: list | None = None + if prev_response is not None and request.previous_response is not None: + try: + prev_messages_from_state = self._extract_state_from_response( + prev_response + ) + except ValueError as exc: + return self.create_error_response( + err_type="invalid_request_error", + message=( + "The state carrier in 'previous_response' is invalid " + f"({exc}). The data may have been tampered with. " + "Ensure VLLM_RESPONSES_STATE_SIGNING_KEY is consistent " + "across requests." + ), + status_code=HTTPStatus.BAD_REQUEST, + ) + if prev_messages_from_state is None: + # The caller passed previous_response but the prior response + # contains no vLLM state carrier — most likely because + # include=['reasoning.encrypted_content'] was omitted on the + # prior turn. previous_response always implies the stateless + # path; there is no msg_store fallback regardless of + # enable_store (msg_store is keyed by response ID, not carried + # in the response object). Return a clear 400 rather than + # letting msg_store[prev_response.id] raise a KeyError 500. + return self.create_error_response( + err_type="invalid_request_error", + message=( + "The provided 'previous_response' does not contain a " + "vLLM state carrier. To use stateless multi-turn, " + "generate the previous response with " + "include=['reasoning.encrypted_content'] and " + "store=false." + ), + status_code=HTTPStatus.BAD_REQUEST, + ) - lora_request = self._maybe_get_adapters(request) - model_name = self.models.model_name(lora_request) + try: + lora_request = self._maybe_get_adapters(request) + model_name = self.models.model_name(lora_request) - if self.use_harmony: - messages, engine_prompts = self._make_request_with_harmony( - request, prev_response - ) - else: - messages, engine_prompts = await self._make_request(request, prev_response) + if self.use_harmony: + messages, engine_prompts = self._make_request_with_harmony( + request, prev_response, prev_messages_from_state + ) + else: + messages, engine_prompts = await self._make_request( + request, prev_response, prev_messages_from_state + ) + + except ( + ValueError, + TypeError, + RuntimeError, + jinja2.TemplateError, + NotImplementedError, + ) as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(e) request_metadata = RequestResponseMetadata(request_id=request.request_id) if raw_request: @@ -560,30 +636,45 @@ async def create_responses( model_name, tokenizer, request_metadata, + messages=messages, ) - return await self.responses_full_generator( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - ) + try: + return await self.responses_full_generator( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + messages=messages, + ) + except Exception as e: + return self.create_error_response(e) async def _make_request( self, request: ResponsesRequest, prev_response: ResponsesResponse | None, + prev_messages: list | None = None, ): tool_dicts = construct_tool_dicts(request.tools, request.tool_choice) # Construct the input messages. + # For stateless multi-turn, prev_messages comes from the encrypted_content + # state carrier; otherwise fall back to the in-memory msg_store. + prev_msg = ( + prev_messages + if prev_messages is not None + else (self.msg_store.get(prev_response.id) if prev_response else None) + ) messages = construct_input_messages( request_instructions=request.instructions, request_input=request.input, - prev_msg=self.msg_store.get(prev_response.id) if prev_response else None, - prev_response_output=prev_response.output if prev_response else None, + prev_msg=prev_msg, + prev_response_output=prev_response.output + if prev_response and prev_messages is None + else None, ) _, engine_prompts = await self.openai_serving_render.preprocess_chat( @@ -704,13 +795,16 @@ def _make_request_with_harmony( self, request: ResponsesRequest, prev_response: ResponsesResponse | None, + prev_messages: list | None = None, ): if request.tool_choice != "auto": raise NotImplementedError( "Only 'auto' tool_choice is supported in response API with Harmony" ) - messages = self._construct_input_messages_with_harmony(request, prev_response) + messages = self._construct_input_messages_with_harmony( + request, prev_response, prev_messages + ) prompt_token_ids = render_for_completion(messages) engine_prompt = token_inputs(prompt_token_ids) @@ -746,6 +840,7 @@ async def responses_full_generator( tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, created_time: int | None = None, + messages: list | None = None, ) -> ErrorResponse | ResponsesResponse: if created_time is None: created_time = int(time.time()) @@ -875,6 +970,40 @@ async def responses_full_generator( usage=usage, ) + # Inject state carrier for stateless multi-turn (RFC #26934 + @grs). + # When the client requests encrypted_content inclusion and has opted out + # of server-side storage, embed a signed blob of the full message history + # into a synthetic ReasoningItem appended to response.output. The client + # stores the response opaquely and passes it back as `previous_response` + # on the next turn; vLLM then deserializes the blob to recover history. + if ( + request.include + and "reasoning.encrypted_content" in request.include + and not request.store + ): + carrier_messages: list | None = None + if self.use_harmony and isinstance(context, HarmonyContext): + # Harmony context already contains the full history including + # the assistant turn that was just generated. + carrier_messages = list(context.messages) + elif messages is not None: + # Non-Harmony path: `messages` is the input to the LLM for + # this turn. Append the assistant's response output so the + # carrier contains the complete history (input + response) + # for the next turn to build on. + from vllm.entrypoints.openai.responses.utils import ( + construct_chat_messages_with_tool_call, + ) + + carrier_messages = messages + construct_chat_messages_with_tool_call( + response.output + ) + if carrier_messages is not None: + state_carrier = self._build_state_carrier( + carrier_messages, request.request_id + ) + response.output.append(state_carrier) + if request.store: async with self.response_store_lock: stored_response = self.response_store.get(response.id) @@ -1125,6 +1254,7 @@ def _construct_input_messages_with_harmony( self, request: ResponsesRequest, prev_response: ResponsesResponse | None, + prev_messages: list | None = None, ) -> list[OpenAIHarmonyMessage]: messages: list[OpenAIHarmonyMessage] = [] if prev_response is None: @@ -1147,7 +1277,20 @@ def _construct_input_messages_with_harmony( # Continue the previous conversation. # FIXME(woosuk): Currently, request params like reasoning and # instructions are ignored. - prev_msgs = self.msg_store[prev_response.id] + if prev_messages is not None: + # Stateless path: messages came from the encrypted_content + # state carrier; restore wire-format dicts back to + # OpenAIHarmonyMessage via the library's typed constructor. + prev_msgs = self._restore_harmony_state_messages(prev_messages) + else: + stored = self.msg_store.get(prev_response.id) + if stored is None: + raise ValueError( + f"No stored messages found for response " + f"'{prev_response.id}'. The response may have " + f"expired or the store may be inconsistent." + ) + prev_msgs = stored # FIXME(woosuk): The slice-delete-reappend cycle below is # currently a no-op --- it removes messages then puts them all @@ -1200,6 +1343,15 @@ def _construct_input_messages_with_harmony( prev_outputs.append(response_msg) return messages + @staticmethod + def _restore_harmony_state_messages( + prev_messages: list, + ) -> list[OpenAIHarmonyMessage]: + return [ + OpenAIHarmonyMessage.from_dict(m) if isinstance(m, dict) else m + for m in prev_messages + ] + async def _run_background_request_stream( self, request: ResponsesRequest, @@ -1273,6 +1425,19 @@ async def retrieve_responses( | ResponsesResponse | AsyncGenerator[StreamingResponsesResponse, None] ): + if not self.enable_store: + return self.create_error_response( + err_type="invalid_request_error", + message=( + "Response retrieval requires the response store to be enabled " + "(VLLM_ENABLE_RESPONSES_API_STORE=1). " + "For stateless multi-turn, use `previous_response` with " + "`include=['reasoning.encrypted_content']` and `store=false` " + "instead of retrieving responses by ID." + ), + status_code=HTTPStatus.NOT_IMPLEMENTED, + ) + async with self.response_store_lock: response = self.response_store.get(response_id) @@ -1290,6 +1455,35 @@ async def cancel_responses( self, response_id: str, ) -> ErrorResponse | ResponsesResponse: + if not self.enable_store: + # Stateless mode: cancel in-flight tasks by ID only (no stored response). + task = self.background_tasks.get(response_id) + if not task: + # Mimic the stateful path's 404 when no task is found. + return self._make_not_found_error(response_id) + + task.cancel() + try: + await task + except asyncio.CancelledError: + logger.info( + "Background task for %s was cancelled (stateless mode)", + response_id, + ) + + # Cancellation was initiated but we cannot return a full response + # object because no state is stored in stateless mode. + return self.create_error_response( + err_type="invalid_request_error", + message=( + "Response cancellation was initiated for the in-flight task, " + "but a full response object cannot be returned in stateless mode. " + "Enable VLLM_ENABLE_RESPONSES_API_STORE=1 for full cancel " + "support." + ), + status_code=HTTPStatus.BAD_REQUEST, + ) + async with self.response_store_lock: response = self.response_store.get(response_id) if response is None: @@ -1323,6 +1517,63 @@ def _make_not_found_error(self, response_id: str) -> ErrorResponse: param="response_id", ) + def _build_state_carrier( + self, messages: list, request_id: str + ) -> ResponseReasoningItem: + """Build a synthetic ReasoningItem that carries serialized message history. + + The ``encrypted_content`` field holds a signed, base64-encoded JSON blob + of the full message list. On the next turn the client passes the full + response back as ``previous_response``; vLLM calls + ``_extract_state_from_response`` to recover history without a store. + """ + from vllm.entrypoints.openai.responses.state import serialize_state + + return ResponseReasoningItem( + id=f"rs_state_{request_id[-12:]}", + type="reasoning", + summary=[], + status="completed", + encrypted_content=serialize_state(messages), + ) + + def _extract_state_from_response(self, response: ResponsesResponse) -> list | None: + """Extract the serialized message history from a response's state carrier. + + Scans ``response.output`` for a vLLM state-carrier ReasoningItem and + deserializes its ``encrypted_content`` field back into a message list. + + Returns: + The deserialized message list (plain dicts), or ``None`` if no state + carrier is found (e.g. when the previous response was generated + without ``include=['reasoning.encrypted_content']``). + + Raises: + ValueError: If a state carrier is found but its HMAC is invalid. + """ + from vllm.entrypoints.openai.responses.state import ( + deserialize_state, + is_state_carrier, + ) + + for item in response.output: + if is_state_carrier(item): + return deserialize_state(item.encrypted_content) + return None + + def _make_store_not_supported_error(self) -> ErrorResponse: + return self.create_error_response( + err_type="invalid_request_error", + message=( + "`store=True` (default) is not supported. Please set " + "`store=False` in Responses API or set " + "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when " + "starting the vLLM server." + ), + status_code=HTTPStatus.BAD_REQUEST, + param="store", + ) + async def _process_simple_streaming_events( self, request: ResponsesRequest, @@ -1942,6 +2193,7 @@ async def responses_stream_generator( tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, created_time: int | None = None, + messages: list | None = None, ) -> AsyncGenerator[StreamingResponsesResponse, None]: # TODO: # 1. Handle disconnect @@ -2029,6 +2281,7 @@ async def empty_async_generator(): tokenizer, request_metadata, created_time=created_time, + messages=messages, ) yield _increment_sequence_number_and_return( ResponseCompletedEvent( diff --git a/vllm/entrypoints/openai/responses/state.py b/vllm/entrypoints/openai/responses/state.py new file mode 100644 index 000000000000..e98dee2b4748 --- /dev/null +++ b/vllm/entrypoints/openai/responses/state.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Stateless conversation state carrier for the Responses API. + +Serializes conversation message history into the ``encrypted_content`` field of +a synthetic ResponseReasoningItem, enabling multi-turn conversations without +server-side storage. Implements the approach proposed by @grs in +vllm-project/vllm#26934. + +Wire format: ``vllm:1::`` + +Security note: + The payload is **signed**, not encrypted. The HMAC-SHA256 signature + prevents client-side tampering with the serialized history, but the + contents (serialized message dicts) are readable by anyone who holds + the response object. This matches the spirit of the ``encrypted_content`` + field in the OpenAI protocol: an opaque blob the client stores and + returns verbatim. + +Multi-node deployments: + Set ``VLLM_RESPONSES_STATE_SIGNING_KEY`` to the same 64-character hex + string on all nodes. Without it each process generates a random key at + startup, making state carriers incompatible across restarts/replicas. +""" + +import hashlib +import hmac +import json +import os +from typing import Any + +import pybase64 as base64 + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +_FORMAT_VERSION = "vllm:1" +_SIGNING_KEY: bytes | None = None + + +def _get_signing_key() -> bytes: + global _SIGNING_KEY + if _SIGNING_KEY is None: + key_hex = os.environ.get("VLLM_RESPONSES_STATE_SIGNING_KEY", "") + if key_hex: + try: + key_bytes = bytes.fromhex(key_hex) + except ValueError as exc: + raise ValueError( + "VLLM_RESPONSES_STATE_SIGNING_KEY must be a valid hex " + "string (e.g. a 64-char / 32-byte hex key)." + ) from exc + if len(key_bytes) < 32: + raise ValueError( + f"VLLM_RESPONSES_STATE_SIGNING_KEY decoded to only " + f"{len(key_bytes)} byte(s); a minimum of 32 bytes " + f"(64 hex characters) is required for a secure HMAC-SHA256 key." + ) + _SIGNING_KEY = key_bytes + else: + _SIGNING_KEY = os.urandom(32) + logger.warning( + "VLLM_RESPONSES_STATE_SIGNING_KEY is not set. " + "Stateless multi-turn state carriers are valid only for this " + "server instance and will break across restarts or on " + "multi-node deployments. " + "Set VLLM_RESPONSES_STATE_SIGNING_KEY to a shared 64-char " + "hex key to enable multi-node / restart-safe operation." + ) + return _SIGNING_KEY + + +def _harmony_serializer(obj: Any) -> Any: + """JSON serializer for OpenAIHarmonyMessage and similar pydantic models.""" + if hasattr(obj, "to_dict"): + return obj.to_dict() + if hasattr(obj, "model_dump"): + return obj.model_dump() + raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + + +def serialize_state(messages: list[Any]) -> str: + """Serialize *messages* into a signed, base64-encoded state carrier string. + + The returned string is suitable for embedding in + ``ResponseReasoningItem.encrypted_content``. + + Args: + messages: A list of message objects (OpenAIHarmonyMessage instances or + plain ``ChatCompletionMessageParam`` dicts). + + Returns: + A wire-format state carrier string: ``vllm:1::``. + """ + payload_b64 = base64.b64encode( + json.dumps(messages, default=_harmony_serializer).encode() + ).decode() + sig = hmac.new(_get_signing_key(), payload_b64.encode(), hashlib.sha256).hexdigest() + return f"{_FORMAT_VERSION}:{payload_b64}:{sig}" + + +def deserialize_state(encrypted_content: str) -> list[Any] | None: + """Deserialize a state carrier string back into a message list. + + Args: + encrypted_content: The value of ``ResponseReasoningItem.encrypted_content``. + + Returns: + The deserialized message list (as plain dicts), or ``None`` if the + string is not a vLLM state carrier (e.g. a real encrypted_content + from an external model). + + Raises: + ValueError: If the string looks like a vLLM state carrier but the + HMAC signature does not match (indicates tampering). + """ + if not encrypted_content.startswith(f"{_FORMAT_VERSION}:"): + return None + # Expected: "vllm:1::" + # Split into exactly 4 parts on the first 3 colons. + parts = encrypted_content.split(":", 3) + if len(parts) != 4: + raise ValueError( + "Malformed vLLM state carrier: expected " + f"'{_FORMAT_VERSION}::', got {encrypted_content!r}" + ) + _, _, payload_b64, sig = parts + expected = hmac.new( + _get_signing_key(), payload_b64.encode(), hashlib.sha256 + ).hexdigest() + if not hmac.compare_digest(sig, expected): + raise ValueError( + "vLLM state carrier HMAC verification failed. " + "The state may have been tampered with, or was produced by a " + "different server instance (check VLLM_RESPONSES_STATE_SIGNING_KEY)." + ) + return json.loads(base64.b64decode(payload_b64)) + + +def is_state_carrier(item: Any) -> bool: + """Return True if *item* is a vLLM state-carrier ReasoningItem. + + A state carrier is a ``ResponseReasoningItem`` whose ``encrypted_content`` + field starts with the vLLM format prefix. These items are synthetic: + they carry serialized conversation history and should not be sent to the + LLM as part of the chat message list. + + Args: + item: Any object, typically a ``ResponseOutputItem``. + + Returns: + True if the item is a vLLM-generated state carrier. + """ + ec = getattr(item, "encrypted_content", None) + return bool(ec and isinstance(ec, str) and ec.startswith(f"{_FORMAT_VERSION}:")) diff --git a/vllm/entrypoints/openai/responses/utils.py b/vllm/entrypoints/openai/responses/utils.py index 789a0e0b6be6..ee2f94a10603 100644 --- a/vllm/entrypoints/openai/responses/utils.py +++ b/vllm/entrypoints/openai/responses/utils.py @@ -167,14 +167,16 @@ def construct_chat_messages_with_tool_call( if maybe_combined_message is not None: messages[-1] = maybe_combined_message else: - messages.append(_construct_single_message_from_response_item(item)) + result = _construct_single_message_from_response_item(item) + if result is not None: # None means state carrier — skip it + messages.append(result) return messages def _construct_single_message_from_response_item( item: ResponseInputOutputItem, -) -> ChatCompletionMessageParam: +) -> ChatCompletionMessageParam | None: if isinstance(item, ResponseFunctionToolCall): # Append the function call as a tool call. return ChatCompletionAssistantMessageParam( @@ -191,9 +193,18 @@ def _construct_single_message_from_response_item( ], ) elif isinstance(item, ResponseReasoningItem): + from vllm.entrypoints.openai.responses.state import is_state_carrier + + if is_state_carrier(item): + # This is a vLLM state-carrier item: it carries serialized Harmony + # history in encrypted_content and must not be forwarded to the LLM. + return None reasoning = "" if item.encrypted_content: - raise ValueError("Encrypted content is not supported.") + raise ValueError( + "Encrypted content from external providers is not supported. " + "vLLM-generated state carriers are handled transparently." + ) elif item.content and len(item.content) >= 1: reasoning = item.content[0].text elif len(item.summary) >= 1: diff --git a/vllm/envs.py b/vllm/envs.py index 2f93b2cb3e0d..3c95bfad3ad6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -200,6 +200,7 @@ VLLM_LOOPBACK_IP: str = "" VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True VLLM_ENABLE_RESPONSES_API_STORE: bool = False + VLLM_RESPONSES_STATE_SIGNING_KEY: str = "" VLLM_NVFP4_GEMM_BACKEND: str | None = None VLLM_HAS_FLASHINFER_CUBIN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False @@ -1465,6 +1466,12 @@ def _get_or_set_default() -> str: "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool( int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0")) ), + # Hex-encoded 32-byte (64-char) signing key for stateless multi-turn + # state carriers (RFC #26934). If not set, a random key is generated at + # startup (incompatible across restarts / multi-node deployments). + "VLLM_RESPONSES_STATE_SIGNING_KEY": lambda: os.environ.get( + "VLLM_RESPONSES_STATE_SIGNING_KEY", "" + ), # If set, use the fp8 mfma in rocm paged attention. "VLLM_ROCM_FP8_MFMA_PAGE_ATTN": lambda: bool( int(os.getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))