diff --git a/tests/entrypoints/anthropic/test_anthropic_messages_conversion.py b/tests/entrypoints/anthropic/test_anthropic_messages_conversion.py index e3b006c16a97..eb9798980f06 100644 --- a/tests/entrypoints/anthropic/test_anthropic_messages_conversion.py +++ b/tests/entrypoints/anthropic/test_anthropic_messages_conversion.py @@ -4,6 +4,9 @@ Tests the image source handling and tool_result content parsing in AnthropicServingMessages._convert_anthropic_to_openai_request(). + +Also covers extended-thinking edge cases such as ``redacted_thinking`` +blocks echoed back by Anthropic clients. """ from vllm.entrypoints.anthropic.protocol import ( @@ -373,3 +376,262 @@ def test_system_string_unchanged(self): result = _convert(request) system_msg = result.messages[0] assert system_msg["content"] == "You are a helpful assistant." + + +# ====================================================================== +# Thinking block conversion (Anthropic → OpenAI) +# ====================================================================== + + +class TestThinkingBlockConversion: + """Verify that thinking blocks in assistant messages are correctly + moved to the ``reasoning`` field and stripped from ``content`` during + the Anthropic→OpenAI conversion. + + This is the Anthropic-endpoint path: the client echoes back the full + assistant message (including thinking blocks emitted by vllm) in + subsequent requests. + """ + + def test_thinking_plus_text_in_assistant_message(self): + """thinking + text → reasoning field + plain-string content.""" + request = _make_request( + [ + {"role": "user", "content": "Write me some code."}, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "I should write a simple example.", + "signature": "sig_abc123", + }, + {"type": "text", "text": "Sure! Here is the code."}, + ], + }, + {"role": "user", "content": "Can you fix the bug?"}, + ] + ) + result = _convert(request) + + # Find the assistant message in the converted output. + asst_msgs = [m for m in result.messages if m.get("role") == "assistant"] + assert len(asst_msgs) == 1 + asst = asst_msgs[0] + + # Thinking content must be in reasoning, NOT in content. + assert asst.get("reasoning") == "I should write a simple example." + assert asst.get("content") == "Sure! Here is the code." + + def test_thinking_only_in_assistant_message(self): + """Assistant message with only a thinking block (no visible text). + + This can happen when the model emits reasoning but no final answer + yet (e.g. a mid-turn reasoning step). Content should be None. + """ + request = _make_request( + [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "Just thinking...", + "signature": "sig_xyz", + } + ], + }, + {"role": "user", "content": "Go on."}, + ] + ) + result = _convert(request) + + asst_msgs = [m for m in result.messages if m.get("role") == "assistant"] + assert len(asst_msgs) == 1 + asst = asst_msgs[0] + + assert asst.get("reasoning") == "Just thinking..." + # No visible text → content should be absent or None. + assert asst.get("content") is None + + def test_thinking_plus_tool_use_in_assistant_message(self): + """thinking + tool_use: reasoning field set, tool_calls populated.""" + request = _make_request( + [ + {"role": "user", "content": "What is 2+2?"}, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "I need to call the calculator.", + "signature": "sig_tool", + }, + { + "type": "tool_use", + "id": "call_001", + "name": "calculator", + "input": {"expression": "2+2"}, + }, + ], + }, + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_001", + "content": "4", + } + ], + }, + ] + ) + result = _convert(request) + + asst_msgs = [m for m in result.messages if m.get("role") == "assistant"] + assert len(asst_msgs) == 1 + asst = asst_msgs[0] + + assert asst.get("reasoning") == "I need to call the calculator." + tool_calls = list(asst.get("tool_calls", [])) + assert len(tool_calls) == 1 + assert tool_calls[0]["function"]["name"] == "calculator" + # No text content alongside reasoning + tool_use. + assert asst.get("content") is None + + def test_multiple_thinking_blocks_concatenated(self): + """Multiple thinking blocks should be joined in order.""" + request = _make_request( + [ + {"role": "user", "content": "Think hard."}, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "First thought. ", + "signature": "s1", + }, + { + "type": "thinking", + "thinking": "Second thought.", + "signature": "s2", + }, + {"type": "text", "text": "Done."}, + ], + }, + ] + ) + result = _convert(request) + + asst_msgs = [m for m in result.messages if m.get("role") == "assistant"] + assert len(asst_msgs) == 1 + asst = asst_msgs[0] + + assert asst.get("reasoning") == "First thought. Second thought." + assert asst.get("content") == "Done." + + def test_no_thinking_blocks_unchanged(self): + """Messages without thinking blocks must not be modified.""" + request = _make_request( + [ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello!"}, + ] + ) + result = _convert(request) + + asst_msgs = [m for m in result.messages if m.get("role") == "assistant"] + assert len(asst_msgs) == 1 + asst = asst_msgs[0] + + assert asst.get("content") == "Hello!" + assert "reasoning" not in asst + + def test_multi_turn_with_thinking_blocks(self): + """Full multi-turn conversation: previous assistant messages that + include thinking blocks must all be converted without a 400 error. + + This is the primary regression scenario from the bug report: + upgrading vllm from v0.15.1 → v0.17.0 introduced thinking-block + support in responses, but echoing those responses back in subsequent + requests caused a Pydantic validation failure. + """ + request = _make_request( + [ + {"role": "user", "content": "Turn 1 question"}, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "Reasoning for turn 1.", + "signature": "s_t1", + }, + {"type": "text", "text": "Answer for turn 1."}, + ], + }, + {"role": "user", "content": "Turn 2 question"}, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "Reasoning for turn 2.", + "signature": "s_t2", + }, + {"type": "text", "text": "Answer for turn 2."}, + ], + }, + {"role": "user", "content": "Turn 3 question"}, + ] + ) + # Must not raise a ValidationError / 400. + result = _convert(request) + + asst_msgs = [m for m in result.messages if m.get("role") == "assistant"] + assert len(asst_msgs) == 2 + + assert asst_msgs[0].get("reasoning") == "Reasoning for turn 1." + assert asst_msgs[0].get("content") == "Answer for turn 1." + assert asst_msgs[1].get("reasoning") == "Reasoning for turn 2." + assert asst_msgs[1].get("content") == "Answer for turn 2." + + def test_redacted_thinking_block_is_accepted(self): + """Anthropic clients may echo back redacted thinking blocks. + + vLLM should accept these blocks (to avoid 400 validation errors) + and ignore them when constructing the OpenAI-format prompt. + """ + request = _make_request( + [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "Thinking...", + "signature": "sig_think", + }, + { + "type": "redacted_thinking", + "data": "BASE64_OR_OTHER_OPAQUE_DATA", + }, + {"type": "text", "text": "Hi!"}, + ], + }, + {"role": "user", "content": "Continue"}, + ] + ) + result = _convert(request) + + asst_msgs = [m for m in result.messages if m.get("role") == "assistant"] + assert len(asst_msgs) == 1 + asst = asst_msgs[0] + + # Redacted thinking is ignored, normal thinking still becomes reasoning. + assert asst.get("reasoning") == "Thinking..." + assert asst.get("content") == "Hi!" diff --git a/vllm/entrypoints/anthropic/protocol.py b/vllm/entrypoints/anthropic/protocol.py index c541db5139d3..ab3ca66e2cd0 100644 --- a/vllm/entrypoints/anthropic/protocol.py +++ b/vllm/entrypoints/anthropic/protocol.py @@ -34,7 +34,14 @@ class AnthropicUsage(BaseModel): class AnthropicContentBlock(BaseModel): """Content block in message""" - type: Literal["text", "image", "tool_use", "tool_result", "thinking"] + type: Literal[ + "text", + "image", + "tool_use", + "tool_result", + "thinking", + "redacted_thinking", + ] text: str | None = None # For image content source: dict[str, Any] | None = None @@ -48,6 +55,8 @@ class AnthropicContentBlock(BaseModel): # For thinking content thinking: str | None = None signature: str | None = None + # For redacted thinking content (safety-filtered by the API) + data: str | None = None class AnthropicMessage(BaseModel): diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py index f301ed499f86..8fbe2c405e7e 100644 --- a/vllm/entrypoints/anthropic/serving.py +++ b/vllm/entrypoints/anthropic/serving.py @@ -224,6 +224,12 @@ def _convert_block( content_parts.append({"type": "image_url", "image_url": {"url": image_url}}) elif block.type == "thinking" and block.thinking is not None: reasoning_parts.append(block.thinking) + elif block.type == "redacted_thinking": + # Redacted thinking blocks contain safety-filtered reasoning. + # We skip them as the content is opaque (base64 'data' field), + # but accepting the block prevents a validation error when the + # client echoes back the full assistant message. + pass elif block.type == "tool_use": cls._convert_tool_use_block(block, tool_calls) elif block.type == "tool_result":