From e680cbfec0bdc8e4daa2c81c645f79ad9dbcf13b Mon Sep 17 00:00:00 2001
From: timon0305 <timon0305@outlook.com>
Date: Sun, 22 Feb 2026 01:06:29 +0100
Subject: [PATCH] feat: add reasoning/thinking support to Anthropic
 /v1/messages endpoint

Signed-off-by: timon0305 <timon0305@outlook.com>
---
 .../openai/test_anthropic_reasoning.py        | 524 ++++++++++++++++++
 vllm/entrypoints/anthropic/protocol.py        |  22 +-
 vllm/entrypoints/anthropic/serving.py         | 120 +++-
 3 files changed, 651 insertions(+), 15 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_anthropic_reasoning.py

diff --git a/tests/entrypoints/openai/test_anthropic_reasoning.py b/tests/entrypoints/openai/test_anthropic_reasoning.py
new file mode 100644
index 000000000000..df9cc044c93f
--- /dev/null
+++ b/tests/entrypoints/openai/test_anthropic_reasoning.py
@@ -0,0 +1,524 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for Anthropic Messages API reasoning/thinking support."""
+
+import json
+
+import pytest
+
+from vllm.entrypoints.anthropic.protocol import (
+    AnthropicContentBlock,
+    AnthropicDelta,
+    AnthropicMessagesRequest,
+    AnthropicStreamEvent,
+    AnthropicThinkingConfig,
+)
+from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionStreamResponse,
+    ChatCompletionStreamResponseChoice,
+    ChatMessage,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+
+# -- Protocol model tests --------------------------------------------------
+
+
+class TestAnthropicThinkingConfig:
+    def test_enabled_requires_budget_tokens(self):
+        with pytest.raises(ValueError, match="budget_tokens is required"):
+            AnthropicThinkingConfig(type="enabled")
+
+    def test_enabled_with_budget_tokens(self):
+        cfg = AnthropicThinkingConfig(type="enabled", budget_tokens=1024)
+        assert cfg.type == "enabled"
+        assert cfg.budget_tokens == 1024
+
+    def test_disabled(self):
+        cfg = AnthropicThinkingConfig(type="disabled")
+        assert cfg.type == "disabled"
+        assert cfg.budget_tokens is None
+
+
+class TestAnthropicContentBlockThinking:
+    def test_thinking_type(self):
+        block = AnthropicContentBlock(
+            type="thinking", thinking="Let me reason about this..."
+        )
+        assert block.type == "thinking"
+        assert block.thinking == "Let me reason about this..."
+        assert block.text is None
+
+
+class TestAnthropicDeltaThinking:
+    def test_thinking_delta_type(self):
+        delta = AnthropicDelta(type="thinking_delta", thinking="partial reasoning")
+        assert delta.type == "thinking_delta"
+        assert delta.thinking == "partial reasoning"
+        assert delta.text is None
+
+
+class TestAnthropicMessagesRequestThinking:
+    def test_request_with_thinking(self):
+        req = AnthropicMessagesRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            max_tokens=1024,
+            thinking={"type": "enabled", "budget_tokens": 2048},
+        )
+        assert req.thinking is not None
+        assert req.thinking.type == "enabled"
+        assert req.thinking.budget_tokens == 2048
+
+    def test_request_without_thinking(self):
+        req = AnthropicMessagesRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            max_tokens=1024,
+        )
+        assert req.thinking is None
+
+
+# -- Request conversion tests ----------------------------------------------
+
+
+class TestConvertAnthropicToOpenAIRequest:
+    """Test _convert_anthropic_to_openai_request reasoning handling."""
+
+    def _make_handler(self):
+        """Create a minimal AnthropicServingMessages for testing conversion."""
+        # We only need the conversion method, not the full server.
+        # Use __new__ to bypass __init__ which requires engine_client etc.
+        handler = object.__new__(AnthropicServingMessages)
+        handler.stop_reason_map = {
+            "stop": "end_turn",
+            "length": "max_tokens",
+            "tool_calls": "tool_use",
+        }
+        return handler
+
+    def test_thinking_enabled_sets_include_reasoning(self):
+        handler = self._make_handler()
+        req = AnthropicMessagesRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            max_tokens=1024,
+            thinking={"type": "enabled", "budget_tokens": 2048},
+        )
+        openai_req = handler._convert_anthropic_to_openai_request(req)
+        assert openai_req.include_reasoning is True
+
+    def test_thinking_disabled_sets_include_reasoning_false(self):
+        handler = self._make_handler()
+        req = AnthropicMessagesRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            max_tokens=1024,
+            thinking={"type": "disabled"},
+        )
+        openai_req = handler._convert_anthropic_to_openai_request(req)
+        assert openai_req.include_reasoning is False
+
+    def test_no_thinking_sets_include_reasoning_false(self):
+        handler = self._make_handler()
+        req = AnthropicMessagesRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            max_tokens=1024,
+        )
+        openai_req = handler._convert_anthropic_to_openai_request(req)
+        assert openai_req.include_reasoning is False
+
+    def test_thinking_blocks_in_messages_converted(self):
+        handler = self._make_handler()
+        req = AnthropicMessagesRequest(
+            model="test-model",
+            messages=[
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "thinking",
+                            "thinking": "I need to think about this...",
+                        },
+                        {"type": "text", "text": "Here is my answer."},
+                    ],
+                },
+                {"role": "user", "content": "follow up question"},
+            ],
+            max_tokens=1024,
+        )
+        openai_req = handler._convert_anthropic_to_openai_request(req)
+        # The assistant message should contain both thinking (as text) and
+        # the actual text content
+        assistant_msg = openai_req.messages[0]
+        assert assistant_msg["role"] == "assistant"
+        content = assistant_msg["content"]
+        assert isinstance(content, list)
+        assert len(content) == 2
+        assert content[0]["text"] == "I need to think about this..."
+        assert content[1]["text"] == "Here is my answer."
+
+
+# -- Non-streaming response converter tests ---------------------------------
+
+
+class TestMessagesFullConverterReasoning:
+    def _make_handler(self):
+        handler = object.__new__(AnthropicServingMessages)
+        handler.stop_reason_map = {
+            "stop": "end_turn",
+            "length": "max_tokens",
+            "tool_calls": "tool_use",
+        }
+        return handler
+
+    def test_response_with_reasoning(self):
+        handler = self._make_handler()
+        response = ChatCompletionResponse(
+            id="chatcmpl-123",
+            model="test-model",
+            choices=[
+                ChatCompletionResponseChoice(
+                    index=0,
+                    message=ChatMessage(
+                        role="assistant",
+                        content="The answer is 42.",
+                        reasoning="Let me think step by step...",
+                    ),
+                    finish_reason="stop",
+                )
+            ],
+            usage=UsageInfo(prompt_tokens=10, completion_tokens=20, total_tokens=30),
+        )
+        result = handler.messages_full_converter(response)
+
+        # Should have thinking block before text block
+        assert len(result.content) == 2
+        assert result.content[0].type == "thinking"
+        assert result.content[0].thinking == "Let me think step by step..."
+        assert result.content[1].type == "text"
+        assert result.content[1].text == "The answer is 42."
+
+    def test_response_without_reasoning(self):
+        handler = self._make_handler()
+        response = ChatCompletionResponse(
+            id="chatcmpl-123",
+            model="test-model",
+            choices=[
+                ChatCompletionResponseChoice(
+                    index=0,
+                    message=ChatMessage(
+                        role="assistant",
+                        content="The answer is 42.",
+                    ),
+                    finish_reason="stop",
+                )
+            ],
+            usage=UsageInfo(prompt_tokens=10, completion_tokens=20, total_tokens=30),
+        )
+        result = handler.messages_full_converter(response)
+
+        # Should have only text block
+        assert len(result.content) == 1
+        assert result.content[0].type == "text"
+        assert result.content[0].text == "The answer is 42."
+
+    def test_response_with_reasoning_and_tool_calls(self):
+        handler = self._make_handler()
+        response = ChatCompletionResponse(
+            id="chatcmpl-123",
+            model="test-model",
+            choices=[
+                ChatCompletionResponseChoice(
+                    index=0,
+                    message=ChatMessage(
+                        role="assistant",
+                        content="",
+                        reasoning="I should call the weather tool.",
+                        tool_calls=[
+                            {
+                                "id": "call_abc",
+                                "type": "function",
+                                "function": {
+                                    "name": "get_weather",
+                                    "arguments": json.dumps({"city": "NYC"}),
+                                },
+                            }
+                        ],
+                    ),
+                    finish_reason="tool_calls",
+                )
+            ],
+            usage=UsageInfo(prompt_tokens=10, completion_tokens=20, total_tokens=30),
+        )
+        result = handler.messages_full_converter(response)
+
+        assert result.stop_reason == "tool_use"
+        assert len(result.content) == 3
+        assert result.content[0].type == "thinking"
+        assert result.content[0].thinking == "I should call the weather tool."
+        assert result.content[1].type == "text"
+        assert result.content[2].type == "tool_use"
+        assert result.content[2].name == "get_weather"
+
+
+# -- Streaming response converter tests -------------------------------------
+
+
+class TestMessageStreamConverterReasoning:
+    def _make_handler(self):
+        handler = object.__new__(AnthropicServingMessages)
+        handler.stop_reason_map = {
+            "stop": "end_turn",
+            "length": "max_tokens",
+            "tool_calls": "tool_use",
+        }
+        return handler
+
+    async def _collect_events(self, handler, chunks):
+        """Helper to run stream converter and collect parsed events."""
+
+        async def mock_generator():
+            for chunk_obj in chunks:
+                yield f"data: {chunk_obj.model_dump_json()}\n\n"
+            yield "data: [DONE]\n\n"
+
+        events = []
+        async for event_str in handler.message_stream_converter(mock_generator()):
+            if event_str.startswith("event:"):
+                lines = event_str.strip().split("\n")
+                event_type = lines[0].split(": ", 1)[1]
+                data_json = lines[1].split(": ", 1)[1]
+                events.append((event_type, json.loads(data_json)))
+            elif event_str == "data: [DONE]\n\n":
+                events.append(("done", None))
+        return events
+
+    @pytest.mark.asyncio
+    async def test_stream_with_reasoning_then_text(self):
+        handler = self._make_handler()
+
+        chunks = [
+            # First chunk: message_start
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[],
+                usage=UsageInfo(prompt_tokens=10, completion_tokens=0, total_tokens=10),
+            ),
+            # Reasoning delta
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(reasoning="Let me think..."),
+                    )
+                ],
+            ),
+            # More reasoning
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(reasoning=" step by step."),
+                    )
+                ],
+            ),
+            # Text content starts
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(content="The answer is "),
+                    )
+                ],
+            ),
+            # More text
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(content="42."),
+                    )
+                ],
+            ),
+            # Finish reason
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(),
+                        finish_reason="stop",
+                    )
+                ],
+            ),
+            # Usage chunk (empty choices)
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[],
+                usage=UsageInfo(
+                    prompt_tokens=10, completion_tokens=20, total_tokens=30
+                ),
+            ),
+        ]
+
+        events = await self._collect_events(handler, chunks)
+        event_types = [e[0] for e in events]
+
+        # Verify event sequence
+        assert "message_start" in event_types
+
+        # Find thinking block events
+        thinking_start = None
+        thinking_deltas = []
+        text_start = None
+        text_deltas = []
+        block_stops = 0
+
+        for event_type, data in events:
+            if data is None:
+                continue
+            if event_type == "content_block_start":
+                cb = data.get("content_block", {})
+                if cb.get("type") == "thinking":
+                    thinking_start = data
+                elif cb.get("type") == "text":
+                    text_start = data
+            elif event_type == "content_block_delta":
+                delta = data.get("delta", {})
+                if delta.get("type") == "thinking_delta":
+                    thinking_deltas.append(delta)
+                elif delta.get("type") == "text_delta":
+                    text_deltas.append(delta)
+            elif event_type == "content_block_stop":
+                block_stops += 1
+
+        # Thinking block at index 0
+        assert thinking_start is not None
+        assert thinking_start["index"] == 0
+        assert len(thinking_deltas) == 2
+        assert thinking_deltas[0]["thinking"] == "Let me think..."
+        assert thinking_deltas[1]["thinking"] == " step by step."
+
+        # Text block at index 1
+        assert text_start is not None
+        assert text_start["index"] == 1
+        assert len(text_deltas) == 2
+        assert text_deltas[0]["text"] == "The answer is "
+        assert text_deltas[1]["text"] == "42."
+
+        # Two block stops (thinking + text)
+        assert block_stops == 2
+
+    @pytest.mark.asyncio
+    async def test_stream_without_reasoning(self):
+        handler = self._make_handler()
+
+        chunks = [
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[],
+                usage=UsageInfo(prompt_tokens=10, completion_tokens=0, total_tokens=10),
+            ),
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(content="Hello!"),
+                    )
+                ],
+            ),
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=0,
+                        delta=DeltaMessage(),
+                        finish_reason="stop",
+                    )
+                ],
+            ),
+            ChatCompletionStreamResponse(
+                id="chatcmpl-1",
+                model="test-model",
+                choices=[],
+                usage=UsageInfo(prompt_tokens=10, completion_tokens=5, total_tokens=15),
+            ),
+        ]
+
+        events = await self._collect_events(handler, chunks)
+
+        # No thinking events should be present
+        for event_type, data in events:
+            if data is None:
+                continue
+            if event_type == "content_block_start":
+                cb = data.get("content_block", {})
+                assert cb.get("type") != "thinking"
+            if event_type == "content_block_delta":
+                delta = data.get("delta", {})
+                assert delta.get("type") != "thinking_delta"
+
+
+# -- Serialization round-trip tests -----------------------------------------
+
+
+class TestThinkingContentBlockSerialization:
+    def test_thinking_block_serialized_correctly(self):
+        block = AnthropicContentBlock(type="thinking", thinking="reasoning text")
+        data = block.model_dump(exclude_none=True)
+        assert data == {"type": "thinking", "thinking": "reasoning text"}
+
+    def test_thinking_delta_serialized_correctly(self):
+        delta = AnthropicDelta(type="thinking_delta", thinking="partial reasoning")
+        data = delta.model_dump(exclude_none=True)
+        assert data == {
+            "type": "thinking_delta",
+            "thinking": "partial reasoning",
+        }
+
+    def test_stream_event_with_thinking_delta(self):
+        event = AnthropicStreamEvent(
+            type="content_block_delta",
+            index=0,
+            delta=AnthropicDelta(type="thinking_delta", thinking="some reasoning"),
+        )
+        data = json.loads(event.model_dump_json(exclude_unset=True))
+        assert data["type"] == "content_block_delta"
+        assert data["delta"]["type"] == "thinking_delta"
+        assert data["delta"]["thinking"] == "some reasoning"
+
+    def test_thinking_config_enabled_serialized(self):
+        cfg = AnthropicThinkingConfig(type="enabled", budget_tokens=4096)
+        data = cfg.model_dump()
+        assert data == {"type": "enabled", "budget_tokens": 4096}
+
+    def test_request_with_thinking_serialized(self):
+        req = AnthropicMessagesRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            max_tokens=1024,
+            thinking={"type": "enabled", "budget_tokens": 2048},
+        )
+        data = req.model_dump()
+        assert data["thinking"]["type"] == "enabled"
+        assert data["thinking"]["budget_tokens"] == 2048
diff --git a/vllm/entrypoints/anthropic/protocol.py b/vllm/entrypoints/anthropic/protocol.py
index af9430e78475..7223ef85e653 100644
--- a/vllm/entrypoints/anthropic/protocol.py
+++ b/vllm/entrypoints/anthropic/protocol.py
@@ -34,8 +34,9 @@ class AnthropicUsage(BaseModel):
 class AnthropicContentBlock(BaseModel):
     """Content block in message"""
 
-    type: Literal["text", "image", "tool_use", "tool_result"]
+    type: Literal["text", "image", "tool_use", "tool_result", "thinking"]
     text: str | None = None
+    thinking: str | None = None
     # For image content
     source: dict[str, Any] | None = None
     # For tool use/result
@@ -84,6 +85,21 @@ def validate_name_required_for_tool(self) -> "AnthropicToolChoice":
         return self
 
 
+class AnthropicThinkingConfig(BaseModel):
+    """Thinking/reasoning configuration for extended thinking"""
+
+    type: Literal["enabled", "disabled"]
+    budget_tokens: int | None = None
+
+    @model_validator(mode="after")
+    def validate_budget_tokens(self) -> "AnthropicThinkingConfig":
+        if self.type == "enabled" and self.budget_tokens is None:
+            raise ValueError(
+                "budget_tokens is required when thinking type is 'enabled'"
+            )
+        return self
+
+
 class AnthropicMessagesRequest(BaseModel):
     """Anthropic Messages API request"""
 
@@ -95,6 +111,7 @@ class AnthropicMessagesRequest(BaseModel):
     stream: bool | None = False
     system: str | list[AnthropicContentBlock] | None = None
     temperature: float | None = None
+    thinking: AnthropicThinkingConfig | None = None
     tool_choice: AnthropicToolChoice | None = None
     tools: list[AnthropicTool] | None = None
     top_k: int | None = None
@@ -118,8 +135,9 @@ def validate_max_tokens(cls, v):
 class AnthropicDelta(BaseModel):
     """Delta for streaming responses"""
 
-    type: Literal["text_delta", "input_json_delta"] | None = None
+    type: Literal["text_delta", "input_json_delta", "thinking_delta"] | None = None
     text: str | None = None
+    thinking: str | None = None
     partial_json: str | None = None
 
     # Message delta
diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index 8fb347aabed3..8ed04fe83b30 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -123,6 +123,11 @@ def _convert_anthropic_to_openai_request(
                                 "image_url": {"url": block.source.get("data", "")},
                             }
                         )
+                    elif block.type == "thinking" and block.thinking:
+                        # Thinking blocks from prior assistant turns are
+                        # passed through as plain text so the model can see
+                        # its own earlier reasoning.
+                        content_parts.append({"type": "text", "text": block.thinking})
                     elif block.type == "tool_use":
                         # Convert tool use to function call format
                         tool_call = {
@@ -183,6 +188,15 @@ def _convert_anthropic_to_openai_request(
             top_k=anthropic_request.top_k,
         )
 
+        # Enable reasoning output when thinking is configured
+        if (
+            anthropic_request.thinking is not None
+            and anthropic_request.thinking.type == "enabled"
+        ):
+            req.include_reasoning = True
+        else:
+            req.include_reasoning = False
+
         if anthropic_request.stream:
             req.stream = anthropic_request.stream
             req.stream_options = StreamOptions.validate(
@@ -270,23 +284,33 @@ def messages_full_converter(
         elif generator.choices[0].finish_reason == "tool_calls":
             result.stop_reason = "tool_use"
 
-        content: list[AnthropicContentBlock] = [
+        content: list[AnthropicContentBlock] = []
+
+        # Add thinking block if reasoning content is present
+        message = generator.choices[0].message
+        if message.reasoning:
+            content.append(
+                AnthropicContentBlock(
+                    type="thinking",
+                    thinking=message.reasoning,
+                )
+            )
+
+        content.append(
             AnthropicContentBlock(
                 type="text",
-                text=generator.choices[0].message.content
-                if generator.choices[0].message.content
-                else "",
+                text=message.content if message.content else "",
             )
-        ]
+        )
 
-        for tool_call in generator.choices[0].message.tool_calls:
+        for tool_call in message.tool_calls:
             anthropic_tool_call = AnthropicContentBlock(
                 type="tool_use",
                 id=tool_call.id,
                 name=tool_call.function.name,
                 input=json.loads(tool_call.function.arguments),
             )
-            content += [anthropic_tool_call]
+            content.append(anthropic_tool_call)
 
         result.content = content
 
@@ -301,6 +325,9 @@ async def message_stream_converter(
             finish_reason = None
             content_block_index = 0
             content_block_started = False
+            # Track current block type so we know when to close a
+            # thinking block before opening a text block.
+            current_block_type: str | None = None
 
             async for item in generator:
                 if item.startswith("data:"):
@@ -371,9 +398,74 @@ async def message_stream_converter(
                             finish_reason = origin_chunk.choices[0].finish_reason
                             continue
 
+                        delta = origin_chunk.choices[0].delta
+
+                        # reasoning / thinking
+                        if delta.reasoning is not None:
+                            if (
+                                not content_block_started
+                                or current_block_type != "thinking"
+                            ):
+                                # Close any prior block before opening thinking
+                                if content_block_started:
+                                    stop_chunk = AnthropicStreamEvent(
+                                        index=content_block_index,
+                                        type="content_block_stop",
+                                    )
+                                    data = stop_chunk.model_dump_json(
+                                        exclude_unset=True
+                                    )
+                                    yield wrap_data_with_event(
+                                        data, "content_block_stop"
+                                    )
+                                    content_block_index += 1
+
+                                chunk = AnthropicStreamEvent(
+                                    index=content_block_index,
+                                    type="content_block_start",
+                                    content_block=AnthropicContentBlock(
+                                        type="thinking", thinking=""
+                                    ),
+                                )
+                                data = chunk.model_dump_json(exclude_unset=True)
+                                yield wrap_data_with_event(data, "content_block_start")
+                                content_block_started = True
+                                current_block_type = "thinking"
+
+                            if delta.reasoning != "":
+                                chunk = AnthropicStreamEvent(
+                                    index=content_block_index,
+                                    type="content_block_delta",
+                                    delta=AnthropicDelta(
+                                        type="thinking_delta",
+                                        thinking=delta.reasoning,
+                                    ),
+                                )
+                                data = chunk.model_dump_json(exclude_unset=True)
+                                yield wrap_data_with_event(data, "content_block_delta")
+                            continue
+
                         # content
-                        if origin_chunk.choices[0].delta.content is not None:
-                            if not content_block_started:
+                        if delta.content is not None:
+                            if (
+                                not content_block_started
+                                or current_block_type != "text"
+                            ):
+                                # Close any prior block (e.g. thinking) before
+                                # opening the text block
+                                if content_block_started:
+                                    stop_chunk = AnthropicStreamEvent(
+                                        index=content_block_index,
+                                        type="content_block_stop",
+                                    )
+                                    data = stop_chunk.model_dump_json(
+                                        exclude_unset=True
+                                    )
+                                    yield wrap_data_with_event(
+                                        data, "content_block_stop"
+                                    )
+                                    content_block_index += 1
+
                                 chunk = AnthropicStreamEvent(
                                     index=content_block_index,
                                     type="content_block_start",
@@ -384,15 +476,16 @@ async def message_stream_converter(
                                 data = chunk.model_dump_json(exclude_unset=True)
                                 yield wrap_data_with_event(data, "content_block_start")
                                 content_block_started = True
+                                current_block_type = "text"
 
-                            if origin_chunk.choices[0].delta.content == "":
+                            if delta.content == "":
                                 continue
                             chunk = AnthropicStreamEvent(
                                 index=content_block_index,
                                 type="content_block_delta",
                                 delta=AnthropicDelta(
                                     type="text_delta",
-                                    text=origin_chunk.choices[0].delta.content,
+                                    text=delta.content,
                                 ),
                             )
                             data = chunk.model_dump_json(exclude_unset=True)
@@ -400,8 +493,8 @@ async def message_stream_converter(
                             continue
 
                         # tool calls
-                        elif len(origin_chunk.choices[0].delta.tool_calls) > 0:
-                            tool_call = origin_chunk.choices[0].delta.tool_calls[0]
+                        elif len(delta.tool_calls) > 0:
+                            tool_call = delta.tool_calls[0]
                             if tool_call.id is not None:
                                 if content_block_started:
                                     stop_chunk = AnthropicStreamEvent(
@@ -432,6 +525,7 @@ async def message_stream_converter(
                                 data = chunk.model_dump_json(exclude_unset=True)
                                 yield wrap_data_with_event(data, "content_block_start")
                                 content_block_started = True
+                                current_block_type = "tool_use"
 
                             else:
                                 chunk = AnthropicStreamEvent(