Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 60 additions & 2 deletions src/fastmcp/client/sampling/handlers/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from collections.abc import Iterator, Sequence
from typing import Any

from mcp.types import CreateMessageRequestParams as SamplingParams
from mcp.types import (
AudioContent,
CreateMessageResult,
CreateMessageResultWithTools,
ImageContent,
ModelPreferences,
SamplingMessage,
SamplingMessageContentBlock,
Expand All @@ -17,10 +18,13 @@
ToolResultContent,
ToolUseContent,
)
from mcp.types import CreateMessageRequestParams as SamplingParams

try:
from anthropic import AsyncAnthropic
from anthropic.types import (
Base64ImageSourceParam,
ImageBlockParam,
Message,
MessageParam,
TextBlock,
Expand All @@ -42,6 +46,28 @@

__all__ = ["AnthropicSamplingHandler"]

# Image MIME types accepted by the Anthropic Messages API.
_ANTHROPIC_IMAGE_MEDIA_TYPES = frozenset(
    {"image/jpeg", "image/png", "image/gif", "image/webp"}
)


def _image_content_to_anthropic_block(content: ImageContent) -> ImageBlockParam:
    """Build an Anthropic ``ImageBlockParam`` from MCP ``ImageContent``.

    Raises:
        ValueError: If the content's MIME type is not one Anthropic accepts.
    """
    media_type = content.mimeType
    if media_type not in _ANTHROPIC_IMAGE_MEDIA_TYPES:
        supported = ", ".join(sorted(_ANTHROPIC_IMAGE_MEDIA_TYPES))
        raise ValueError(
            f"Unsupported image MIME type for Anthropic: {media_type!r}. "
            f"Supported types: {supported}"
        )
    source = Base64ImageSourceParam(
        type="base64",
        media_type=media_type,  # type: ignore[arg-type]
        data=content.data,
    )
    return ImageBlockParam(type="image", source=source)


class AnthropicSamplingHandler:
"""Sampling handler that uses the Anthropic API.
Expand Down Expand Up @@ -155,7 +181,10 @@ def _convert_to_anthropic_messages(
# Handle list content (from CreateMessageResultWithTools)
if isinstance(content, list):
content_blocks: list[
TextBlockParam | ToolUseBlockParam | ToolResultBlockParam
TextBlockParam
| ImageBlockParam
| ToolUseBlockParam
| ToolResultBlockParam
] = []

for item in content:
Expand All @@ -172,6 +201,17 @@ def _convert_to_anthropic_messages(
content_blocks.append(
TextBlockParam(type="text", text=item.text)
)
elif isinstance(item, ImageContent):
if message.role != "user":
raise ValueError(
"ImageContent is only supported in user messages "
"for Anthropic"
Comment on lines +204 to +208
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve images inside Anthropic tool_result blocks

Anthropic's tool-use API allows tool_result.content to contain nested text or image blocks, but _convert_to_anthropic_messages() still serializes ToolResultContent.content by collecting only TextContent. If a FastMCP tool returns content=[ImageContent(...)], Claude will receive an empty tool result instead of the image it requested. Since this commit adds ImageContent support for user messages, the same media needs to be forwarded (or explicitly rejected) in ToolResultContent as well.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same — pre-existing, unchanged by this PR. Anthropic does support images in tool_result blocks so it's a valid follow-up.

)
content_blocks.append(_image_content_to_anthropic_block(item))
elif isinstance(item, AudioContent):
raise ValueError(
"AudioContent is not supported by the Anthropic API"
)
elif isinstance(item, ToolResultContent):
# Extract text content from the result
result_content: str | list[TextBlockParam] = ""
Expand Down Expand Up @@ -262,6 +302,24 @@ def _convert_to_anthropic_messages(
)
continue

# Handle ImageContent
if isinstance(content, ImageContent):
if message.role != "user":
raise ValueError(
"ImageContent is only supported in user messages for Anthropic"
)
anthropic_messages.append(
MessageParam(
role="user",
content=[_image_content_to_anthropic_block(content)],
Comment on lines +306 to +314
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Reject assistant-side image blocks before calling Anthropic

_convert_to_anthropic_messages() now forwards ImageContent with role=message.role, but Anthropic's Messages API only accepts image blocks in user turns; assistant turns are limited to model text/tool-use content. If a sampling history includes an assistant ImageContent (or a list-content assistant turn with an image), this code will build an invalid payload and the next request fails with a provider-side 400 instead of a local validation error.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch — fixed in 2298e76. Both single-content and list-content paths now validate that ImageContent is only in user messages.

)
)
continue

# Handle AudioContent - not supported by Anthropic
if isinstance(content, AudioContent):
raise ValueError("AudioContent is not supported by the Anthropic API")

raise ValueError(f"Unsupported content type: {type(content)}")

return anthropic_messages
Expand Down
18 changes: 18 additions & 0 deletions src/fastmcp/client/sampling/handlers/google_genai.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Google GenAI sampling handler with tool support for FastMCP 3.0."""

import base64
from collections.abc import Sequence
from uuid import uuid4

try:
from google.genai import Client as GoogleGenaiClient
from google.genai.types import (
Blob,
Candidate,
Content,
FunctionCall,
Expand Down Expand Up @@ -197,6 +199,22 @@ def _sampling_content_to_google_genai_part(
if isinstance(content, TextContent):
return Part(text=content.text)

if isinstance(content, ImageContent):
return Part(
inline_data=Blob(
data=base64.b64decode(content.data),
mime_type=content.mimeType,
Comment on lines +202 to +206
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Validate Gemini media MIME types before building inline_data

Gemini only accepts a fixed allowlist of image and audio MIME types, but this new conversion path passes content.mimeType straight through without any checks (the audio branch immediately below does the same). Inputs like image/bmp or common mobile recordings such as audio/m4a will now get all the way to generate_content() and fail with provider-side 400s instead of the early, clear ValueError the other handlers return for unsupported media.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gemini supports a much wider range of media types than OpenAI/Anthropic — a restrictive allowlist would reject valid inputs. The API validates and returns a clear error for unsupported types.

Comment on lines +202 to +206
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Handle media blocks in Gemini function responses too

The new ImageContent/AudioContent branches only cover top-level message parts. _sampling_content_to_google_genai_part() still raises from its ToolResultContent branch on any non-TextContent, so a tool returning ImageContent or AudioContent will still fail during sampling. Gemini's FunctionResponsePart supports inlineData blobs, so this leaves the new multimodal support incomplete for the common tool-result path.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same — pre-existing, unchanged by this PR. Multimodal tool results are a valid follow-up.

)
)

if isinstance(content, AudioContent):
return Part(
inline_data=Blob(
data=base64.b64decode(content.data),
mime_type=content.mimeType,
)
)

if isinstance(content, ToolUseContent):
# Note: thought_signature bypass is required for manually constructed tool calls.
# Google's Gemini 3+ models enforce thought signature validation for function calls.
Expand Down
126 changes: 114 additions & 12 deletions src/fastmcp/client/sampling/handlers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@

from mcp import ClientSession, ServerSession
from mcp.shared.context import LifespanContextT, RequestContext
from mcp.types import CreateMessageRequestParams as SamplingParams
from mcp.types import (
AudioContent,
CreateMessageResult,
CreateMessageResultWithTools,
ImageContent,
ModelPreferences,
SamplingMessage,
StopReason,
Expand All @@ -19,12 +20,17 @@
ToolResultContent,
ToolUseContent,
)
from mcp.types import CreateMessageRequestParams as SamplingParams

try:
from openai import AsyncOpenAI
from openai.types.chat import (
ChatCompletion,
ChatCompletionAssistantMessageParam,
ChatCompletionContentPartImageParam,
ChatCompletionContentPartInputAudioParam,
ChatCompletionContentPartParam,
ChatCompletionContentPartTextParam,
ChatCompletionMessageParam,
ChatCompletionMessageToolCallParam,
ChatCompletionSystemMessageParam,
Expand All @@ -41,6 +47,50 @@
"Please install `fastmcp[openai]` or add `openai` to your dependencies manually."
) from e

# Input-audio formats accepted by the OpenAI API (wav and mp3 only),
# keyed by the MIME types that map onto them.
_OPENAI_AUDIO_FORMATS: dict[str, str] = {
    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/mp3": "mp3",
    "audio/mpeg": "mp3",
}

# Image MIME types accepted by the OpenAI chat-completions API.
_OPENAI_IMAGE_MEDIA_TYPES: frozenset[str] = frozenset(
    {"image/jpeg", "image/png", "image/gif", "image/webp"}
)


def _image_content_to_openai_part(
    content: ImageContent,
) -> ChatCompletionContentPartImageParam:
    """Build an OpenAI ``image_url`` content part from MCP ``ImageContent``.

    The image payload is embedded as a base64 data URL.

    Raises:
        ValueError: If the image MIME type is not one OpenAI accepts.
    """
    mime = content.mimeType
    if mime not in _OPENAI_IMAGE_MEDIA_TYPES:
        supported = ", ".join(sorted(_OPENAI_IMAGE_MEDIA_TYPES))
        raise ValueError(
            f"Unsupported image MIME type for OpenAI: {mime!r}. "
            f"Supported types: {supported}"
        )
    return ChatCompletionContentPartImageParam(
        type="image_url",
        image_url={"url": f"data:{mime};base64,{content.data}"},
    )


def _audio_content_to_openai_part(
    content: AudioContent,
) -> ChatCompletionContentPartInputAudioParam:
    """Build an OpenAI ``input_audio`` content part from MCP ``AudioContent``.

    Raises:
        ValueError: If the audio MIME type maps to no supported input format.
    """
    fmt = _OPENAI_AUDIO_FORMATS.get(content.mimeType)
    if fmt is None:
        supported = ", ".join(sorted(_OPENAI_AUDIO_FORMATS))
        raise ValueError(
            f"Unsupported audio MIME type for OpenAI: {content.mimeType!r}. "
            f"Supported types: {supported}"
        )
    return ChatCompletionContentPartInputAudioParam(
        type="input_audio",
        input_audio={"data": content.data, "format": fmt},
    )


class OpenAISamplingHandler:
"""Sampling handler that uses the OpenAI API."""
Expand Down Expand Up @@ -147,8 +197,9 @@ def _convert_to_openai_messages(

# Handle list content (from CreateMessageResultWithTools)
if isinstance(content, list):
# Collect tool calls and text from the list
# Collect tool calls, content parts, and text from the list
tool_calls: list[ChatCompletionMessageToolCallParam] = []
content_parts: list[ChatCompletionContentPartParam] = []
text_parts: list[str] = []
# Collect tool results separately to maintain correct ordering
tool_messages: list[ChatCompletionToolMessageParam] = []
Expand All @@ -167,6 +218,15 @@ def _convert_to_openai_messages(
)
elif isinstance(item, TextContent):
text_parts.append(item.text)
content_parts.append(
ChatCompletionContentPartTextParam(
type="text", text=item.text
)
)
elif isinstance(item, ImageContent):
content_parts.append(_image_content_to_openai_part(item))
elif isinstance(item, AudioContent):
content_parts.append(_audio_content_to_openai_part(item))
Comment on lines +226 to +229
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Reject non-text media in OpenAI tool results

This change makes ImageContent/AudioContent valid elsewhere in _convert_to_openai_messages(), but both ToolResultContent branches still build role="tool" messages from TextContent only. When an MCP tool returns content=[ImageContent(...)] or AudioContent(...), the payload is silently stripped and OpenAI receives an empty tool result, so the model never sees the screenshot/audio it asked for. OpenAI's chat-completions docs say tool messages only support text parts, so this path should fail fast instead of dropping the media.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pre-existing limitation — ToolResultContent has always only extracted TextContent. This PR adds multimodal support for messages, not tool results. Valid follow-up but out of scope here.

elif isinstance(item, ToolResultContent):
# Collect tool results (added after assistant message)
content_text = ""
Expand All @@ -186,33 +246,47 @@ def _convert_to_openai_messages(

# Add assistant message with tool calls if present
# OpenAI requires: assistant (with tool_calls) -> tool messages
if tool_calls or text_parts:
msg_content = "\n".join(text_parts) if text_parts else None
if tool_calls or content_parts:
if tool_calls:
has_multimodal = len(content_parts) > len(text_parts)
if has_multimodal:
raise ValueError(
"ImageContent/AudioContent is only supported "
"in user messages for OpenAI"
)
text_str = "\n".join(text_parts) or None
openai_messages.append(
ChatCompletionAssistantMessageParam(
role="assistant",
content=msg_content,
content=text_str,
tool_calls=tool_calls,
)
)
# Add tool messages AFTER assistant message
openai_messages.extend(tool_messages)
elif msg_content:
elif content_parts:
if message.role == "user":
openai_messages.append(
ChatCompletionUserMessageParam(
role="user",
content=msg_content,
content=content_parts,
)
)
else:
openai_messages.append(
ChatCompletionAssistantMessageParam(
role="assistant",
content=msg_content,
has_multimodal = len(content_parts) > len(text_parts)
if has_multimodal:
raise ValueError(
"ImageContent/AudioContent is only supported "
"in user messages for OpenAI"
)
assistant_text = "\n".join(text_parts)
if assistant_text:
openai_messages.append(
ChatCompletionAssistantMessageParam(
role="assistant",
content=assistant_text,
)
)
)
elif tool_messages:
# Tool results only (assistant message was in previous message)
openai_messages.extend(tool_messages)
Expand Down Expand Up @@ -272,6 +346,34 @@ def _convert_to_openai_messages(
)
continue

# Handle ImageContent
if isinstance(content, ImageContent):
if message.role != "user":
raise ValueError(
"ImageContent is only supported in user messages for OpenAI"
)
openai_messages.append(
ChatCompletionUserMessageParam(
role="user",
content=[_image_content_to_openai_part(content)],
)
)
continue

# Handle AudioContent
if isinstance(content, AudioContent):
if message.role != "user":
raise ValueError(
"AudioContent is only supported in user messages for OpenAI"
)
openai_messages.append(
ChatCompletionUserMessageParam(
role="user",
content=[_audio_content_to_openai_part(content)],
)
)
continue

raise ValueError(f"Unsupported content type: {type(content)}")

return openai_messages
Expand Down
Loading
Loading