-
Notifications
You must be signed in to change notification settings - Fork 2k
Support ImageContent and AudioContent in sampling handlers #3550
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d746172
db9c1e2
aa76c47
2298e76
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,10 +3,11 @@ | |
| from collections.abc import Iterator, Sequence | ||
| from typing import Any | ||
|
|
||
| from mcp.types import CreateMessageRequestParams as SamplingParams | ||
| from mcp.types import ( | ||
| AudioContent, | ||
| CreateMessageResult, | ||
| CreateMessageResultWithTools, | ||
| ImageContent, | ||
| ModelPreferences, | ||
| SamplingMessage, | ||
| SamplingMessageContentBlock, | ||
|
|
@@ -17,10 +18,13 @@ | |
| ToolResultContent, | ||
| ToolUseContent, | ||
| ) | ||
| from mcp.types import CreateMessageRequestParams as SamplingParams | ||
|
|
||
| try: | ||
| from anthropic import AsyncAnthropic | ||
| from anthropic.types import ( | ||
| Base64ImageSourceParam, | ||
| ImageBlockParam, | ||
| Message, | ||
| MessageParam, | ||
| TextBlock, | ||
|
|
@@ -42,6 +46,28 @@ | |
|
|
||
| __all__ = ["AnthropicSamplingHandler"] | ||
|
|
||
| # Anthropic supports these image MIME types | ||
| _ANTHROPIC_IMAGE_MEDIA_TYPES = frozenset( | ||
| {"image/jpeg", "image/png", "image/gif", "image/webp"} | ||
| ) | ||
|
|
||
|
|
||
| def _image_content_to_anthropic_block(content: ImageContent) -> ImageBlockParam: | ||
| """Convert MCP ImageContent to Anthropic ImageBlockParam.""" | ||
| if content.mimeType not in _ANTHROPIC_IMAGE_MEDIA_TYPES: | ||
| raise ValueError( | ||
| f"Unsupported image MIME type for Anthropic: {content.mimeType!r}. " | ||
| f"Supported types: {', '.join(sorted(_ANTHROPIC_IMAGE_MEDIA_TYPES))}" | ||
| ) | ||
| return ImageBlockParam( | ||
| type="image", | ||
| source=Base64ImageSourceParam( | ||
| type="base64", | ||
| media_type=content.mimeType, # type: ignore[arg-type] | ||
| data=content.data, | ||
| ), | ||
| ) | ||
|
|
||
|
|
||
| class AnthropicSamplingHandler: | ||
| """Sampling handler that uses the Anthropic API. | ||
|
|
@@ -155,7 +181,10 @@ def _convert_to_anthropic_messages( | |
| # Handle list content (from CreateMessageResultWithTools) | ||
| if isinstance(content, list): | ||
| content_blocks: list[ | ||
| TextBlockParam | ToolUseBlockParam | ToolResultBlockParam | ||
| TextBlockParam | ||
| | ImageBlockParam | ||
| | ToolUseBlockParam | ||
| | ToolResultBlockParam | ||
| ] = [] | ||
|
|
||
| for item in content: | ||
|
|
@@ -172,6 +201,17 @@ def _convert_to_anthropic_messages( | |
| content_blocks.append( | ||
| TextBlockParam(type="text", text=item.text) | ||
| ) | ||
| elif isinstance(item, ImageContent): | ||
| if message.role != "user": | ||
| raise ValueError( | ||
| "ImageContent is only supported in user messages " | ||
| "for Anthropic" | ||
| ) | ||
| content_blocks.append(_image_content_to_anthropic_block(item)) | ||
| elif isinstance(item, AudioContent): | ||
| raise ValueError( | ||
| "AudioContent is not supported by the Anthropic API" | ||
| ) | ||
| elif isinstance(item, ToolResultContent): | ||
| # Extract text content from the result | ||
| result_content: str | list[TextBlockParam] = "" | ||
|
|
@@ -262,6 +302,24 @@ def _convert_to_anthropic_messages( | |
| ) | ||
| continue | ||
|
|
||
| # Handle ImageContent | ||
| if isinstance(content, ImageContent): | ||
| if message.role != "user": | ||
| raise ValueError( | ||
| "ImageContent is only supported in user messages for Anthropic" | ||
| ) | ||
| anthropic_messages.append( | ||
| MessageParam( | ||
| role="user", | ||
| content=[_image_content_to_anthropic_block(content)], | ||
|
Comment on lines
+306
to
+314
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch — fixed in 2298e76. Both single-content and list-content paths now validate that ImageContent is only in user messages. |
||
| ) | ||
| ) | ||
| continue | ||
|
|
||
| # Handle AudioContent - not supported by Anthropic | ||
| if isinstance(content, AudioContent): | ||
| raise ValueError("AudioContent is not supported by the Anthropic API") | ||
|
|
||
| raise ValueError(f"Unsupported content type: {type(content)}") | ||
|
|
||
| return anthropic_messages | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,13 @@ | ||
| """Google GenAI sampling handler with tool support for FastMCP 3.0.""" | ||
|
|
||
| import base64 | ||
| from collections.abc import Sequence | ||
| from uuid import uuid4 | ||
|
|
||
| try: | ||
| from google.genai import Client as GoogleGenaiClient | ||
| from google.genai.types import ( | ||
| Blob, | ||
| Candidate, | ||
| Content, | ||
| FunctionCall, | ||
|
|
@@ -197,6 +199,22 @@ def _sampling_content_to_google_genai_part( | |
| if isinstance(content, TextContent): | ||
| return Part(text=content.text) | ||
|
|
||
| if isinstance(content, ImageContent): | ||
| return Part( | ||
| inline_data=Blob( | ||
| data=base64.b64decode(content.data), | ||
| mime_type=content.mimeType, | ||
|
Comment on lines
+202
to
+206
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Gemini only accepts a fixed allowlist of image and audio MIME types, but this new conversion path passes `content.mimeType` through without validation. Useful? React with 👍 / 👎.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gemini supports a much wider range of media types than OpenAI/Anthropic — a restrictive allowlist would reject valid inputs. The API validates and returns a clear error for unsupported types.
Comment on lines
+202
to
+206
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The new `ImageContent`/`AudioContent` conversion path is not applied to tool results. Useful? React with 👍 / 👎.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same — pre-existing, unchanged by this PR. Multimodal tool results are a valid follow-up. |
||
| ) | ||
| ) | ||
|
|
||
| if isinstance(content, AudioContent): | ||
| return Part( | ||
| inline_data=Blob( | ||
| data=base64.b64decode(content.data), | ||
| mime_type=content.mimeType, | ||
| ) | ||
| ) | ||
|
|
||
| if isinstance(content, ToolUseContent): | ||
| # Note: thought_signature bypass is required for manually constructed tool calls. | ||
| # Google's Gemini 3+ models enforce thought signature validation for function calls. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,10 +6,11 @@ | |
|
|
||
| from mcp import ClientSession, ServerSession | ||
| from mcp.shared.context import LifespanContextT, RequestContext | ||
| from mcp.types import CreateMessageRequestParams as SamplingParams | ||
| from mcp.types import ( | ||
| AudioContent, | ||
| CreateMessageResult, | ||
| CreateMessageResultWithTools, | ||
| ImageContent, | ||
| ModelPreferences, | ||
| SamplingMessage, | ||
| StopReason, | ||
|
|
@@ -19,12 +20,17 @@ | |
| ToolResultContent, | ||
| ToolUseContent, | ||
| ) | ||
| from mcp.types import CreateMessageRequestParams as SamplingParams | ||
|
|
||
| try: | ||
| from openai import AsyncOpenAI | ||
| from openai.types.chat import ( | ||
| ChatCompletion, | ||
| ChatCompletionAssistantMessageParam, | ||
| ChatCompletionContentPartImageParam, | ||
| ChatCompletionContentPartInputAudioParam, | ||
| ChatCompletionContentPartParam, | ||
| ChatCompletionContentPartTextParam, | ||
| ChatCompletionMessageParam, | ||
| ChatCompletionMessageToolCallParam, | ||
| ChatCompletionSystemMessageParam, | ||
|
|
@@ -41,6 +47,50 @@ | |
| "Please install `fastmcp[openai]` or add `openai` to your dependencies manually." | ||
| ) from e | ||
|
|
||
| # OpenAI only supports wav and mp3 for input audio | ||
| _OPENAI_AUDIO_FORMATS: dict[str, str] = { | ||
| "audio/wav": "wav", | ||
| "audio/x-wav": "wav", | ||
| "audio/mp3": "mp3", | ||
| "audio/mpeg": "mp3", | ||
| } | ||
|
|
||
| _OPENAI_IMAGE_MEDIA_TYPES: frozenset[str] = frozenset( | ||
| {"image/jpeg", "image/png", "image/gif", "image/webp"} | ||
| ) | ||
|
|
||
|
|
||
| def _image_content_to_openai_part( | ||
| content: ImageContent, | ||
| ) -> ChatCompletionContentPartImageParam: | ||
| """Convert MCP ImageContent to OpenAI image_url content part.""" | ||
| if content.mimeType not in _OPENAI_IMAGE_MEDIA_TYPES: | ||
| raise ValueError( | ||
| f"Unsupported image MIME type for OpenAI: {content.mimeType!r}. " | ||
| f"Supported types: {', '.join(sorted(_OPENAI_IMAGE_MEDIA_TYPES))}" | ||
| ) | ||
| data_url = f"data:{content.mimeType};base64,{content.data}" | ||
| return ChatCompletionContentPartImageParam( | ||
| type="image_url", | ||
| image_url={"url": data_url}, | ||
| ) | ||
|
|
||
|
|
||
| def _audio_content_to_openai_part( | ||
| content: AudioContent, | ||
| ) -> ChatCompletionContentPartInputAudioParam: | ||
| """Convert MCP AudioContent to OpenAI input_audio content part.""" | ||
| audio_format = _OPENAI_AUDIO_FORMATS.get(content.mimeType) | ||
| if audio_format is None: | ||
| raise ValueError( | ||
| f"Unsupported audio MIME type for OpenAI: {content.mimeType!r}. " | ||
| f"Supported types: {', '.join(sorted(_OPENAI_AUDIO_FORMATS))}" | ||
| ) | ||
| return ChatCompletionContentPartInputAudioParam( | ||
| type="input_audio", | ||
| input_audio={"data": content.data, "format": audio_format}, | ||
| ) | ||
|
|
||
|
|
||
| class OpenAISamplingHandler: | ||
| """Sampling handler that uses the OpenAI API.""" | ||
|
|
@@ -147,8 +197,9 @@ def _convert_to_openai_messages( | |
|
|
||
| # Handle list content (from CreateMessageResultWithTools) | ||
| if isinstance(content, list): | ||
| # Collect tool calls and text from the list | ||
| # Collect tool calls, content parts, and text from the list | ||
| tool_calls: list[ChatCompletionMessageToolCallParam] = [] | ||
| content_parts: list[ChatCompletionContentPartParam] = [] | ||
| text_parts: list[str] = [] | ||
| # Collect tool results separately to maintain correct ordering | ||
| tool_messages: list[ChatCompletionToolMessageParam] = [] | ||
|
|
@@ -167,6 +218,15 @@ def _convert_to_openai_messages( | |
| ) | ||
| elif isinstance(item, TextContent): | ||
| text_parts.append(item.text) | ||
| content_parts.append( | ||
| ChatCompletionContentPartTextParam( | ||
| type="text", text=item.text | ||
| ) | ||
| ) | ||
| elif isinstance(item, ImageContent): | ||
| content_parts.append(_image_content_to_openai_part(item)) | ||
| elif isinstance(item, AudioContent): | ||
| content_parts.append(_audio_content_to_openai_part(item)) | ||
|
Comment on lines
+226
to
+229
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This change makes image and audio content available in regular messages, but `ToolResultContent` still extracts only text content. Useful? React with 👍 / 👎.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pre-existing limitation — ToolResultContent has always only extracted TextContent. This PR adds multimodal support for messages, not tool results. Valid follow-up but out of scope here. |
||
| elif isinstance(item, ToolResultContent): | ||
| # Collect tool results (added after assistant message) | ||
| content_text = "" | ||
|
|
@@ -186,33 +246,47 @@ def _convert_to_openai_messages( | |
|
|
||
| # Add assistant message with tool calls if present | ||
| # OpenAI requires: assistant (with tool_calls) -> tool messages | ||
| if tool_calls or text_parts: | ||
| msg_content = "\n".join(text_parts) if text_parts else None | ||
| if tool_calls or content_parts: | ||
| if tool_calls: | ||
| has_multimodal = len(content_parts) > len(text_parts) | ||
| if has_multimodal: | ||
| raise ValueError( | ||
| "ImageContent/AudioContent is only supported " | ||
| "in user messages for OpenAI" | ||
| ) | ||
| text_str = "\n".join(text_parts) or None | ||
| openai_messages.append( | ||
| ChatCompletionAssistantMessageParam( | ||
| role="assistant", | ||
| content=msg_content, | ||
| content=text_str, | ||
| tool_calls=tool_calls, | ||
| ) | ||
| ) | ||
| # Add tool messages AFTER assistant message | ||
| openai_messages.extend(tool_messages) | ||
| elif msg_content: | ||
| elif content_parts: | ||
| if message.role == "user": | ||
| openai_messages.append( | ||
| ChatCompletionUserMessageParam( | ||
| role="user", | ||
| content=msg_content, | ||
| content=content_parts, | ||
| ) | ||
| ) | ||
| else: | ||
| openai_messages.append( | ||
| ChatCompletionAssistantMessageParam( | ||
| role="assistant", | ||
| content=msg_content, | ||
| has_multimodal = len(content_parts) > len(text_parts) | ||
| if has_multimodal: | ||
| raise ValueError( | ||
| "ImageContent/AudioContent is only supported " | ||
| "in user messages for OpenAI" | ||
| ) | ||
| assistant_text = "\n".join(text_parts) | ||
| if assistant_text: | ||
| openai_messages.append( | ||
| ChatCompletionAssistantMessageParam( | ||
| role="assistant", | ||
| content=assistant_text, | ||
| ) | ||
| ) | ||
| ) | ||
| elif tool_messages: | ||
| # Tool results only (assistant message was in previous message) | ||
| openai_messages.extend(tool_messages) | ||
|
|
@@ -272,6 +346,34 @@ def _convert_to_openai_messages( | |
| ) | ||
| continue | ||
|
|
||
| # Handle ImageContent | ||
| if isinstance(content, ImageContent): | ||
| if message.role != "user": | ||
| raise ValueError( | ||
| "ImageContent is only supported in user messages for OpenAI" | ||
| ) | ||
| openai_messages.append( | ||
| ChatCompletionUserMessageParam( | ||
| role="user", | ||
| content=[_image_content_to_openai_part(content)], | ||
| ) | ||
| ) | ||
| continue | ||
|
|
||
| # Handle AudioContent | ||
| if isinstance(content, AudioContent): | ||
| if message.role != "user": | ||
| raise ValueError( | ||
| "AudioContent is only supported in user messages for OpenAI" | ||
| ) | ||
| openai_messages.append( | ||
| ChatCompletionUserMessageParam( | ||
| role="user", | ||
| content=[_audio_content_to_openai_part(content)], | ||
| ) | ||
| ) | ||
| continue | ||
|
|
||
| raise ValueError(f"Unsupported content type: {type(content)}") | ||
|
|
||
| return openai_messages | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Anthropic's tool-use API allows `tool_result.content` to contain nested `text` or `image` blocks, but `_convert_to_anthropic_messages()` still serializes `ToolResultContent.content` by collecting only `TextContent`. If a FastMCP tool returns `content=[ImageContent(...)]`, Claude will receive an empty tool result instead of the image it requested. Since this commit adds `ImageContent` support for user messages, the same media needs to be forwarded (or explicitly rejected) in `ToolResultContent` as well. Useful? React with 👍 / 👎.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same — pre-existing, unchanged by this PR. Anthropic does support images in tool_result blocks so it's a valid follow-up.