diff --git a/client/inprocess_test.go b/client/inprocess_test.go index de4476025..ca2ccad3f 100644 --- a/client/inprocess_test.go +++ b/client/inprocess_test.go @@ -36,6 +36,11 @@ func TestInProcessMCPClient(t *testing.T) { Type: "text", Text: "Input parameter: " + request.Params.Arguments["parameter-1"].(string), }, + mcp.AudioContent{ + Type: "audio", + Data: "base64-encoded-audio-data", + MIMEType: "audio/wav", + }, }, }, nil }) @@ -77,6 +82,14 @@ func TestInProcessMCPClient(t *testing.T) { Text: "Test prompt with arg1: " + request.Params.Arguments["arg1"], }, }, + { + Role: mcp.RoleUser, + Content: mcp.AudioContent{ + Type: "audio", + Data: "base64-encoded-audio-data", + MIMEType: "audio/wav", + }, + }, }, }, nil }, @@ -192,7 +205,7 @@ func TestInProcessMCPClient(t *testing.T) { t.Fatalf("CallTool failed: %v", err) } - if len(result.Content) != 1 { - t.Errorf("Expected 1 content item, got %d", len(result.Content)) + if len(result.Content) != 2 { + t.Errorf("Expected 2 content items, got %d", len(result.Content)) } }) @@ -359,14 +372,17 @@ func TestInProcessMCPClient(t *testing.T) { request := mcp.GetPromptRequest{} request.Params.Name = "test-prompt" + request.Params.Arguments = map[string]string{ + "arg1": "arg1 value", + } result, err := client.GetPrompt(context.Background(), request) if err != nil { t.Errorf("GetPrompt failed: %v", err) } - if len(result.Messages) != 1 { - t.Errorf("Expected 1 message, got %d", len(result.Messages)) + if len(result.Messages) != 2 { + t.Errorf("Expected 2 messages, got %d", len(result.Messages)) } }) diff --git a/mcp/prompts.go b/mcp/prompts.go index bc12a7297..1309cc5cb 100644 --- a/mcp/prompts.go +++ b/mcp/prompts.go @@ -78,7 +78,7 @@ const ( // resources from the MCP server. 
type PromptMessage struct { Role Role `json:"role"` - Content Content `json:"content"` // Can be TextContent, ImageContent, or EmbeddedResource + Content Content `json:"content"` // Can be TextContent, ImageContent, AudioContent, or EmbeddedResource } // PromptListChangedNotification is an optional notification from the server diff --git a/mcp/tools.go b/mcp/tools.go index b62cdd5b7..2b3d936ae 100644 --- a/mcp/tools.go +++ b/mcp/tools.go @@ -33,7 +33,7 @@ type ListToolsResult struct { // should be reported as an MCP error response. type CallToolResult struct { Result - Content []Content `json:"content"` // Can be TextContent, ImageContent, or EmbeddedResource + Content []Content `json:"content"` // Can be TextContent, ImageContent, AudioContent, or EmbeddedResource // Whether the tool call ended in an error. // // If not set, this is assumed to be false (the call was successful). diff --git a/mcp/types.go b/mcp/types.go index 714eb482c..702089d33 100644 --- a/mcp/types.go +++ b/mcp/types.go @@ -656,7 +656,7 @@ type CreateMessageResult struct { // SamplingMessage describes a message issued to or received from an LLM API. type SamplingMessage struct { Role Role `json:"role"` - Content interface{} `json:"content"` // Can be TextContent or ImageContent + Content interface{} `json:"content"` // Can be TextContent, ImageContent, or AudioContent } type Annotations struct { @@ -709,6 +709,19 @@ type ImageContent struct { func (ImageContent) isContent() {} +// AudioContent represents the contents of audio, embedded into a prompt or tool call result. +// It must have Type set to "audio". +type AudioContent struct { + Annotated + Type string `json:"type"` // Must be "audio" + // The base64-encoded audio data. + Data string `json:"data"` + // The MIME type of the audio. Different providers may support different audio types. 
+ MIMEType string `json:"mimeType"` +} + +func (AudioContent) isContent() {} + // EmbeddedResource represents the contents of a resource, embedded into a prompt or tool call result. // // It is up to the client how best to render embedded resources for the diff --git a/mcp/utils.go b/mcp/utils.go index 333d65759..1a6e651a5 100644 --- a/mcp/utils.go +++ b/mcp/utils.go @@ -77,6 +77,11 @@ func AsImageContent(content interface{}) (*ImageContent, bool) { return asType[ImageContent](content) } +// AsAudioContent attempts to cast the given interface to AudioContent +func AsAudioContent(content interface{}) (*AudioContent, bool) { + return asType[AudioContent](content) +} + // AsEmbeddedResource attempts to cast the given interface to EmbeddedResource func AsEmbeddedResource(content interface{}) (*EmbeddedResource, bool) { return asType[EmbeddedResource](content) @@ -202,6 +207,15 @@ func NewImageContent(data, mimeType string) ImageContent { } } +// Helper function to create a new AudioContent +func NewAudioContent(data, mimeType string) AudioContent { + return AudioContent{ + Type: "audio", + Data: data, + MIMEType: mimeType, + } +} + // Helper function to create a new EmbeddedResource func NewEmbeddedResource(resource ResourceContents) EmbeddedResource { return EmbeddedResource{ @@ -239,6 +253,23 @@ func NewToolResultImage(text, imageData, mimeType string) *CallToolResult { } } +// NewToolResultAudio creates a new CallToolResult with both text and audio content +func NewToolResultAudio(text, audioData, mimeType string) *CallToolResult { + return &CallToolResult{ + Content: []Content{ + TextContent{ + Type: "text", + Text: text, + }, + AudioContent{ + Type: "audio", + Data: audioData, + MIMEType: mimeType, + }, + }, + } +} + // NewToolResultResource creates a new CallToolResult with an embedded resource func NewToolResultResource( text string, @@ -415,6 +446,14 @@ func ParseContent(contentMap map[string]any) (Content, error) { } return NewImageContent(data, mimeType), nil 
+ case "audio": + data := ExtractString(contentMap, "data") + mimeType := ExtractString(contentMap, "mimeType") + if data == "" || mimeType == "" { + return nil, fmt.Errorf("audio data or mimeType is missing") + } + return NewAudioContent(data, mimeType), nil + case "resource": resourceMap := ExtractMap(contentMap, "resource") if resourceMap == nil {