diff --git a/README.md b/README.md index efdff5995d..e183d364d5 100644 --- a/README.md +++ b/README.md @@ -149,10 +149,12 @@ For additional configurations in HTTP server setup, please read [this](https://g Provider: schemas.OpenAI, Model: "gpt-4o-mini", // make sure you have configured gpt-4o-mini in your account interface Input: schemas.RequestInput{ - ChatCompletionInput: bifrost.Ptr([]schemas.Message{{ - Role: schemas.RoleUser, - Content: bifrost.Ptr("What is a LLM gateway?"), - }}), + ChatCompletionInput: bifrost.Ptr([]schemas.BifrostMessage{{ + Role: schemas.ModelChatMessageRoleUser, + Content: schemas.MessageContent{ + ContentStr: bifrost.Ptr("What is a LLM gateway?"), + }, + }}), }, }, ) diff --git a/core/providers/anthropic.go b/core/providers/anthropic.go index 3092887ea2..fedd65ce5f 100644 --- a/core/providers/anthropic.go +++ b/core/providers/anthropic.go @@ -69,6 +69,12 @@ type AnthropicError struct { } `json:"error"` // Error details } +type AnthropicImageContent struct { + Type ImageContentType `json:"type"` + URL string `json:"url"` + MediaType string `json:"media_type,omitempty"` +} + // AnthropicProvider implements the Provider interface for Anthropic's Claude API. type AnthropicProvider struct { logger schemas.Logger // Logger for provider operations @@ -255,8 +261,10 @@ func (provider *AnthropicProvider) TextCompletion(ctx context.Context, model, ke { Index: 0, Message: schemas.BifrostMessage{ - Role: schemas.ModelChatMessageRoleAssistant, - Content: &response.Completion, + Role: schemas.ModelChatMessageRoleAssistant, + Content: schemas.MessageContent{ + ContentStr: &response.Completion, + }, }, }, } @@ -318,22 +326,34 @@ func (provider *AnthropicProvider) ChatCompletion(ctx context.Context, model, ke } // buildAnthropicImageSourceMap creates the "source" map for an Anthropic image content part. 
-func buildAnthropicImageSourceMap(imgContent *schemas.ImageContent) map[string]interface{} { +func buildAnthropicImageSourceMap(imgContent *schemas.ImageURLStruct) map[string]interface{} { if imgContent == nil { return nil } - formattedImgContent := *FormatImageContent(imgContent, false) + sanitizedURL, _ := SanitizeImageURL(imgContent.URL) + urlTypeInfo := ExtractURLTypeInfo(sanitizedURL) + + formattedImgContent := AnthropicImageContent{ + Type: urlTypeInfo.Type, + MediaType: *urlTypeInfo.MediaType, + } + + if urlTypeInfo.DataURLWithoutPrefix != nil { + formattedImgContent.URL = *urlTypeInfo.DataURLWithoutPrefix + } else { + formattedImgContent.URL = sanitizedURL + } sourceMap := map[string]interface{}{ "type": string(formattedImgContent.Type), // "base64" or "url" } - if formattedImgContent.Type == schemas.ImageContentTypeURL { + if formattedImgContent.Type == ImageContentTypeURL { sourceMap["url"] = formattedImgContent.URL } else { - if formattedImgContent.MediaType != nil { - sourceMap["media_type"] = *formattedImgContent.MediaType + if formattedImgContent.MediaType != "" { + sourceMap["media_type"] = formattedImgContent.MediaType } sourceMap["data"] = formattedImgContent.URL // URL field contains base64 data string } @@ -345,10 +365,18 @@ func prepareAnthropicChatRequest(messages []schemas.BifrostMessage, params *sche var systemMessages []BedrockAnthropicSystemMessage for _, msg := range messages { if msg.Role == schemas.ModelChatMessageRoleSystem { - if msg.Content != nil { + if msg.Content.ContentStr != nil { systemMessages = append(systemMessages, BedrockAnthropicSystemMessage{ - Text: *msg.Content, + Text: *msg.Content.ContentStr, }) + } else if msg.Content.ContentBlocks != nil { + for _, block := range *msg.Content.ContentBlocks { + if block.Text != nil { + systemMessages = append(systemMessages, BedrockAnthropicSystemMessage{ + Text: *block.Text, + }) + } + } } } } @@ -367,61 +395,49 @@ func prepareAnthropicChatRequest(messages []schemas.BifrostMessage, 
params *sche var toolCallResultContent []map[string]interface{} - if msg.Content != nil { + if msg.Content.ContentStr != nil { toolCallResultContent = append(toolCallResultContent, map[string]interface{}{ "type": "text", - "text": *msg.Content, + "text": *msg.Content.ContentStr, }) - } - - if (msg.UserMessage != nil && msg.UserMessage.ImageContent != nil) || (msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil) { - var messageImageContent schemas.ImageContent - if msg.UserMessage != nil && msg.UserMessage.ImageContent != nil { - // Create a copy to avoid modifying the original - messageImageContent = *msg.UserMessage.ImageContent - } else if msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil { - // Create a copy to avoid modifying the original - messageImageContent = *msg.ToolMessage.ImageContent - } - - imageSource := buildAnthropicImageSourceMap(&messageImageContent) - if imageSource != nil { - toolCallResultContent = append(toolCallResultContent, map[string]interface{}{ - "type": "image", - "source": imageSource, - }) + } else if msg.Content.ContentBlocks != nil { + for _, block := range *msg.Content.ContentBlocks { + if block.Text != nil { + toolCallResultContent = append(toolCallResultContent, map[string]interface{}{ + "type": "text", + "text": *block.Text, + }) + } } } toolCallResult["content"] = toolCallResultContent - content = append(content, toolCallResult) } else { - if (msg.UserMessage != nil && msg.UserMessage.ImageContent != nil) || (msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil) { - var messageImageContent schemas.ImageContent - if msg.UserMessage != nil && msg.UserMessage.ImageContent != nil { - // Create a copy to avoid modifying the original - messageImageContent = *msg.UserMessage.ImageContent - } else if msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil { - // Create a copy to avoid modifying the original - messageImageContent = *msg.ToolMessage.ImageContent - } - - imageSource := 
buildAnthropicImageSourceMap(&messageImageContent) - if imageSource != nil { - content = append(content, map[string]interface{}{ - "type": "image", - "source": imageSource, - }) - } - } - // Add text content if present - if msg.Content != nil && *msg.Content != "" { + if msg.Content.ContentStr != nil && *msg.Content.ContentStr != "" { content = append(content, map[string]interface{}{ "type": "text", - "text": *msg.Content, + "text": *msg.Content.ContentStr, }) + } else if msg.Content.ContentBlocks != nil { + for _, block := range *msg.Content.ContentBlocks { + if block.Text != nil && *block.Text != "" { + content = append(content, map[string]interface{}{ + "type": "text", + "text": *block.Text, + }) + } + if block.ImageURL != nil { + imageSource := buildAnthropicImageSourceMap(block.ImageURL) + if imageSource != nil { + content = append(content, map[string]interface{}{ + "type": "image", + "source": imageSource, + }) + } + } + } } // Add thinking content if present in AssistantMessage @@ -577,20 +593,20 @@ func prepareAnthropicChatRequest(messages []schemas.BifrostMessage, params *sche func parseAnthropicResponse(response *AnthropicChatResponse, bifrostResponse *schemas.BifrostResponse) (*schemas.BifrostResponse, *schemas.BifrostError) { // Collect all content and tool calls into a single message - var content strings.Builder var toolCalls []schemas.ToolCall var thinking string + var contentBlocks []schemas.ContentBlock // Process content and tool calls for _, c := range response.Content { switch c.Type { case "thinking": thinking = c.Thinking case "text": - if content.Len() > 0 { - content.WriteString("\n") - } - content.WriteString(c.Text) + contentBlocks = append(contentBlocks, schemas.ContentBlock{ + Type: "text", + Text: &c.Text, + }) case "tool_use": function := schemas.FunctionCall{ Name: &c.Name, @@ -612,7 +628,6 @@ func parseAnthropicResponse(response *AnthropicChatResponse, bifrostResponse *sc } // Create the assistant message - messageContent := 
content.String() var assistantMessage *schemas.AssistantMessage // Create AssistantMessage if we have tool calls or thinking @@ -632,8 +647,10 @@ func parseAnthropicResponse(response *AnthropicChatResponse, bifrostResponse *sc { Index: 0, Message: schemas.BifrostMessage{ - Role: schemas.ModelChatMessageRoleAssistant, - Content: &messageContent, + Role: schemas.ModelChatMessageRoleAssistant, + Content: schemas.MessageContent{ + ContentBlocks: &contentBlocks, + }, AssistantMessage: assistantMessage, }, FinishReason: &response.StopReason, diff --git a/core/providers/azure.go b/core/providers/azure.go index 08039450f5..b85f436720 100644 --- a/core/providers/azure.go +++ b/core/providers/azure.go @@ -269,8 +269,10 @@ func (provider *AzureProvider) TextCompletion(ctx context.Context, model, key, t choices = append(choices, schemas.BifrostResponseChoice{ Index: 0, Message: schemas.BifrostMessage{ - Role: schemas.ModelChatMessageRoleAssistant, - Content: &textCopy, + Role: schemas.ModelChatMessageRoleAssistant, + Content: schemas.MessageContent{ + ContentStr: &textCopy, + }, }, FinishReason: response.Choices[0].FinishReason, LogProbs: &schemas.LogProbs{ diff --git a/core/providers/bedrock.go b/core/providers/bedrock.go index e4718237d1..c59677f8f1 100644 --- a/core/providers/bedrock.go +++ b/core/providers/bedrock.go @@ -339,8 +339,10 @@ func (provider *BedrockProvider) getTextCompletionResult(result []byte, model st { Index: 0, Message: schemas.BifrostMessage{ - Role: schemas.ModelChatMessageRoleAssistant, - Content: &response.Completion, + Role: schemas.ModelChatMessageRoleAssistant, + Content: schemas.MessageContent{ + ContentStr: &response.Completion, + }, }, FinishReason: &response.StopReason, StopString: &response.Stop, @@ -377,8 +379,10 @@ func (provider *BedrockProvider) getTextCompletionResult(result []byte, model st choices = append(choices, schemas.BifrostResponseChoice{ Index: i, Message: schemas.BifrostMessage{ - Role: schemas.ModelChatMessageRoleAssistant, - 
Content: &output.Text, + Role: schemas.ModelChatMessageRoleAssistant, + Content: schemas.MessageContent{ + ContentStr: &output.Text, + }, }, FinishReason: &output.StopReason, }) @@ -401,6 +405,26 @@ func (provider *BedrockProvider) getTextCompletionResult(result []byte, model st } } +// parseBedrockAnthropicMessageToolCallContent parses the content of a tool call message. +// It handles both text and JSON content. +// Returns a map containing the parsed content. +func parseBedrockAnthropicMessageToolCallContent(content string) map[string]interface{} { + toolResultContentBlock := map[string]interface{}{} + var parsedJSON interface{} + err := json.Unmarshal([]byte(content), &parsedJSON) + if err == nil { + if arr, ok := parsedJSON.([]interface{}); ok { + toolResultContentBlock["json"] = map[string]interface{}{"content": arr} + } else { + toolResultContentBlock["json"] = parsedJSON + } + } else { + toolResultContentBlock["text"] = content + } + + return toolResultContentBlock +} + // PrepareChatCompletionMessages formats chat messages for Bedrock's API. // It handles different model types (Anthropic and Mistral) and formats messages accordingly. // Returns a map containing the formatted messages and any system messages, or an error if formatting fails. 
@@ -427,10 +451,18 @@ func (provider *BedrockProvider) prepareChatCompletionMessages(messages []schema var systemMessages []BedrockAnthropicSystemMessage for _, msg := range messages { if msg.Role == schemas.ModelChatMessageRoleSystem { - if msg.Content != nil { + if msg.Content.ContentStr != nil { systemMessages = append(systemMessages, BedrockAnthropicSystemMessage{ - Text: *msg.Content, + Text: *msg.Content.ContentStr, }) + } else if msg.Content.ContentBlocks != nil { + for _, block := range *msg.Content.ContentBlocks { + if block.Text != nil { + systemMessages = append(systemMessages, BedrockAnthropicSystemMessage{ + Text: *block.Text, + }) + } + } } } } @@ -444,28 +476,20 @@ func (provider *BedrockProvider) prepareChatCompletionMessages(messages []schema toolCallResult := map[string]interface{}{ "toolUseId": *msg.ToolCallID, } - - var toolResultContentBlock map[string]interface{} - if msg.Content != nil { - toolResultContentBlock = map[string]interface{}{} - var parsedJSON interface{} - err := json.Unmarshal([]byte(*msg.Content), &parsedJSON) - if err == nil { - if arr, ok := parsedJSON.([]interface{}); ok { - toolResultContentBlock["json"] = map[string]interface{}{"content": arr} - } else { - toolResultContentBlock["json"] = parsedJSON + var toolResultContentBlocks []map[string]interface{} + if msg.Content.ContentStr != nil { + toolResultContentBlocks = append(toolResultContentBlocks, parseBedrockAnthropicMessageToolCallContent(*msg.Content.ContentStr)) + } else if msg.Content.ContentBlocks != nil { + for _, block := range *msg.Content.ContentBlocks { + if block.Text != nil { + toolResultContentBlocks = append(toolResultContentBlocks, parseBedrockAnthropicMessageToolCallContent(*block.Text)) } - } else { - toolResultContentBlock["text"] = *msg.Content } - - toolCallResult["content"] = []interface{}{toolResultContentBlock} - - content = append(content, map[string]interface{}{ - "toolResult": toolCallResult, - }) } + toolCallResult["content"] = 
toolResultContentBlocks + content = append(content, map[string]interface{}{ + "toolResult": toolCallResult, + }) } else { if msg.AssistantMessage != nil && msg.AssistantMessage.ToolCalls != nil { for _, toolCall := range *msg.AssistantMessage.ToolCalls { @@ -486,40 +510,47 @@ func (provider *BedrockProvider) prepareChatCompletionMessages(messages []schema } } - if (msg.UserMessage != nil && msg.UserMessage.ImageContent != nil) || (msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil) { - var messageImageContent schemas.ImageContent - if msg.UserMessage != nil && msg.UserMessage.ImageContent != nil { - messageImageContent = *msg.UserMessage.ImageContent - } else if msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil { - messageImageContent = *msg.ToolMessage.ImageContent - } - - formattedImgContent := *FormatImageContent(&messageImageContent, false) - - content = append(content, BedrockAnthropicImageMessage{ - Type: "image", - Image: BedrockAnthropicImage{ - Format: func() string { - if formattedImgContent.MediaType != nil { - mediaType := *formattedImgContent.MediaType - // Remove "image/" prefix if present, since normalizeMediaType ensures full format - mediaType = strings.TrimPrefix(mediaType, "image/") - return mediaType - } - return "" - }(), - Source: BedrockAnthropicImageSource{ - Bytes: formattedImgContent.URL, - }, - }, - }) - } - - if msg.Content != nil { + if msg.Content.ContentStr != nil { content = append(content, BedrockAnthropicTextMessage{ Type: "text", - Text: *msg.Content, + Text: *msg.Content.ContentStr, }) + } else if msg.Content.ContentBlocks != nil { + for _, block := range *msg.Content.ContentBlocks { + if block.ImageURL != nil { + sanitizedURL, _ := SanitizeImageURL(block.ImageURL.URL) + urlTypeInfo := ExtractURLTypeInfo(sanitizedURL) + + formattedImgContent := AnthropicImageContent{ + Type: urlTypeInfo.Type, + MediaType: *urlTypeInfo.MediaType, + } + + if urlTypeInfo.DataURLWithoutPrefix != nil { + formattedImgContent.URL 
= *urlTypeInfo.DataURLWithoutPrefix + } else { + formattedImgContent.URL = sanitizedURL + } + + content = append(content, BedrockAnthropicImageMessage{ + Type: "image", + Image: BedrockAnthropicImage{ + Format: func() string { + if formattedImgContent.MediaType != "" { + mediaType := formattedImgContent.MediaType + // Remove "image/" prefix if present, since normalizeMediaType ensures full format + mediaType = strings.TrimPrefix(mediaType, "image/") + return mediaType + } + return "" + }(), + Source: BedrockAnthropicImageSource{ + Bytes: formattedImgContent.URL, + }, + }, + }) + } + } } } @@ -603,7 +634,7 @@ func (provider *BedrockProvider) prepareChatCompletionMessages(messages []schema messages = append(messages, message.Text) } - body["system"] = strings.Join(messages, " ") + body["system"] = strings.Join(messages, " \n") } return body, nil @@ -629,9 +660,14 @@ func (provider *BedrockProvider) prepareChatCompletionMessages(messages []schema Role: msg.Role, } - if msg.Content != nil { - message.Content = []BedrockMistralContent{ - {Text: *msg.Content}, + switch { + case msg.Content.ContentStr != nil: + message.Content = []BedrockMistralContent{{Text: *msg.Content.ContentStr}} + case msg.Content.ContentBlocks != nil: + for _, b := range *msg.Content.ContentBlocks { + if b.Text != nil { + message.Content = append(message.Content, BedrockMistralContent{Text: *b.Text}) + } } } @@ -873,16 +909,16 @@ func (provider *BedrockProvider) ChatCompletion(ctx context.Context, model, key } // Collect all content and tool calls into a single message (similar to Anthropic aggregation) - var content strings.Builder var toolCalls []schemas.ToolCall + var contentBlocks []schemas.ContentBlock // Process content and tool calls for _, choice := range response.Output.Message.Content { if choice.Text != nil && *choice.Text != "" { - if content.Len() > 0 { - content.WriteString("\n") - } - content.WriteString(*choice.Text) + contentBlocks = append(contentBlocks, schemas.ContentBlock{ + 
Type: "text", + Text: choice.Text, + }) } if choice.ToolUse != nil { @@ -909,12 +945,6 @@ func (provider *BedrockProvider) ChatCompletion(ctx context.Context, model, key } // Create the assistant message - messageContent := content.String() - var contentPtr *string - if messageContent != "" { - contentPtr = &messageContent - } - var assistantMessage *schemas.AssistantMessage // Create AssistantMessage if we have tool calls @@ -929,8 +959,10 @@ func (provider *BedrockProvider) ChatCompletion(ctx context.Context, model, key { Index: 0, Message: schemas.BifrostMessage{ - Role: schemas.ModelChatMessageRoleAssistant, - Content: contentPtr, + Role: schemas.ModelChatMessageRoleAssistant, + Content: schemas.MessageContent{ + ContentBlocks: &contentBlocks, + }, AssistantMessage: assistantMessage, }, FinishReason: &response.StopReason, diff --git a/core/providers/cohere.go b/core/providers/cohere.go index 9e60809280..1467f5e387 100644 --- a/core/providers/cohere.go +++ b/core/providers/cohere.go @@ -228,38 +228,35 @@ func (provider *CohereProvider) ChatCompletion(ctx context.Context, model, key s "name": *msg.ToolMessage.ToolCallID, "parameters": toolCallParameters, }, - "outputs": *msg.Content, + "outputs": *msg.Content.ContentStr, }, } historyMsg["tool_results"] = toolResults } - // Handle message content based on whether it supports vision - if msg.UserMessage != nil && msg.UserMessage.ImageContent != nil { + if msg.Content.ContentStr != nil { + historyMsg["message"] = *msg.Content.ContentStr + } else if msg.Content.ContentBlocks != nil { // Create content array with text and image contentArray := []map[string]interface{}{} - // Add text content if present - if msg.Content != nil { - contentArray = append(contentArray, map[string]interface{}{ - "type": "text", - "text": *msg.Content, - }) + // Iterate over ContentBlocks to build the content array + for _, block := range *msg.Content.ContentBlocks { + if block.Text != nil { + contentArray = append(contentArray, 
map[string]interface{}{ + "type": "text", + "text": *block.Text, + }) + } + // Add image content using our helper function + // NOTE: Cohere v1 does not support image content + // if processedImageContent := processImageContent(block.ImageContent); processedImageContent != nil { + // contentArray = append(contentArray, processedImageContent) + // } } - // Add image content using our helper function - // NOTE: Cohere v1 does not support image content - // if processedImageContent := processImageContent(msg.UserMessage.ImageContent); processedImageContent != nil { - // contentArray = append(contentArray, processedImageContent) - // } - historyMsg["content"] = contentArray - } else { - // For non-vision models or text-only messages, use simple message field - if msg.Content != nil { - historyMsg["message"] = *msg.Content - } } cohereHistory = append(cohereHistory, historyMsg) @@ -274,30 +271,16 @@ func (provider *CohereProvider) ChatCompletion(ctx context.Context, model, key s }, preparedParams) // Handle the last message content based on whether it supports vision - if lastMessage.UserMessage != nil && lastMessage.UserMessage.ImageContent != nil { - // Create content array with text and image - contentArray := []map[string]interface{}{} - - // Add text content if present - if lastMessage.Content != nil { - contentArray = append(contentArray, map[string]interface{}{ - "type": "text", - "text": *lastMessage.Content, - }) - } - - // Add image content using our helper function - // NOTE: Cohere v1 does not support image content - // if processedImageContent := processImageContent(lastMessage.UserMessage.ImageContent); processedImageContent != nil { - // contentArray = append(contentArray, processedImageContent) - // } - - requestBody["content"] = contentArray - } else { - // For non-vision models or text-only messages, use simple message field - if lastMessage.Content != nil { - requestBody["message"] = *lastMessage.Content + if lastMessage.Content.ContentStr != nil { + 
requestBody["message"] = *lastMessage.Content.ContentStr + } else if lastMessage.Content.ContentBlocks != nil { + message := "" + for _, block := range *lastMessage.Content.ContentBlocks { + if block.Text != nil { + message += *block.Text + "\n" + } } + requestBody["message"] = strings.TrimSuffix(message, "\n") } // Add tools if present @@ -434,8 +417,10 @@ func (provider *CohereProvider) ChatCompletion(ctx context.Context, model, key s { Index: 0, Message: schemas.BifrostMessage{ - Role: role, - Content: &content, + Role: role, + Content: schemas.MessageContent{ + ContentStr: &content, + }, AssistantMessage: &schemas.AssistantMessage{ ToolCalls: &toolCalls, }, @@ -465,12 +450,19 @@ func (provider *CohereProvider) ChatCompletion(ctx context.Context, model, key s // processImageContent processes image content for Cohere API format. // It creates a copy of the image content, normalizes and formats it, then returns the properly formatted map. // This prevents unintended mutations to the original image content. 
-func processImageContent(imageContent *schemas.ImageContent) map[string]interface{} { +func processImageContent(imageContent *schemas.ImageURLStruct) map[string]interface{} { if imageContent == nil { return nil } - formattedImgContent := *FormatImageContent(imageContent, true) + sanitizedURL, _ := SanitizeImageURL(imageContent.URL) + urlTypeInfo := ExtractURLTypeInfo(sanitizedURL) + + formattedImgContent := AnthropicImageContent{ + Type: urlTypeInfo.Type, + URL: sanitizedURL, + MediaType: *urlTypeInfo.MediaType, + } return map[string]interface{}{ "type": "image_url", @@ -509,8 +501,10 @@ func convertChatHistory(history []struct { } } converted[i] = schemas.BifrostMessage{ - Role: msg.Role, - Content: &msg.Message, + Role: msg.Role, + Content: schemas.MessageContent{ + ContentStr: &msg.Message, + }, AssistantMessage: &schemas.AssistantMessage{ ToolCalls: &toolCalls, }, diff --git a/core/providers/openai.go b/core/providers/openai.go index a5a972649f..78c563487e 100644 --- a/core/providers/openai.go +++ b/core/providers/openai.go @@ -212,53 +212,29 @@ func prepareOpenAIChatRequest(messages []schemas.BifrostMessage, params *schemas if msg.Role == schemas.ModelChatMessageRoleAssistant { assistantMessage := map[string]interface{}{ "role": msg.Role, - "content": coalesceString(msg.Content), + "content": coalesceString(msg.Content.ContentStr), } if msg.AssistantMessage != nil && msg.AssistantMessage.ToolCalls != nil { assistantMessage["tool_calls"] = *msg.AssistantMessage.ToolCalls } formattedMessages = append(formattedMessages, assistantMessage) - } else if (msg.UserMessage != nil && msg.UserMessage.ImageContent != nil) || (msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil) { - var messageImageContent schemas.ImageContent - if msg.UserMessage != nil && msg.UserMessage.ImageContent != nil { - messageImageContent = *msg.UserMessage.ImageContent - } else if msg.ToolMessage != nil && msg.ToolMessage.ImageContent != nil { - messageImageContent = 
*msg.ToolMessage.ImageContent - } - - formattedImgContent := *FormatImageContent(&messageImageContent, true) - - var content []map[string]interface{} - - // Add text content if present - if msg.Content != nil { - content = append(content, map[string]interface{}{ - "type": "text", - "text": *msg.Content, - }) - } - - imageContent := map[string]interface{}{ - "type": "image_url", - "image_url": map[string]interface{}{ - "url": formattedImgContent.URL, - }, - } - - if formattedImgContent.Detail != nil { - imageContent["image_url"].(map[string]interface{})["detail"] = formattedImgContent.Detail - } - - content = append(content, imageContent) - - formattedMessages = append(formattedMessages, map[string]interface{}{ - "role": msg.Role, - "content": content, - }) } else { message := map[string]interface{}{ - "role": msg.Role, - "content": coalesceString(msg.Content), + "role": msg.Role, + } + + if msg.Content.ContentStr != nil { + message["content"] = *msg.Content.ContentStr + } else if msg.Content.ContentBlocks != nil { + contentBlocks := *msg.Content.ContentBlocks + for i := range contentBlocks { + if contentBlocks[i].Type == schemas.ContentBlockTypeImage && contentBlocks[i].ImageURL != nil { + sanitizedURL, _ := SanitizeImageURL(contentBlocks[i].ImageURL.URL) + contentBlocks[i].ImageURL.URL = sanitizedURL + } + } + + message["content"] = contentBlocks } if msg.ToolMessage != nil && msg.ToolMessage.ToolCallID != nil { diff --git a/core/providers/utils.go b/core/providers/utils.go index 42dca5c1ad..54a215d624 100644 --- a/core/providers/utils.go +++ b/core/providers/utils.go @@ -28,7 +28,38 @@ var bifrostResponsePool = sync.Pool{ // dataURIRegex is a precompiled regex for matching data URI format patterns. // It matches patterns like: data:image/png;base64,iVBORw0KGgo... 
-var dataURIRegex = regexp.MustCompile(`^data:([^;]+);base64,(.*)$`) +var dataURIRegex = regexp.MustCompile(`^data:([^;]+)(;base64)?,(.+)$`) + +// base64Regex is a precompiled regex for matching base64 strings. +// It matches strings containing only valid base64 characters with optional padding. +var base64Regex = regexp.MustCompile(`^[A-Za-z0-9+/]*={0,2}$`) + +// fileExtensionToMediaType maps common image file extensions to their corresponding media types. +// This map is used to infer media types from file extensions in URLs. +var fileExtensionToMediaType = map[string]string{ + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + ".svg": "image/svg+xml", + ".bmp": "image/bmp", +} + +// ImageContentType represents the type of image content +type ImageContentType string + +const ( + ImageContentTypeBase64 ImageContentType = "base64" + ImageContentTypeURL ImageContentType = "url" +) + +// URLTypeInfo contains extracted information about a URL +type URLTypeInfo struct { + Type ImageContentType + MediaType *string + DataURLWithoutPrefix *string // Data URL payload with the "data:<mediatype>;base64," prefix stripped (e.g. iVBORw0KGgo...); holds the full data URL when it is not base64-encoded +} // acquireBifrostResponse gets a Bifrost response from the pool and resets it. func acquireBifrostResponse() *schemas.BifrostResponse { @@ -316,130 +347,162 @@ func coalesceString(s *string) string { return *s } -// normalizeMediaType converts short media types to full media types -// e.g., "jpeg" -> "image/jpeg", "png" -> "image/png" -func normalizeMediaType(mediaType string) string { - if mediaType == "" { - return "image/jpeg" // default +//* IMAGE UTILS *// + +// SanitizeImageURL sanitizes and validates an image URL. +// It handles both data URLs and regular HTTP/HTTPS URLs. +// It also detects raw base64 image data and adds proper data URL headers. 
+func SanitizeImageURL(rawURL string) (string, error) { + if rawURL == "" { + return rawURL, fmt.Errorf("URL cannot be empty") } - // If it already has the image/ prefix, return as is - if strings.HasPrefix(mediaType, "image/") { - return mediaType + // Trim whitespace + rawURL = strings.TrimSpace(rawURL) + + // Check if it's already a proper data URL + if strings.HasPrefix(rawURL, "data:") { + // Validate data URL format + if !dataURIRegex.MatchString(rawURL) { + return rawURL, fmt.Errorf("invalid data URL format") + } + return rawURL, nil } - // Add image/ prefix for common formats - switch strings.ToLower(mediaType) { - case "jpeg", "jpg": - return "image/jpeg" - case "png": - return "image/png" - case "gif": - return "image/gif" - case "webp": - return "image/webp" - case "bmp": - return "image/bmp" - case "svg": - return "image/svg+xml" - default: - return "image/" + mediaType + // Check if it looks like raw base64 image data + if isLikelyBase64(rawURL) { + // Detect the image type from the base64 data + mediaType := detectImageTypeFromBase64(rawURL) + + // Remove any whitespace/newlines from base64 data + cleanBase64 := strings.ReplaceAll(strings.ReplaceAll(rawURL, "\n", ""), " ", "") + + // Create proper data URL + return fmt.Sprintf("data:%s;base64,%s", mediaType, cleanBase64), nil } -} -// Normalize handles type inference and media type normalization for image content. -// It automatically detects content type from URL patterns and normalizes media types. -// -// NOTE: This function is called internally by the Bifrost system - you do not need to call it yourself. -// It is automatically invoked when processing image content in requests. 
-func normalizeImageContent(ic *schemas.ImageContent) { - if ic == nil { - return - } - - // Handle unknown/empty type - try to infer from URL - if ic.Type == "" && ic.URL != "" { - if dataURIRegex.MatchString(ic.URL) { - // Looks like base64 data URI - ic.Type = schemas.ImageContentTypeBase64 - } else if strings.HasPrefix(ic.URL, "http://") || strings.HasPrefix(ic.URL, "https://") { - // Looks like a regular URL - ic.Type = schemas.ImageContentTypeURL - } else { - // Assume it's raw base64 data - ic.Type = schemas.ImageContentTypeBase64 - } + // Parse as regular URL + parsedURL, err := url.Parse(rawURL) + if err != nil { + return rawURL, fmt.Errorf("invalid URL format: %w", err) } - // Normalize MediaType if provided - if ic.MediaType != nil && *ic.MediaType != "" { - normalizedMediaType := normalizeMediaType(*ic.MediaType) - ic.MediaType = &normalizedMediaType + // Validate scheme + if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { + return rawURL, fmt.Errorf("URL must use http or https scheme") } + // Validate host + if parsedURL.Host == "" { + return rawURL, fmt.Errorf("URL must have a valid host") + } + + return parsedURL.String(), nil } -// FormatDataURL modifies the image content struct in place to format data URL for base64 image content. -// -// NOTE: This function is called internally by the Bifrost system - you do not need to call it yourself. -// It is automatically invoked when processing image content for different providers. -// -// Parameters: -// - includePrefix: Whether to include the "data:mediatype;base64," prefix -// - true: URL will be in full data URI format (data:image/png;base64,iVBORw0KGgo...) -// - false: URL will contain only the base64 data (iVBORw0KGgo...) -func FormatImageContent(imageContent *schemas.ImageContent, includePrefix bool) *schemas.ImageContent { - if imageContent == nil { - return nil +// ExtractURLTypeInfo extracts type and media type information from a sanitized URL. 
+// For data URLs, it parses the media type and encoding. +// For regular URLs, it attempts to infer the media type from the file extension. +func ExtractURLTypeInfo(sanitizedURL string) URLTypeInfo { + if strings.HasPrefix(sanitizedURL, "data:") { + return extractDataURLInfo(sanitizedURL) } + return extractRegularURLInfo(sanitizedURL) +} - newImageContent := *imageContent - - normalizeImageContent(&newImageContent) +// extractDataURLInfo extracts information from a data URL +func extractDataURLInfo(dataURL string) URLTypeInfo { + // Parse data URL: data:[<mediatype>][;base64],<data> + matches := dataURIRegex.FindStringSubmatch(dataURL) - if newImageContent.Type != schemas.ImageContentTypeBase64 { - return &newImageContent + if len(matches) != 4 { + return URLTypeInfo{Type: ImageContentTypeBase64} } - var finalMediaType string - var base64Data string + mediaType := matches[1] + isBase64 := matches[2] == ";base64" - // Extract base64 data and media type from URL using precompiled regex - if matches := dataURIRegex.FindStringSubmatch(newImageContent.URL); matches != nil { - // URL already has data URI format - existingMediaType := matches[1] - base64Data = matches[2] + dataURLWithoutPrefix := dataURL + if isBase64 { + dataURLWithoutPrefix = dataURL[len("data:")+len(mediaType)+len(";base64,"):] + } - // Determine final media type (prefer explicit MediaType field) - if newImageContent.MediaType != nil && *newImageContent.MediaType != "" { - finalMediaType = normalizeMediaType(*newImageContent.MediaType) - } else { - finalMediaType = normalizeMediaType(existingMediaType) - } + info := URLTypeInfo{ + MediaType: &mediaType, + DataURLWithoutPrefix: &dataURLWithoutPrefix, + } + + if isBase64 { + info.Type = ImageContentTypeBase64 } else { - // URL contains raw base64 data (no data URI prefix) - base64Data = newImageContent.URL - - // Determine media type - if newImageContent.MediaType != nil && *newImageContent.MediaType != "" { - finalMediaType = 
normalizeMediaType(*newImageContent.MediaType) - } else { - finalMediaType = "image/jpeg" // default when no media type provided + info.Type = ImageContentTypeURL // Non-base64 data URL + } + + return info +} + +// extractRegularURLInfo extracts information from a regular HTTP/HTTPS URL +func extractRegularURLInfo(regularURL string) URLTypeInfo { + info := URLTypeInfo{ + Type: ImageContentTypeURL, + } + + // Try to infer media type from file extension + parsedURL, err := url.Parse(regularURL) + if err != nil { + return info + } + + path := strings.ToLower(parsedURL.Path) + + // Check for known file extensions using the map + for ext, mediaType := range fileExtensionToMediaType { + if strings.HasSuffix(path, ext) { + info.MediaType = &mediaType + break } } + // For URLs without recognizable extensions, MediaType remains nil - // Ensure MediaType field is always set with normalized value - normalizedMediaType := finalMediaType - newImageContent.MediaType = &normalizedMediaType + return info +} - // Set URL based on includePrefix preference - if includePrefix { - // Full data URI format - newImageContent.URL = fmt.Sprintf("data:%s;base64,%s", finalMediaType, base64Data) - } else { - // Raw base64 data only - newImageContent.URL = base64Data +// detectImageTypeFromBase64 detects the image type from base64 data by examining the header bytes +func detectImageTypeFromBase64(base64Data string) string { + // Remove any whitespace or newlines + cleanData := strings.ReplaceAll(strings.ReplaceAll(base64Data, "\n", ""), " ", "") + + // Check common image format signatures in base64 + switch { + case strings.HasPrefix(cleanData, "/9j/") || strings.HasPrefix(cleanData, "/9k/"): + // JPEG images typically start with /9j/ or /9k/ in base64 (FFD8 in hex) + return "image/jpeg" + case strings.HasPrefix(cleanData, "iVBORw0KGgo"): + // PNG images start with iVBORw0KGgo in base64 (89504E470D0A1A0A in hex) + return "image/png" + case strings.HasPrefix(cleanData, "R0lGOD"): + // GIF images 
start with R0lGOD in base64 (474946 in hex) + return "image/gif" + case strings.HasPrefix(cleanData, "Qk"): + // BMP images start with Qk in base64 (424D in hex) + return "image/bmp" + case strings.HasPrefix(cleanData, "UklGR") && len(cleanData) >= 16 && cleanData[12:16] == "V0VC": + // WebP images start with RIFF header (UklGR in base64) and have WEBP signature at offset 8-11 (V0VC in base64) + return "image/webp" + case strings.HasPrefix(cleanData, "PHN2Zy") || strings.HasPrefix(cleanData, "PD94bW"): + // SVG images often start with 0 { for i, choice := range result.Choices { @@ -372,3 +389,21 @@ func SetupAllRequests(bifrostClient *bifrost.Bifrost, config TestConfig) { log.Println("Test setup finished.") bifrostClient.Cleanup() } + +func getResultContent(result *schemas.BifrostResponse) string { + if result == nil || len(result.Choices) == 0 { + return "" + } + + resultContent := "" + if result.Choices[0].Message.Content.ContentStr != nil { + resultContent = *result.Choices[0].Message.Content.ContentStr + } else if result.Choices[0].Message.Content.ContentBlocks != nil { + for _, block := range *result.Choices[0].Message.Content.ContentBlocks { + if block.Text != nil { + resultContent += *block.Text + } + } + } + return resultContent +} diff --git a/docs/http-transport-api.md b/docs/http-transport-api.md index fc87b9e346..c013a7fb66 100644 --- a/docs/http-transport-api.md +++ b/docs/http-transport-api.md @@ -78,6 +78,36 @@ Creates a chat completion using conversational messages. } ``` +#### Request Body with Structured Content (Text and Image) + +```json +{ + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's happening in this image? What's the weather like?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "https://example.com/weather-photo.jpg" + } + } + ] + } + ], + "params": { + "max_tokens": 1000, + "temperature": 0.7 + } +} +``` + #### Response ```json @@ -212,16 +242,34 @@ The main request object for both chat and text completions. Represents a message in a chat conversation. -| Field | Type | Required | Description | -| --------------- | ------------------------------- | -------- | ---------------------------------------------------- | -| `role` | `string` | ✅ | Message role (`user`, `assistant`, `system`, `tool`) | -| `content` | `string` | ❌ | Text content of the message | -| `tool_call_id` | `string` | ❌ | ID of the tool call (for tool messages) | -| `tool_calls` | [`ToolCall[]`](#toolcall) | ❌ | Tool calls made by assistant | -| `image_content` | [`ImageContent`](#imagecontent) | ❌ | Image data in the message | -| `refusal` | `string` | ❌ | Refusal message from assistant | -| `annotations` | `Annotation[]` | ❌ | Message annotations | -| `thought` | `string` | ❌ | Assistant's internal thought process | +| Field | Type | Required | Description | +| -------------- | --------------------------------------------- | -------- | ------------------------------------------------------------------------------- | +| `role` | `string` | ✅ | Message role (`user`, `assistant`, `system`, `tool`) | +| `content` | `string` or [`ContentBlock[]`](#contentblock) | ✅ | Message content - can be simple text or structured content with text and images | +| `tool_call_id` | `string` | ❌ | ID of the tool call (for tool messages) | +| `tool_calls` | [`ToolCall[]`](#toolcall) | ❌ | Tool calls made by assistant | +| `refusal` | `string` | ❌ | Refusal message from assistant | +| `annotations` | `Annotation[]` | ❌ | Message annotations | +| `thought` | `string` | ❌ | Assistant's internal thought process | + +### ContentBlock + +Represents a structured content block in a message (for text and image content). 
+ +| Field | Type | Required | Description | +| ----------- | ----------------------------------- | -------- | ---------------------------------------------- | +| `type` | `string` | ✅ | Content type (`text` or `image_url`) | +| `text` | `string` | ❌ | Text content (required when type is `text`) | +| `image_url` | [`ImageURLStruct`](#imageurlstruct) | ❌ | Image data (required when type is `image_url`) | + +### ImageURLStruct + +Represents image data in a message. + +| Field | Type | Required | Description | +| -------- | -------- | -------- | ------------------------------------------ | +| `url` | `string` | ✅ | Image URL or data URI | +| `detail` | `string` | ❌ | Image detail level (`low`, `high`, `auto`) | ### ModelParameters @@ -369,17 +417,6 @@ Details of a function call. | `name` | `string` | Function name | | `arguments` | `string` | JSON string of function arguments | -### ImageContent - -Image data in a message. - -| Field | Type | Description | -| ------------ | -------- | ------------------------------------------ | -| `type` | `string` | Content type | -| `url` | `string` | Image URL or data URI | -| `media_type` | `string` | MIME type of the image | -| `detail` | `string` | Image detail level (`low`, `high`, `auto`) | - ### BifrostError Error response format. 
@@ -446,6 +483,26 @@ curl -X POST http://localhost:8080/v1/chat/completions \ }' ``` +### Chat with Images + +```bash +curl -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What do you see in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} + ] + } + ] + }' +``` + ### Chat with Tools ```bash @@ -532,11 +589,23 @@ def chat_completion(messages, provider="openai", model="gpt-4o"): ) return response.json() -# Usage +# Simple text message result = chat_completion([ {"role": "user", "content": "Hello, how are you?"} ]) print(result["choices"][0]["message"]["content"]) + +# Structured content with image +result = chat_completion([ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} + ] + } +]) +print(result["choices"][0]["message"]["content"]) ``` ### Node.js @@ -562,12 +631,21 @@ async function chatCompletion(messages, provider = "openai", model = "gpt-4o") { } } -// Usage -chatCompletion([{ role: "user", content: "Hello, how are you?" 
}]).then( - (result) => { - console.log(result.choices[0].message.content); - } -); +// Usage with structured content +chatCompletion([ + { + role: "user", + content: [ + { type: "text", text: "Describe this image" }, + { + type: "image_url", + image_url: { url: "https://example.com/image.jpg" }, + }, + ], + }, +]).then((result) => { + console.log(result.choices[0].message.content); +}); ``` ### Go @@ -590,8 +668,19 @@ type ChatRequest struct { } type BifrostMessage struct { - Role string `json:"role"` - Content string `json:"content"` + Role string `json:"role"` + Content interface{} `json:"content"` // Can be string or []ContentBlock +} + +type ContentBlock struct { + Type string `json:"type"` + Text *string `json:"text,omitempty"` + ImageURL *ImageURLStruct `json:"image_url,omitempty"` +} + +type ImageURLStruct struct { + URL string `json:"url"` + Detail *string `json:"detail,omitempty"` } type ModelParameters struct { diff --git a/docs/openapi.json b/docs/openapi.json index c7a50e6b07..f72a0ee464 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -109,6 +109,35 @@ } ] } + }, + "structured_content": { + "summary": "Chat with structured content (text and image)", + "value": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's happening in this image? What's the weather like?" + }, + { + "type": "image_url", + "image_url": { + "url": "https://example.com/weather-photo.jpg", + "detail": "high" + } + } + ] + } + ], + "params": { + "max_tokens": 1000, + "temperature": 0.7 + } + } } } } @@ -415,9 +444,21 @@ "$ref": "#/components/schemas/MessageRole" }, "content": { - "type": "string", - "description": "Text content of the message", - "example": "Hello, how are you?" + "oneOf": [ + { + "type": "string", + "description": "Simple text content", + "example": "Hello, how are you?" 
+ }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/ContentBlock" + }, + "description": "Structured content with text and images" + } + ], + "description": "Message content - can be simple text or structured content with text and images" }, "tool_call_id": { "type": "string", @@ -430,9 +471,6 @@ }, "description": "Tool calls made by assistant" }, - "image_content": { - "$ref": "#/components/schemas/ImageContent" - }, "refusal": { "type": "string", "description": "Refusal message from assistant" @@ -456,6 +494,67 @@ "description": "Role of the message sender", "example": "user" }, + "ContentBlock": { + "type": "object", + "required": ["type"], + "discriminator": { + "propertyName": "type" + }, + "oneOf": [ + { + "type": "object", + "required": ["type", "text"], + "properties": { + "type": { + "type": "string", + "enum": ["text"], + "description": "Content type for text blocks", + "example": "text" + }, + "text": { + "type": "string", + "description": "Text content", + "example": "What do you see in this image?" 
+ } + }, + "additionalProperties": false + }, + { + "type": "object", + "required": ["type", "image_url"], + "properties": { + "type": { + "type": "string", + "enum": ["image_url"], + "description": "Content type for image blocks", + "example": "image_url" + }, + "image_url": { + "$ref": "#/components/schemas/ImageURLStruct", + "description": "Image data" + } + }, + "additionalProperties": false + } + ] + }, + "ImageURLStruct": { + "type": "object", + "required": ["url"], + "properties": { + "url": { + "type": "string", + "description": "Image URL or data URI", + "example": "https://example.com/image.jpg" + }, + "detail": { + "type": "string", + "enum": ["low", "high", "auto"], + "description": "Image detail level", + "example": "auto" + } + } + }, "ModelParameters": { "type": "object", "properties": { @@ -657,31 +756,6 @@ } } }, - "ImageContent": { - "type": "object", - "properties": { - "type": { - "type": "string", - "description": "Content type" - }, - "url": { - "type": "string", - "description": "Image URL or data URI", - "example": "https://example.com/image.jpg" - }, - "media_type": { - "type": "string", - "description": "MIME type of the image", - "example": "image/jpeg" - }, - "detail": { - "type": "string", - "enum": ["low", "high", "auto"], - "description": "Image detail level", - "example": "auto" - } - } - }, "Annotation": { "type": "object", "required": ["type", "url_citation"],