diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs
index 6afe1409e75..792bb750b5c 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs
@@ -23,6 +23,11 @@ internal static class DataUriParser
 {
     public static string Scheme => "data:";
 
+    /// <summary>
+    /// The default media type per RFC 2397 when the media type is omitted.
+    /// </summary>
+    public const string DefaultMediaType = "text/plain;charset=US-ASCII";
+
     public static DataUri Parse(ReadOnlyMemory<char> dataUri)
     {
         // Validate, then trim off the "data:" scheme.
@@ -59,9 +64,14 @@ public static DataUri Parse(ReadOnlyMemory<char> dataUri)
         }
 
         // Validate the media type, if present.
+        // Per RFC 2397, if the media type is omitted, it defaults to "text/plain;charset=US-ASCII".
         ReadOnlySpan<char> span = metadata.Span.Trim();
         string? mediaType = null;
-        if (!span.IsEmpty && !IsValidMediaType(span, ref mediaType))
+        if (span.IsEmpty)
+        {
+            mediaType = DefaultMediaType;
+        }
+        else if (!IsValidMediaType(span, ref mediaType))
         {
             throw new UriFormatException("Invalid data URI format: the media type is not a valid.");
         }
@@ -91,6 +101,7 @@ public static bool IsValidMediaType(ReadOnlySpan<char> mediaTypeSpan, [NotNull]
         // For common media types, we can avoid both allocating a string for the span and avoid parsing overheads.
         string? knownType = mediaTypeSpan switch
         {
+            DefaultMediaType => DefaultMediaType,
             "application/json" => "application/json",
             "application/octet-stream" => "application/octet-stream",
             "application/pdf" => "application/pdf",
diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs
index d87d776185a..3e6fa85a489 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs
@@ -109,6 +109,35 @@ public void Ctor_NoMediaType_Roundtrips()
         Assert.Equal("aGVsbG8=", content.Base64Data.ToString());
     }
 
+    [Theory]
+    [InlineData("data:,hello", "hello")]
+    [InlineData("data:;base64,aGVsbG8=", "hello")]
+    [InlineData("data:,hello%20world", "hello world")]
+    [InlineData("data:,", "")]
+    [InlineData("data:;base64,", "")]
+    public void Ctor_OmittedMediaType_DefaultsToTextPlain(string uri, string expectedData)
+    {
+        // Per RFC 2397, if the media type is omitted, it defaults to "text/plain;charset=US-ASCII"
+        static void Validate(DataContent content, string expectedData)
+        {
+            Assert.Equal("text/plain;charset=US-ASCII", content.MediaType);
+            Assert.Equal(expectedData, Encoding.UTF8.GetString(content.Data.ToArray()));
+        }
+
+        Validate(new DataContent(uri), expectedData);
+        Validate(new DataContent(new Uri(uri)), expectedData);
+    }
+
+    [Theory]
+    [InlineData("data:,hello", "application/json")]
+    [InlineData("data:;base64,aGVsbG8=", "application/octet-stream")]
+    public void Ctor_OmittedMediaType_CanBeOverridden(string uri, string mediaType)
+    {
+        // When media type is omitted in the URI but provided as a parameter, the parameter takes precedence
+        var content = new DataContent(uri, mediaType);
+        Assert.Equal(mediaType, content.MediaType);
+    }
+
     [Fact]
     public void Serialize_MatchesExpectedJson()
     {