diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..774bdcd --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,13 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Local", + "type": "coreclr", + "request": "launch", + "program": "${workspaceFolder}/src/Elzik.Breef.Api/bin/Debug/net8.0/Elzik.Breef.Api.dll", + "cwd": "${workspaceFolder}/src/Elzik.Breef.Api", + "stopAtEntry": false + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 1659fec..f087e43 100644 --- a/README.md +++ b/README.md @@ -145,4 +145,15 @@ Logging is handled by Serilog and configuration is documented [here](https://git "MinimumLevel": { "Default": "Debug" // breef_Serilog__MinimumLevel__Default } -} \ No newline at end of file +} +``` + +#### Time Zone + +By default, the Docker container for the Breef API uses the UTC time zone. If you need to set a specific time zone for the application running in the container, set the `TZ` environment variable when building or running the container. For example: + +```sh +docker run -e TZ=Europe/London ... +``` + +Replace `Europe/London` with your desired time zone identifier. A [comprehensive list can be found in Wikipedia](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). 
\ No newline at end of file diff --git a/src/Elzik.Breef.Application/BreefGenerator.cs b/src/Elzik.Breef.Application/BreefGenerator.cs index 0affe27..a642213 100644 --- a/src/Elzik.Breef.Application/BreefGenerator.cs +++ b/src/Elzik.Breef.Application/BreefGenerator.cs @@ -16,7 +16,7 @@ public async Task GenerateBreefAsync(string url) var summary = await contentSummariser.SummariseAsync(extract.Content, instructions); - var breef = new Domain.Breef(url, extract.Title, summary, extract.PreviewImageUrl); + var breef = new Domain.Breef(extract.OriginalUrl, extract.Title, summary, extract.PreviewImageUrl); var publishedBreef = await breefPublisher.PublishAsync(breef); diff --git a/src/Elzik.Breef.Domain/Extract.cs b/src/Elzik.Breef.Domain/Extract.cs index 6e2dd4a..d77fb50 100644 --- a/src/Elzik.Breef.Domain/Extract.cs +++ b/src/Elzik.Breef.Domain/Extract.cs @@ -1,3 +1,8 @@ namespace Elzik.Breef.Domain; -public record Extract(string Title, string Content, string? PreviewImageUrl, string ExtractType); +public record Extract( + string Title, + string Content, + string OriginalUrl, + string? PreviewImageUrl, + string ExtractType); diff --git a/src/Elzik.Breef.Infrastructure/CallerFixableHttpRequestException.cs b/src/Elzik.Breef.Infrastructure/CallerFixableHttpRequestException.cs index fcd96e1..a996e84 100644 --- a/src/Elzik.Breef.Infrastructure/CallerFixableHttpRequestException.cs +++ b/src/Elzik.Breef.Infrastructure/CallerFixableHttpRequestException.cs @@ -2,11 +2,8 @@ namespace Elzik.Breef.Infrastructure { - public class CallerFixableHttpRequestException : HttpRequestException, ICallerFixableException + public class CallerFixableHttpRequestException(string message, Exception? innerException = null) + : HttpRequestException(message, innerException), ICallerFixableException { - public CallerFixableHttpRequestException(string message, Exception? 
innerException = null) - : base(message, innerException) - { - } } } diff --git a/src/Elzik.Breef.Infrastructure/ContentExtractors/HtmlContentExtractor.cs b/src/Elzik.Breef.Infrastructure/ContentExtractors/HtmlContentExtractor.cs index a055040..8fb5bd1 100644 --- a/src/Elzik.Breef.Infrastructure/ContentExtractors/HtmlContentExtractor.cs +++ b/src/Elzik.Breef.Infrastructure/ContentExtractors/HtmlContentExtractor.cs @@ -31,8 +31,7 @@ protected override async Task CreateUntypedExtractAsync(string w var title = GetTitle(htmlDocument, webPageUrl); var largestImageUrl = GetLargestImageUrl(htmlDocument); - - return new UntypedExtract(title, content, largestImageUrl); + return new UntypedExtract(title, content, webPageUrl, largestImageUrl); } private static string GetContent(HtmlDocument htmlDocument) diff --git a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/Raw/RawRedditPostTransformer.cs b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/Raw/RawRedditPostTransformer.cs index ec30a74..8f2d5ef 100644 --- a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/Raw/RawRedditPostTransformer.cs +++ b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/Raw/RawRedditPostTransformer.cs @@ -1,43 +1,21 @@ using System.Text.Json; using System.Web; +using Microsoft.Extensions.Options; namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw; public class RawRedditPostTransformer : IRawRedditPostTransformer { - public RedditPost Transform(RawRedditPost rawRedditPost) - { - ArgumentNullException.ThrowIfNull(rawRedditPost); - if (rawRedditPost.Count < 2) - throw new ArgumentException("Reddit post must have at least 2 listings (post and comments)", nameof(rawRedditPost)); + private readonly RedditOptions _options; - var postListing = rawRedditPost[0]; - var commentsListing = rawRedditPost[1]; - - var postChildren = postListing.Data?.Children; - if (postChildren == null || postChildren.Count == 0) - throw new 
ArgumentException("Post listing must contain at least one child", nameof(rawRedditPost)); + public RawRedditPostTransformer(IOptions options) + { + ArgumentNullException.ThrowIfNull(options); - var mainPostData = postChildren[0].Data; - var bestImage = ExtractBestImage(mainPostData); + if (options.Value == null) + throw new InvalidOperationException("RedditOptions configuration is missing or not bound."); - var redditPost = new RedditPost - { - Post = new RedditPostContent - { - Id = mainPostData.Id ?? string.Empty, - Title = mainPostData.Title ?? throw new InvalidOperationException("Reddit post must have a title"), - Author = mainPostData.Author ?? string.Empty, - Subreddit = mainPostData.Subreddit ?? string.Empty, - Score = mainPostData.Score, - Content = mainPostData.Content ?? string.Empty, - CreatedUtc = mainPostData.CreatedUtc, - ImageUrl = bestImage - }, - Comments = TransformComments(commentsListing) - }; - - return redditPost; + _options = options.Value; } private static string? ExtractBestImage(RawRedditCommentData postData) @@ -91,7 +69,6 @@ public RedditPost Transform(RawRedditPost rawRedditPost) return null; } - private static bool IsImageUrl(string? url) { if (string.IsNullOrEmpty(url)) @@ -104,7 +81,7 @@ private static bool IsImageUrl(string? url) return extension is ".jpg" or ".jpeg" or ".png" or ".gif" or ".webp" or ".bmp" or ".svg"; } - private List TransformComments(List children) + private List TransformComments(List children, string subreddit, string postId, string host) { var comments = new List(); @@ -112,6 +89,7 @@ private List TransformComments(List children) { if (child.Kind == "t1") { + var commentUrl = $"https://{host}/r/{subreddit}/comments/{postId}/comment/{child.Data.Id}/"; var comment = new RedditComment { Id = child.Data.Id ?? string.Empty, @@ -119,7 +97,8 @@ private List TransformComments(List children) Score = child.Data.Score, Content = child.Data.Content ?? 
string.Empty, CreatedUtc = child.Data.CreatedUtc, - Replies = TransformComments(child.Data.Replies) + PostUrl = commentUrl, + Replies = TransformComments(child.Data.Replies, subreddit, postId, host) }; comments.Add(comment); @@ -129,7 +108,7 @@ private List TransformComments(List children) return comments; } - private List TransformComments(object? replies) + private List TransformComments(object? replies, string subreddit, string postId, string host) { if (replies == null) return []; @@ -148,7 +127,7 @@ private List TransformComments(object? replies) try { var deserializedListing = JsonSerializer.Deserialize(jsonElement.GetRawText()); - return TransformComments(deserializedListing); + return TransformComments(deserializedListing, subreddit, postId, host); } catch { @@ -157,12 +136,12 @@ private List TransformComments(object? replies) } if (replies is RawRedditListing listing) - return TransformComments(listing); + return TransformComments(listing, subreddit, postId, host); return []; } - private List TransformComments(RawRedditListing? replies) + private List TransformComments(RawRedditListing? replies, string subreddit, string postId, string host) { if (replies == null) return []; @@ -173,6 +152,51 @@ private List TransformComments(RawRedditListing? 
replies) if (replies.Data.Children == null) return []; - return TransformComments(replies.Data.Children); + return TransformComments(replies.Data.Children, subreddit, postId, host); + } + + public RedditPost Transform(RawRedditPost rawRedditPost) + { + ArgumentNullException.ThrowIfNull(rawRedditPost); + if (rawRedditPost.Count < 2) + throw new ArgumentException("Reddit post must have at least 2 listings (post and comments)", nameof(rawRedditPost)); + + var postListing = rawRedditPost[0]; + var commentsListing = rawRedditPost[1]; + + var postChildren = postListing.Data?.Children; + if (postChildren == null || postChildren.Count == 0) + throw new ArgumentException("Post listing must contain at least one child", nameof(rawRedditPost)); + + var mainPostData = postChildren[0].Data; + var bestImage = ExtractBestImage(mainPostData); + + var subreddit = mainPostData.Subreddit ?? string.Empty; + var postId = mainPostData.Id ?? string.Empty; + var postUrl = mainPostData.Url ?? string.Empty; + + if (!Uri.TryCreate(_options.DefaultBaseAddress, UriKind.Absolute, out var defaultUri)) + { + throw new InvalidOperationException("RedditOptions.DefaultBaseAddress is not a valid absolute URI"); + } + + var redditPost = new RedditPost + { + Post = new RedditPostContent + { + Id = mainPostData.Id ?? string.Empty, + Title = mainPostData.Title ?? throw new InvalidOperationException("Reddit post must have a title"), + Author = mainPostData.Author ?? string.Empty, + Subreddit = subreddit, + Score = mainPostData.Score, + Content = mainPostData.Content ?? string.Empty, + CreatedUtc = mainPostData.CreatedUtc, + ImageUrl = bestImage, + PostUrl = postUrl + }, + Comments = TransformComments(commentsListing.Data?.Children ?? 
[], subreddit, postId, defaultUri.Host) + }; + + return redditPost; } } \ No newline at end of file diff --git a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/RedditPost.cs b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/RedditPost.cs index b9815cf..fe66196 100644 --- a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/RedditPost.cs +++ b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/Client/RedditPost.cs @@ -16,6 +16,7 @@ public class RedditPostContent public string Content { get; set; } = string.Empty; public DateTime CreatedUtc { get; set; } public string? ImageUrl { get; set; } + public string PostUrl { get; set; } = string.Empty; } public class RedditComment @@ -26,4 +27,5 @@ public class RedditComment public string Content { get; set; } = string.Empty; public DateTime CreatedUtc { get; set; } public List Replies { get; set; } = []; + public string PostUrl { get; set; } = string.Empty; } \ No newline at end of file diff --git a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/RedditPostContentExtractor.cs b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/RedditPostContentExtractor.cs index aebe1c1..442375e 100644 --- a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/RedditPostContentExtractor.cs +++ b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/RedditPostContentExtractor.cs @@ -60,6 +60,7 @@ protected override async Task CreateUntypedExtractAsync(string w var postId = segments[3]; var post = await redditPostClient.GetPost(postId); + post.Post.PostUrl = webPageUrl; if (string.IsNullOrWhiteSpace(post.Post.ImageUrl)) { @@ -69,6 +70,6 @@ protected override async Task CreateUntypedExtractAsync(string w var postJson = JsonSerializer.Serialize(post); - return new UntypedExtract(post.Post.Title, postJson, post.Post.ImageUrl); + return new UntypedExtract(post.Post.Title, postJson, webPageUrl, post.Post.ImageUrl); } } diff --git 
a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/SubRedditContentExtractor.cs b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/SubRedditContentExtractor.cs index fef43f3..e0c4e32 100644 --- a/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/SubRedditContentExtractor.cs +++ b/src/Elzik.Breef.Infrastructure/ContentExtractors/Reddit/SubRedditContentExtractor.cs @@ -8,6 +8,7 @@ namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit; public class SubredditContentExtractor( ISubredditClient subredditClient, IHttpClientFactory httpClientFactory, + TimeProvider timeProvider, IOptions redditOptions) : ContentExtractorBase, ISubredditImageExtractor { @@ -43,7 +44,11 @@ protected override async Task CreateUntypedExtractAsync(string w var jsonContent = JsonSerializer.Serialize(newInSubreddit); var imageUrl = await ExtractImageUrlAsync(webPageUri); - return new UntypedExtract($"New in r/{subredditName}", jsonContent, imageUrl); + var dateAndTime = timeProvider.GetLocalNow().ToString("yyyy-MM-dd HH:mm"); + var title = $"New in r/{subredditName} as of {dateAndTime}"; + var instanceSpecificOriginalUrl = $"{webPageUrl}#{dateAndTime}"; + + return new UntypedExtract(title, jsonContent, instanceSpecificOriginalUrl, imageUrl); } public async Task GetSubredditImageUrlAsync(string subredditName) diff --git a/src/Elzik.Breef.Infrastructure/ContentExtractors/UntypedExtract.cs b/src/Elzik.Breef.Infrastructure/ContentExtractors/UntypedExtract.cs index d713d1e..6740dce 100644 --- a/src/Elzik.Breef.Infrastructure/ContentExtractors/UntypedExtract.cs +++ b/src/Elzik.Breef.Infrastructure/ContentExtractors/UntypedExtract.cs @@ -2,8 +2,8 @@ namespace Elzik.Breef.Infrastructure.ContentExtractors; -public record UntypedExtract(string Title, string Content, string? PreviewImageUrl) +public record UntypedExtract(string Title, string Content, string OriginalUrl, string? 
PreviewImageUrl) { public Extract WithType(string extractType) - => new(Title, Content, PreviewImageUrl, extractType); + => new(Title, Content, OriginalUrl, PreviewImageUrl, extractType); } diff --git a/src/Elzik.Breef.Infrastructure/SummarisationInstructions/HtmlContent.md b/src/Elzik.Breef.Infrastructure/SummarisationInstructions/HtmlContent.md index fa84cd9..42c61d0 100644 --- a/src/Elzik.Breef.Infrastructure/SummarisationInstructions/HtmlContent.md +++ b/src/Elzik.Breef.Infrastructure/SummarisationInstructions/HtmlContent.md @@ -1,8 +1,24 @@ -You are an expert summarizer. Your task is to summarize the provided text: -- Summarise text, including HTML entities. -- Limit summaries to 10% of the original length but never more than 200 words. -- Ensure accurate attribution of information to the correct entities. -- Do not include a link to the original article. -- Do not include the title in the response. -- Do not include any metadata in the response. -- Do not include a code block in the response. \ No newline at end of file +# Task + +Summarise the provided web page text. + +## Input Structure + +The provided text is the main content extracted from a web page. + +## Requirements + +1. **Overview** + - Summarise text, including HTML entities. + - Limit summaries to 10% of the original length but never more than 200 words. + - Ensure accurate attribution of information to the correct entities. +2. 
**Exclude** + - Links to original article + - Web page title + - Metadata + - Code blocks + +## Output Formatting + +- Strictly well-formatted HTML output +- Do not include any markdown notation nor put the summary in a codeblock diff --git a/src/Elzik.Breef.Infrastructure/SummarisationInstructions/RedditPostContent.md b/src/Elzik.Breef.Infrastructure/SummarisationInstructions/RedditPostContent.md index de2b8cd..ec4c3fb 100644 --- a/src/Elzik.Breef.Infrastructure/SummarisationInstructions/RedditPostContent.md +++ b/src/Elzik.Breef.Infrastructure/SummarisationInstructions/RedditPostContent.md @@ -1,8 +1,30 @@ -You are an expert summarizer. Your task is to summarize the provided text: - - Summarise text, including HTML entities. - - Limit summaries to 10% of the original length but never more then 200 words. - - Ensure accurate attribution of information to the correct entities. - - Do not include a link to the original articles. - - Do not include the title in the response. - - Do not include any metadata in the response. - - Do not include a code block in the response. \ No newline at end of file +# Task + +Summarise the provided Reddit post JSON data containing a single post and its nested comments. + +## Input Structure + +JSON with a top-level "Post" item containing: +- Metadata: Title, Author, Subreddit, Score, Content, CreatedUtc, PostUrl +- Comments: Array with Author, Score, Content, CreatedUtc, PostUrl and nested Replies + +## Requirements + +1. **Overview**: Describe the general state/themes of the Reddit post +2. **Top Level Post**: Summarise the top-level post in detail +3. **Replies**: Summarise only the highest-scoring replies in lower detail +4. **Summaries**: + - Maximum 200 words OR 10% of original length (whichever is shorter) + - For a top-level post, include post title as HTML link to the post URL: `Title` + - For a post's highest-scoring replies, also summarise them and include author attribution with comment links: `@author`. 
Link to the author's comment, not to the author's profile. +5. **Exclude**: + - Links to Subreddit + - Root post title + - Metadata timestamps/scores + - Code blocks + - General description of the subreddit itself + +## Output Formatting + +- Strictly well-formatted HTML output +- Do not include any markdown notation nor put the summary in a codeblock \ No newline at end of file diff --git a/src/Elzik.Breef.Infrastructure/SummarisationInstructions/SubredditContent.md b/src/Elzik.Breef.Infrastructure/SummarisationInstructions/SubredditContent.md index de2b8cd..96d0a66 100644 --- a/src/Elzik.Breef.Infrastructure/SummarisationInstructions/SubredditContent.md +++ b/src/Elzik.Breef.Infrastructure/SummarisationInstructions/SubredditContent.md @@ -1,8 +1,29 @@ -You are an expert summarizer. Your task is to summarize the provided text: - - Summarise text, including HTML entities. - - Limit summaries to 10% of the original length but never more then 200 words. - - Ensure accurate attribution of information to the correct entities. - - Do not include a link to the original articles. - - Do not include the title in the response. - - Do not include any metadata in the response. - - Do not include a code block in the response. \ No newline at end of file +# Task + +Summarise the provided Reddit subreddit JSON data containing posts and nested comments. + +## Input Structure + +JSON with a "Posts" array, where each post contains: +- Metadata: Title, Author, Subreddit, Score, Content, CreatedUtc, PostUrl +- Comments: Array with Author, Score, Content, CreatedUtc, PostUrl and nested Replies + +## Requirements + +1. **Overview**: Describe the general state/themes of the subreddit +2. **Posts**: Summarise every post with a thematic summary of its comments +3. 
**Summaries**: + - Maximum 200 words OR 10% of original length (whichever is shorter) + - For a top-level post, include post title as HTML link to the post URL: `Title` + - For a post's highest-scoring replies, also summarise them and include author attribution with comment links: `@author`. Link to the author's comment, not to the author's profile. +4. **Exclude**: + - Links to Subreddit + - Root post title + - Metadata timestamps/scores + - Code blocks + - General description of the subreddit itself + +## Output Formatting + +- Strictly well-formatted HTML output +- Do not include any markdown notation nor put the summary in a codeblock diff --git a/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsBase.cs b/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsBase.cs index 74e6526..7c42396 100644 --- a/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsBase.cs +++ b/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsBase.cs @@ -44,7 +44,7 @@ public async Task EndToEndHappyPath() Skip.If(SkipTestsIf, SkipTestsReason); // Arrange - var breef = new { Url = $"https://example.com" }; + var breef = new { Url = "http://example.com" }; // Act var response = await Client.PostAsJsonAsync($"{BaseUrl}/breefs", breef); diff --git a/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsDocker.cs b/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsDocker.cs index c9c9b4a..59686fc 100644 --- a/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsDocker.cs +++ b/tests/Elzik.Breef.Api.Tests.Functional/Breefs/BreefTestsDocker.cs @@ -5,6 +5,7 @@ namespace Elzik.Breef.Api.Tests.Functional.Breefs; +[Collection("Docker Tests")] public class BreefTestsDocker : BreefTestsBase, IAsyncLifetime { private const string DockerImageName = "ghcr.io/elzik/elzik-breef-api:latest"; diff --git a/tests/Elzik.Breef.Api.Tests.Functional/DockerTestsCollection.cs b/tests/Elzik.Breef.Api.Tests.Functional/DockerTestsCollection.cs new file mode 100644 index 0000000..36cb00b 
--- /dev/null +++ b/tests/Elzik.Breef.Api.Tests.Functional/DockerTestsCollection.cs @@ -0,0 +1,13 @@ +namespace Elzik.Breef.Api.Tests.Functional; + +/// +/// xUnit collection definition for Docker-based tests. +/// Ensures Docker tests run serially to prevent container conflicts. +/// +[CollectionDefinition("Docker Tests", DisableParallelization = true)] +public class DockerTestsCollection +{ + // This class has no code, and is never created. Its purpose is simply + // to be the place to apply [CollectionDefinition] and all the + // ICollectionFixture<> interfaces. +} diff --git a/tests/Elzik.Breef.Api.Tests.Functional/Health/HealthTestsDocker.cs b/tests/Elzik.Breef.Api.Tests.Functional/Health/HealthTestsDocker.cs index ade61b6..b062b87 100644 --- a/tests/Elzik.Breef.Api.Tests.Functional/Health/HealthTestsDocker.cs +++ b/tests/Elzik.Breef.Api.Tests.Functional/Health/HealthTestsDocker.cs @@ -5,6 +5,7 @@ namespace Elzik.Breef.Api.Tests.Functional.Health; +[Collection("Docker Tests")] public class HealthTestsDocker : HealthTestsBase, IAsyncLifetime { private const string DockerImageName = "ghcr.io/elzik/elzik-breef-api:latest"; diff --git a/tests/Elzik.Breef.Api.Tests.Integration/Elzik.Breef.Api.Tests.Integration.csproj b/tests/Elzik.Breef.Api.Tests.Integration/Elzik.Breef.Api.Tests.Integration.csproj index c6b0c4d..10a7b61 100644 --- a/tests/Elzik.Breef.Api.Tests.Integration/Elzik.Breef.Api.Tests.Integration.csproj +++ b/tests/Elzik.Breef.Api.Tests.Integration/Elzik.Breef.Api.Tests.Integration.csproj @@ -17,6 +17,7 @@ + diff --git a/tests/Elzik.Breef.Api.Tests.Integration/FileBasedContentSummarisationInstructionProviderTests.cs b/tests/Elzik.Breef.Api.Tests.Integration/FileBasedContentSummarisationInstructionProviderTests.cs index 6c29055..a45d329 100644 --- a/tests/Elzik.Breef.Api.Tests.Integration/FileBasedContentSummarisationInstructionProviderTests.cs +++ b/tests/Elzik.Breef.Api.Tests.Integration/FileBasedContentSummarisationInstructionProviderTests.cs @@ -98,10 
+98,10 @@ public void Instantiated_InvalidRequiredExtractTypeNames_Throws(string[]? requir public static TheoryData EmptyArrayTestData() { - return new TheoryData - { - Array.Empty() - }; + return + [ + [] + ]; } [Fact] diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/HtmlContentExtractorTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/HtmlContentExtractorTests.cs index 187e998..b8e2a36 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/HtmlContentExtractorTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/HtmlContentExtractorTests.cs @@ -1,4 +1,3 @@ -using Elzik.Breef.Domain; using Elzik.Breef.Infrastructure.ContentExtractors; using NSubstitute; using Shouldly; @@ -42,6 +41,7 @@ public async Task Extract_WithValidUrl_ExtractsContent(string testFileName, stri result.Title.ShouldBe(expectedTitle); result.PreviewImageUrl.ShouldBe(expectedPreviewImageUrl); result.ExtractType.ShouldBe("HtmlContent"); + result.OriginalUrl.ShouldBe(mockTestUrl); } [Fact] diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RawRedditPostClientTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RawRedditPostClientTests.cs index bc5022a..57d3add 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RawRedditPostClientTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RawRedditPostClientTests.cs @@ -44,6 +44,7 @@ public async Task GetPost_ValidPostId_ReturnsRedditPost() "at companies – 12 weeks\n\nClean code – 6 weeks\n\nApprenticeship at companies – 16 weeks\n\nExam " + "thesis – 4 weeks"); mainPost.Content.ShouldBe(mainPost.SelfText); + mainPost.Url.ShouldNotBeNullOrWhiteSpace(); var replies = redditPost[1].Data.Children; diff --git 
a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RedditPostClientTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RedditPostClientTests.cs index 3b2d69b..5ee882b 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RedditPostClientTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/Client/RedditPostClientTests.cs @@ -1,5 +1,7 @@ +using Elzik.Breef.Infrastructure.ContentExtractors.Reddit; using Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client; using Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw; +using Microsoft.Extensions.Options; using Refit; using Shouldly; @@ -17,7 +19,11 @@ public async Task GetPost_ValidPostId_ReturnsExpectedRedditPost() "always blocked meaning this test case always fails. This must be run locally instead."); var rawRedditClient = RestService.For("https://www.reddit.com/"); - var transformer = new RawRedditPostTransformer(); + var options = Options.Create(new RedditOptions() + { + DefaultBaseAddress = "https://www.test-reddit.com" + }); + var transformer = new RawRedditPostTransformer(options); var redditClient = new RedditPostClient(rawRedditClient, transformer); var postId = "1kqiwzc"; // https://www.reddit.com/r/learnprogramming/comments/1kqiwzc diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs index ddf6f70..87bdb68 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs @@ -2,6 +2,7 @@ using Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client; using 
Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw; using Microsoft.Extensions.Options; +using Microsoft.Extensions.Time.Testing; using NSubstitute; using Refit; using Shouldly; @@ -15,11 +16,16 @@ public sealed class RedditPostContentExtractorTests : IDisposable private readonly RedditPostContentExtractor _extractor; private readonly HttpClient _httpClient; + private readonly FakeTimeProvider _fakeTimeProvider; public RedditPostContentExtractorTests() { var rawRedditClient = RestService.For("https://www.reddit.com/"); - var transformer = new RawRedditPostTransformer(); + var options = Options.Create(new RedditOptions() + { + DefaultBaseAddress = "https://www.test-reddit.com" + }); + var transformer = new RawRedditPostTransformer(options); var redditPostClient = new RedditPostClient(rawRedditClient, transformer); var rawSubredditClient = RestService.For("https://www.reddit.com/"); @@ -35,7 +41,10 @@ public RedditPostContentExtractorTests() _httpClient.Timeout = TimeSpan.FromSeconds(httpClientOptions.Value.TimeoutSeconds); mockHttpClientFactory.CreateClient("BreefDownloader").Returns(_httpClient); - var subredditImageExtractor = new SubredditContentExtractor(subredditClient, mockHttpClientFactory, redditOptions); + _fakeTimeProvider = new FakeTimeProvider(); + + var subredditImageExtractor = new SubredditContentExtractor( + subredditClient, mockHttpClientFactory, _fakeTimeProvider, redditOptions); _extractor = new RedditPostContentExtractor(redditPostClient, subredditImageExtractor, redditOptions); } @@ -57,12 +66,14 @@ public async Task ExtractAsync_RealRedditPost_ReturnsValidExtract(string url) result.Title.ShouldNotBeNullOrWhiteSpace(); result.Content.ShouldNotBeNullOrWhiteSpace(); result.PreviewImageUrl.ShouldNotBeNullOrWhiteSpace(); + result.OriginalUrl.ShouldBe(url); var redditPost = JsonSerializer.Deserialize(result.Content); redditPost.ShouldNotBeNull(); redditPost.Post.ShouldNotBeNull(); redditPost.Post.Id.ShouldBe("1kqiwzc"); 
redditPost.Post.Title.ShouldNotBeNullOrWhiteSpace(); + redditPost.Post.PostUrl.ShouldNotBeNullOrWhiteSpace(); redditPost.Comments.ShouldNotBeNull(); } @@ -136,6 +147,7 @@ public async Task ExtractAsync_ValidPost_ContentContainsCompleteRedditStructure( redditPost.Post.Author.ShouldNotBeNullOrEmpty(); redditPost.Post.Subreddit.ShouldNotBeNullOrEmpty(); redditPost.Post.CreatedUtc.ShouldNotBe(default); + redditPost.Post.PostUrl.ShouldNotBeNullOrWhiteSpace(); redditPost.Comments.ShouldNotBeNull(); if (redditPost.Comments.Count != 0) { diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Integration/Elzik.Breef.Infrastructure.Tests.Integration.csproj b/tests/Elzik.Breef.Infrastructure.Tests.Integration/Elzik.Breef.Infrastructure.Tests.Integration.csproj index 6eda64a..42b9916 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Integration/Elzik.Breef.Infrastructure.Tests.Integration.csproj +++ b/tests/Elzik.Breef.Infrastructure.Tests.Integration/Elzik.Breef.Infrastructure.Tests.Integration.csproj @@ -25,6 +25,7 @@ + diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorBaseTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorBaseTests.cs index 4f1fa4e..81d3163 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorBaseTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorBaseTests.cs @@ -51,6 +51,7 @@ public async Task ExtractAsync_WhenCalled_PreservesExtractDataFromCore() result.Title.ShouldBe("Test Title"); result.Content.ShouldBe("Test Content"); result.PreviewImageUrl.ShouldBe("https://example.com/image.jpg"); + result.OriginalUrl.ShouldBe(url); } [Fact] @@ -106,7 +107,8 @@ private class ValidTestExtractor : ContentExtractorBase protected override Task CreateUntypedExtractAsync(string webPageUrl) { - return Task.FromResult(new UntypedExtract("Test Title", "Test Content", "https://example.com/image.jpg")); + return 
Task.FromResult(new + UntypedExtract("Test Title", "Test Content", webPageUrl, "https://example.com/image.jpg")); } } @@ -116,7 +118,7 @@ private class AnotherValidExtractor : ContentExtractorBase protected override Task CreateUntypedExtractAsync(string webPageUrl) { - return Task.FromResult(new UntypedExtract("Another Title", "Another Content", null)); + return Task.FromResult(new UntypedExtract("Another Title", "Another Content", "https://original.url.com", null)); } } @@ -126,7 +128,7 @@ private class HtmlContentLikeExtractor : ContentExtractorBase protected override Task CreateUntypedExtractAsync(string webPageUrl) { - return Task.FromResult(new UntypedExtract("HTML Title", "HTML Content", null)); + return Task.FromResult(new UntypedExtract("HTML Title", "HTML Content", "https://original.url.com", null)); } } @@ -136,7 +138,7 @@ private class InvalidTestClass : ContentExtractorBase protected override Task CreateUntypedExtractAsync(string webPageUrl) { - return Task.FromResult(new UntypedExtract("Invalid", "Invalid", null)); + return Task.FromResult(new UntypedExtract("Invalid", "Invalid", "https://original.url.com", null)); } } diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorStrategyTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorStrategyTests.cs index a4da945..e53b8de 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorStrategyTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/ContentExtractorStrategyTests.cs @@ -8,9 +8,9 @@ namespace Elzik.Breef.Infrastructure.Tests.Unit.ContentExtractors; public class ContentExtractorStrategyTests { - private readonly Extract _extractedByExtractor1 = new("Title1", "Content1", "Image1", "Extractor1Type"); - private readonly Extract _extractedByExtractor2 = new("Title2", "Content2", "Image2", "Extractor2Type"); - private readonly Extract _extractedByDefaultExtractor = 
new("DefaultTitle", "DefaultContent", "DefaultImage", "DefaultExtractorType"); + private readonly Extract _extractedByExtractor1 = new("Title1", "Content1", "https://original.url.com", "Image1", "Extractor1Type"); + private readonly Extract _extractedByExtractor2 = new("Title2", "Content2", "https://original.url.com", "Image2", "Extractor2Type"); + private readonly Extract _extractedByDefaultExtractor = new("DefaultTitle", "DefaultContent", "https://original.url.com", "DefaultImage", "DefaultExtractorType"); private readonly IContentExtractor _extractor1 = Substitute.For(); private readonly IContentExtractor _extractor2 = Substitute.For(); diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawNewInSubredditTransformerTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawNewInSubredditTransformerTests.cs index e81949b..7f86645 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawNewInSubredditTransformerTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawNewInSubredditTransformerTests.cs @@ -56,7 +56,8 @@ public async Task Transform_ValidRawNewInSubreddit_ReturnsExpectedStructure() Author = "author1", Score = 100, Content = "Content 1", - CreatedUtc = new DateTime(2025, 1, 1, 12, 0, 0, DateTimeKind.Utc) + CreatedUtc = new DateTime(2025, 1, 1, 12, 0, 0, DateTimeKind.Utc), + PostUrl = "https://reddit.com/r/testsubreddit/comments/post1" }, Comments = [] }; @@ -70,7 +71,8 @@ public async Task Transform_ValidRawNewInSubreddit_ReturnsExpectedStructure() Author = "author2", Score = 200, Content = "Content 2", - CreatedUtc = new DateTime(2025, 1, 1, 13, 0, 0, DateTimeKind.Utc) + CreatedUtc = new DateTime(2025, 1, 1, 13, 0, 0, DateTimeKind.Utc), + PostUrl = "https://reddit.com/r/testsubreddit/comments/post2" }, Comments = [] }; @@ -92,6 +94,7 @@ public async Task Transform_ValidRawNewInSubreddit_ReturnsExpectedStructure() 
firstPost.Post.Author.ShouldBe("author1"); firstPost.Post.Score.ShouldBe(100); firstPost.Post.Content.ShouldBe("Content 1"); + firstPost.Post.PostUrl.ShouldBe("https://reddit.com/r/testsubreddit/comments/post1"); var secondPost = result.Posts[1]; secondPost.Post.Id.ShouldBe("post2"); @@ -99,6 +102,7 @@ public async Task Transform_ValidRawNewInSubreddit_ReturnsExpectedStructure() secondPost.Post.Author.ShouldBe("author2"); secondPost.Post.Score.ShouldBe(200); secondPost.Post.Content.ShouldBe("Content 2"); + secondPost.Post.PostUrl.ShouldBe("https://reddit.com/r/testsubreddit/comments/post2"); } [Fact] @@ -319,7 +323,8 @@ public async Task Transform_SinglePost_ReturnsNewInSubredditWithOnePost() Score = 42, Subreddit = "test", CreatedUtc = new DateTime(2025, 1, 1, 14, 0, 0, DateTimeKind.Utc), - ImageUrl = "https://example.com/image.jpg" + ImageUrl = "https://example.com/image.jpg", + PostUrl = "https://reddit.com/r/test/single_post" }, Comments = [ @@ -352,6 +357,7 @@ public async Task Transform_SinglePost_ReturnsNewInSubredditWithOnePost() post.Post.Score.ShouldBe(42); post.Post.Subreddit.ShouldBe("test"); post.Post.ImageUrl.ShouldBe("https://example.com/image.jpg"); + post.Post.PostUrl.ShouldNotBeNullOrEmpty(); post.Comments.Count.ShouldBe(1); post.Comments[0].Content.ShouldBe("Great post!"); } diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawRedditPostTransformerTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawRedditPostTransformerTests.cs index 41f8371..4ff5e0c 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawRedditPostTransformerTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RawRedditPostTransformerTests.cs @@ -1,11 +1,22 @@ +using Elzik.Breef.Infrastructure.ContentExtractors.Reddit; using Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw; +using Microsoft.Extensions.Options; using 
Shouldly; namespace Elzik.Breef.Infrastructure.Tests.Unit.ContentExtractors.Reddit.Client; public class RawRedditPostTransformerTests { - private readonly RawRedditPostTransformer _transformer = new(); + private readonly RawRedditPostTransformer _transformer; + + public RawRedditPostTransformerTests() + { + var options = Options.Create(new RedditOptions() + { + DefaultBaseAddress = "https://www.test-reddit.com" + }); + _transformer = new RawRedditPostTransformer(options); + } [Fact] public void Transform_ValidRedditPost_ReturnsExpectedStructure() @@ -112,6 +123,7 @@ public void Transform_ValidRedditPost_ReturnsExpectedStructure() comment.Content.ShouldBe("This is a comment"); comment.Score.ShouldBe(50); comment.CreatedUtc.ShouldBe(new DateTime(2025, 1, 1, 12, 30, 0, DateTimeKind.Utc)); + comment.PostUrl.ShouldBe("https://www.test-reddit.com/r/testsubreddit/comments/test123/comment/comment123/"); comment.Replies.Count.ShouldBe(1); var reply = comment.Replies[0]; @@ -120,6 +132,7 @@ public void Transform_ValidRedditPost_ReturnsExpectedStructure() reply.Content.ShouldBe("This is a reply"); reply.Score.ShouldBe(25); reply.CreatedUtc.ShouldBe(new DateTime(2025, 1, 1, 13, 0, 0, DateTimeKind.Utc)); + reply.PostUrl.ShouldBe("https://www.test-reddit.com/r/testsubreddit/comments/test123/comment/reply123/"); reply.Replies.Count.ShouldBe(0); } @@ -432,4 +445,775 @@ public void Transform_NullRawRedditPost_ThrowsArgumentNullException() Should.Throw(() => _transformer.Transform(null!)) .ParamName.ShouldBe("rawRedditPost"); } -} \ No newline at end of file + + + [Fact] + public void Instantiated_NullOptions_ThrowsArgumentNullException() + { + // Act & Assert + Should.Throw(() => new RawRedditPostTransformer(null!)) + .ParamName.ShouldBe("options"); + } + + [Fact] + public void Instantiated_NullOptionsValue_ThrowsInvalidOperationException() + { + // Arrange + var options = Options.Create(null!); + + // Act & Assert + Should.Throw(() => new RawRedditPostTransformer(options)) + 
.Message.ShouldContain("RedditOptions configuration is missing or not bound"); + } + + [Fact] + public void Transform_PostListingWithNoChildren_ThrowsArgumentException() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = [] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act & Assert + Should.Throw(() => _transformer.Transform(redditPost)) + .Message.ShouldContain("Post listing must contain at least one child"); + } + + [Fact] + public void Transform_PostWithNullTitle_ThrowsInvalidOperationException() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = null, + Author = "testuser", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act & Assert + Should.Throw(() => _transformer.Transform(redditPost)) + .Message.ShouldContain("Reddit post must have a title"); + } + + [Fact] + public void Transform_InvalidRedditOptionsBaseAddress_ThrowsInvalidOperationException() + { + // Arrange + var options = Options.Create(new RedditOptions + { + DefaultBaseAddress = "not-a-valid-uri" + }); + var transformer = new RawRedditPostTransformer(options); + + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Test Post", + Author = "testuser", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act & Assert + Should.Throw(() => 
transformer.Transform(redditPost)) + .Message.ShouldContain("RedditOptions.DefaultBaseAddress is not a valid absolute URI"); + } + + [Fact] + public void Transform_PostWithHtmlEncodedGalleryUrl_DecodesUrlCorrectly() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Gallery Post", + Author = "testuser", + IsGallery = true, + GalleryData = new RawRedditGalleryData + { + Items = [new RawRedditGalleryItem { MediaId = "img1" }] + }, + MediaMetadata = new Dictionary + { + ["img1"] = new RawRedditMediaMetadata + { + Status = "valid", + Source = new RawRedditImageSource + { + Url = "https://i.redd.it/gallery&amp;test.jpg", + Width = 1000, + Height = 800 + } + } + }, + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Post.ImageUrl.ShouldBe("https://i.redd.it/gallery&test.jpg"); + } + + [Fact] + public void Transform_PostWithHtmlEncodedPreviewUrl_DecodesUrlCorrectly() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Preview Post", + Author = "testuser", + Preview = new RawRedditPreview + { + Enabled = true, + Images = + [ + new RawRedditPreviewImage + { + Source = new RawRedditImageSource + { + Url = "https://preview.redd.it/test&amp;image.jpg", + Width = 800, + Height = 600 + } + } + ] + }, + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = _transformer.Transform(redditPost); + + //
Assert + result.Post.ImageUrl.ShouldBe("https://preview.redd.it/test&image.jpg"); + } + + [Fact] + public void Transform_PostWithUrlOverriddenByDest_UsesOverriddenUrl() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Overridden URL Post", + Author = "testuser", + Url = "https://i.redd.it/original.jpg", + UrlOverriddenByDest = "https://i.redd.it/overridden.jpg", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Post.ImageUrl.ShouldBe("https://i.redd.it/overridden.jpg"); + } + + [Fact] + public void Transform_PostWithMultiplePreviewImages_SelectsLargestImage() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Multiple Preview Images", + Author = "testuser", + Preview = new RawRedditPreview + { + Enabled = true, + Images = + [ + new RawRedditPreviewImage + { + Source = new RawRedditImageSource + { + Url = "https://preview.redd.it/small.jpg", + Width = 400, + Height = 300 + } + }, + new RawRedditPreviewImage + { + Source = new RawRedditImageSource + { + Url = "https://preview.redd.it/large.jpg", + Width = 1600, + Height = 1200 + } + }, + new RawRedditPreviewImage + { + Source = new RawRedditImageSource + { + Url = "https://preview.redd.it/medium.jpg", + Width = 800, + Height = 600 + } + } + ] + }, + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = 
_transformer.Transform(redditPost); + + // Assert + result.Post.ImageUrl.ShouldBe("https://preview.redd.it/large.jpg"); + } + + [Fact] + public void Transform_PostWithInvalidGalleryMetadata_SkipsInvalidImages() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Gallery with Invalid Metadata", + Author = "testuser", + IsGallery = true, + GalleryData = new RawRedditGalleryData + { + Items = + [ + new RawRedditGalleryItem { MediaId = "invalid" }, + new RawRedditGalleryItem { MediaId = "valid" } + ] + }, + MediaMetadata = new Dictionary + { + ["invalid"] = new RawRedditMediaMetadata + { + Status = "invalid", + Source = new RawRedditImageSource + { + Url = "https://i.redd.it/invalid.jpg", + Width = 1000, + Height = 800 + } + }, + ["valid"] = new RawRedditMediaMetadata + { + Status = "valid", + Source = new RawRedditImageSource + { + Url = "https://i.redd.it/valid.jpg", + Width = 800, + Height = 600 + } + } + }, + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Post.ImageUrl.ShouldBe("https://i.redd.it/valid.jpg"); + } + + [Fact] + public void Transform_PostWithDefaultThumbnail_IgnoresThumbnail() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Default Thumbnail Post", + Author = "testuser", + Thumbnail = "default", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = 
_transformer.Transform(redditPost); + + // Assert + result.Post.ImageUrl.ShouldBeNull(); + } + + [Fact] + public void Transform_PostWithNsfwThumbnail_IgnoresThumbnail() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "NSFW Thumbnail Post", + Author = "testuser", + Thumbnail = "nsfw", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Post.ImageUrl.ShouldBeNull(); + } + + [Fact] + public void Transform_PostWithNonImageUrl_ReturnsNullImageUrl() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Non-Image URL Post", + Author = "testuser", + Url = "https://example.com/article.html", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing { Data = new RawRedditListingData { Children = [] } } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Post.ImageUrl.ShouldBeNull(); + } + + [Fact] + public void Transform_CommentsWithEmptyStringReplies_ReturnsEmptyList() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Test Post", + Author = "testuser", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new 
RawRedditChild + { + Kind = "t1", + Data = new RawRedditCommentData + { + Id = "comment123", + Author = "commenter", + Body = "Comment with empty string replies", + Score = 10, + CreatedUtc = DateTime.UtcNow, + Replies = "" + } + } + ] + } + } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Comments.Count.ShouldBe(1); + result.Comments[0].Replies.Count.ShouldBe(0); + } + + [Fact] + public void Transform_CommentsWithNullListingData_ReturnsEmptyList() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Test Post", + Author = "testuser", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t1", + Data = new RawRedditCommentData + { + Id = "comment123", + Author = "commenter", + Body = "Comment", + Score = 10, + CreatedUtc = DateTime.UtcNow, + Replies = new RawRedditListing { Data = null! 
} + } + } + ] + } + } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Comments.Count.ShouldBe(1); + result.Comments[0].Replies.Count.ShouldBe(0); + } + + [Fact] + public void Transform_CommentsWithNullChildren_ReturnsEmptyList() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Test Post", + Author = "testuser", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t1", + Data = new RawRedditCommentData + { + Id = "comment123", + Author = "commenter", + Body = "Comment", + Score = 10, + CreatedUtc = DateTime.UtcNow, + Replies = new RawRedditListing + { + Data = new RawRedditListingData { Children = null! 
} + } + } + } + ] + } + } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Comments.Count.ShouldBe(1); + result.Comments[0].Replies.Count.ShouldBe(0); + } + + [Fact] + public void Transform_CommentsWithNonT1Kind_SkipsComment() + { + // Arrange + var redditPost = new RawRedditPost + { + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "t3", + Data = new RawRedditCommentData + { + Id = "test123", + Title = "Test Post", + Author = "testuser", + CreatedUtc = DateTime.UtcNow + } + } + ] + } + }, + new RawRedditListing + { + Kind = "Listing", + Data = new RawRedditListingData + { + Children = + [ + new RawRedditChild + { + Kind = "more", + Data = new RawRedditCommentData + { + Id = "more123", + Author = "system", + CreatedUtc = DateTime.UtcNow + } + }, + new RawRedditChild + { + Kind = "t1", + Data = new RawRedditCommentData + { + Id = "comment123", + Author = "commenter", + Body = "Valid comment", + Score = 10, + CreatedUtc = DateTime.UtcNow + } + } + ] + } + } + }; + + // Act + var result = _transformer.Transform(redditPost); + + // Assert + result.Comments.Count.ShouldBe(1); + result.Comments[0].Id.ShouldBe("comment123"); + } +} diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RedditPostClientTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RedditPostClientTests.cs index 35a8988..a633c71 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RedditPostClientTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/Client/RedditPostClientTests.cs @@ -55,7 +55,7 @@ public async Task GetPost_ValidRedditPost_ReturnsTransformedPost() comment.Content.ShouldBe("This is a comment"); comment.Score.ShouldBe(50); comment.CreatedUtc.ShouldBe(new DateTime(2025, 1, 1, 12, 30, 0, DateTimeKind.Utc)); - + 
comment.PostUrl.ShouldBe($"https://www.reddit.com/r/testsubreddit/comments/test123/comment/comment123/"); comment.Replies.Count.ShouldBe(1); var reply = comment.Replies[0]; @@ -64,6 +64,7 @@ public async Task GetPost_ValidRedditPost_ReturnsTransformedPost() reply.Content.ShouldBe("This is a reply"); reply.Score.ShouldBe(25); reply.Replies.Count.ShouldBe(0); + reply.PostUrl.ShouldBe($"https://www.reddit.com/r/testsubreddit/comments/test123/comment/reply123/"); _ = _mockRawClient.Received(1).GetPost(postId); @@ -267,6 +268,7 @@ private static RedditPost CreateExpectedTransformedResult() Content = "This is a comment", Score = 50, CreatedUtc = new DateTime(2025, 1, 1, 12, 30, 0, DateTimeKind.Utc), + PostUrl = "https://www.reddit.com/r/testsubreddit/comments/test123/comment/comment123/", Replies = [ new() { @@ -275,6 +277,7 @@ private static RedditPost CreateExpectedTransformedResult() Content = "This is a reply", Score = 25, CreatedUtc = new DateTime(2025, 1, 1, 13, 0, 0, DateTimeKind.Utc), + PostUrl = "https://www.reddit.com/r/testsubreddit/comments/test123/comment/reply123/", Replies = [] } ] diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs index 6d51bad..7b1326c 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/RedditPostContentExtractorTests.cs @@ -210,12 +210,27 @@ public async Task ExtractAsync_ValidUrl_ReturnsCorrectTitle() result.Title.ShouldBe(expectedTitle); } + [Fact] + public async Task ExtractAsync_ValidUrl_ReturnsCorrectOriginalUrl() + { + // Arrange + var url = "https://www.reddit.com/r/programming/comments/abc123/title"; + var testPost = CreateTestRedditPost("abc123", "title", "https://example.com/image.jpg"); + 
_mockRedditPostClient.GetPost("abc123").Returns(testPost); + + // Act + var result = await _extractor.ExtractAsync(url); + + // Assert + result.OriginalUrl.ShouldBe(url); + } + [Fact] public async Task ExtractAsync_ValidUrl_ReturnsSerializedPostAsContent() { // Arrange var url = "https://www.reddit.com/r/programming/comments/abc123/title"; - var testPost = CreateTestRedditPost("abc123", "Test Title", "https://example.com/image.jpg"); + var testPost = CreateTestRedditPost("abc123", "Test Title", "https://example.com/image.jpg", url); _mockRedditPostClient.GetPost("abc123").Returns(testPost); // Act @@ -226,6 +241,7 @@ public async Task ExtractAsync_ValidUrl_ReturnsSerializedPostAsContent() deserializedPost.ShouldNotBeNull(); deserializedPost.Post.Id.ShouldBe("abc123"); deserializedPost.Post.Title.ShouldBe("Test Title"); + deserializedPost.Post.PostUrl.ShouldBe(url); } [Theory] @@ -349,7 +365,7 @@ public async Task ExtractAsync_SubredditImageExtractorThrows_PropagatesException await Should.ThrowAsync(() => _extractor.ExtractAsync(url)); } - private static RedditPost CreateTestRedditPost(string id, string title, string? imageUrl) => new() + private static RedditPost CreateTestRedditPost(string id, string title, string? imageUrl, string? postUrl = null) => new() { Post = new RedditPostContent { @@ -360,7 +376,8 @@ public async Task ExtractAsync_SubredditImageExtractorThrows_PropagatesException Score = 100, Content = "Test post content", CreatedUtc = DateTime.UtcNow, - ImageUrl = imageUrl + ImageUrl = imageUrl, + PostUrl = postUrl ?? 
$"https://reddit.com/r/testsubreddit/comments/{id}" }, Comments = [ diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/SubRedditExtractorTests.cs b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/SubRedditExtractorTests.cs index 3b6f49e..f342af4 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/SubRedditExtractorTests.cs +++ b/tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/Reddit/SubRedditExtractorTests.cs @@ -1,8 +1,11 @@ -using Elzik.Breef.Infrastructure.ContentExtractors.Reddit; +using Elzik.Breef.Domain; +using Elzik.Breef.Infrastructure.ContentExtractors.Reddit; using Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client; using Microsoft.Extensions.Options; +using Microsoft.Extensions.Time.Testing; using NSubstitute; using Shouldly; +using System; using System.Text.Json; namespace Elzik.Breef.Infrastructure.Tests.Unit.ContentExtractors.Reddit @@ -15,12 +18,13 @@ public class SubredditExtractorTests private readonly IHttpClientFactory _mockHttpClientFactory; private readonly IOptions _mockRedditOptions; private readonly SubredditContentExtractor _extractor; + private readonly FakeTimeProvider _fakeTimeProvider; public SubredditExtractorTests() { _mockSubredditClient = Substitute.For(); _mockSubredditClient.GetNewInSubreddit(Arg.Any()) - .Returns(new NewInSubreddit { Posts = new List() }); + .Returns(new NewInSubreddit { Posts = [] }); _mockHttpClientFactory = Substitute.For(); var mockHandler = new MockHttpMessageHandler(JsonSerializer.Serialize(new { data = new { } }), System.Net.HttpStatusCode.OK); @@ -35,7 +39,9 @@ public SubredditExtractorTests() FallbackImageUrl = FallbackImageUrl }); - _extractor = new SubredditContentExtractor(_mockSubredditClient, _mockHttpClientFactory, _mockRedditOptions); + _fakeTimeProvider = new FakeTimeProvider(new DateTimeOffset(2015, 10, 21, 7, 28, 0, TimeSpan.Zero)); + + _extractor = new 
SubredditContentExtractor(_mockSubredditClient, _mockHttpClientFactory, _fakeTimeProvider, _mockRedditOptions); } [Theory] @@ -81,7 +87,8 @@ public void CanHandle_CustomRedditInstance_ReturnsTrue(string url) FallbackImageUrl = FallbackImageUrl }; _mockRedditOptions.Value.Returns(customOptions); - var extractor = new SubredditContentExtractor(_mockSubredditClient, _mockHttpClientFactory, _mockRedditOptions); + var extractor = new SubredditContentExtractor( + _mockSubredditClient, _mockHttpClientFactory, _fakeTimeProvider, _mockRedditOptions); // Act var canHandle = extractor.CanHandle(url); @@ -103,7 +110,8 @@ public void CanHandle_UnknownRedditInstance_ReturnsFalse(string url) FallbackImageUrl = FallbackImageUrl }; _mockRedditOptions.Value.Returns(customOptions); - var extractor = new SubredditContentExtractor(_mockSubredditClient, _mockHttpClientFactory, _mockRedditOptions); + var extractor = new SubredditContentExtractor( + _mockSubredditClient, _mockHttpClientFactory, _fakeTimeProvider, _mockRedditOptions); // Act var canHandle = extractor.CanHandle(url); @@ -193,7 +201,7 @@ public async Task ExtractAsync_AvailableContent_ReturnsExpectedTitle() var result = await _extractor.ExtractAsync(url); // Assert - result.Title.ShouldBe($"New in r/subreddit"); + result.Title.ShouldBe($"New in r/subreddit as of 2015-10-21 07:28"); } [Fact] @@ -213,12 +221,12 @@ public async Task ExtractAsync_AvailableContent_ReturnsExpectedContent() Content = "Test content", CreatedUtc = new DateTime(2024, 1, 1, 0, 0, 0, DateTimeKind.Utc) }, - Comments = new List() + Comments = [] }; var newInSubreddit = new NewInSubreddit { - Posts = new List { samplePost } + Posts = [samplePost] }; var expectedJson = JsonSerializer.Serialize(newInSubreddit); @@ -485,10 +493,27 @@ public async Task ExtractAsync_UrlWithQueryString_ExtractsCorrectSubredditName() var result = await _extractor.ExtractAsync("https://www.reddit.com/r/dotnet/?utm_source=share#section"); // Assert - result.Title.ShouldBe("New in 
r/dotnet"); + result.Title.ShouldBe("New in r/dotnet as of 2015-10-21 07:28"); await _mockSubredditClient.Received(1).GetNewInSubreddit("dotnet"); } + [Fact] + public async Task ExtractAsync_ValidUrl_GeneratesInstanceSpecificOriginalUrl() + { + // Arrange + var json = JsonSerializer.Serialize(new { data = new { } }); + var mockHandler = new MockHttpMessageHandler(json, System.Net.HttpStatusCode.OK); + var httpClient = new HttpClient(mockHandler); + _mockHttpClientFactory.CreateClient("BreefDownloader").Returns(httpClient); + var url = "https://www.reddit.com/r/dotnet"; + + // Act - plain subreddit URL with no query string or fragment + var result = await _extractor.ExtractAsync(url); + + // Assert + result.OriginalUrl.ShouldBe($"{url}#{_fakeTimeProvider.GetLocalNow():yyyy-MM-dd HH:mm}"); + } + [Theory] [InlineData("null")] [InlineData("empty")] @@ -512,7 +537,7 @@ public async Task ExtractAsync_ImageUrlIsInvalid_UsesFallbackImageUrl(string inv var json = CreateJsonWithImageKey("icon_img", imageUrl); _mockSubredditClient.GetNewInSubreddit("subreddit") - .Returns(new NewInSubreddit { Posts = new List() }); + .Returns(new NewInSubreddit { Posts = [] }); var mockHandler = new MockHttpMessageHandler(json, System.Net.HttpStatusCode.OK); var httpClient = new HttpClient(mockHandler); @@ -540,52 +565,38 @@ private static string CreateJsonWithImageKey(string key, string? value) return JsonSerializer.Serialize(new { data }); } - private class MockHttpMessageHandler : HttpMessageHandler + private class MockHttpMessageHandler( + string defaultResponse, + System.Net.HttpStatusCode defaultStatusCode, + string? failUrl = null, + System.Net.HttpStatusCode failStatusCode = System.Net.HttpStatusCode.NotFound) + : HttpMessageHandler { - private readonly string _defaultResponse; - private readonly System.Net.HttpStatusCode _defaultStatusCode; - private readonly string?
_failUrl; - private readonly System.Net.HttpStatusCode _failStatusCode; - - public MockHttpMessageHandler(string defaultResponse, System.Net.HttpStatusCode defaultStatusCode, string? failUrl = null, System.Net.HttpStatusCode failStatusCode = System.Net.HttpStatusCode.NotFound) - { - _defaultResponse = defaultResponse; - _defaultStatusCode = defaultStatusCode; - _failUrl = failUrl; - _failStatusCode = failStatusCode; - } - protected override Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) { - if (_failUrl != null && request.RequestUri?.AbsoluteUri == _failUrl) + if (failUrl != null && request.RequestUri?.AbsoluteUri == failUrl) { return Task.FromResult(new HttpResponseMessage { - StatusCode = _failStatusCode, + StatusCode = failStatusCode, Content = new StringContent("") }); } return Task.FromResult(new HttpResponseMessage { - StatusCode = _defaultStatusCode, - Content = new StringContent(_defaultResponse) + StatusCode = defaultStatusCode, + Content = new StringContent(defaultResponse) }); } } - private class ThrowingMockHttpMessageHandler : HttpMessageHandler + private class ThrowingMockHttpMessageHandler(Exception exception) : HttpMessageHandler { - private readonly Exception _exception; - - public ThrowingMockHttpMessageHandler(Exception exception) - { - _exception = exception; - } - - protected override Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + protected override Task SendAsync( + HttpRequestMessage request, CancellationToken cancellationToken) { - throw _exception; + throw exception; } } } diff --git a/tests/Elzik.Breef.Infrastructure.Tests.Unit/Elzik.Breef.Infrastructure.Tests.Unit.csproj b/tests/Elzik.Breef.Infrastructure.Tests.Unit/Elzik.Breef.Infrastructure.Tests.Unit.csproj index b48d794..aa6a0f5 100644 --- a/tests/Elzik.Breef.Infrastructure.Tests.Unit/Elzik.Breef.Infrastructure.Tests.Unit.csproj +++ 
b/tests/Elzik.Breef.Infrastructure.Tests.Unit/Elzik.Breef.Infrastructure.Tests.Unit.csproj @@ -20,6 +20,7 @@ + diff --git a/tests/SampleRequests/Elzik.Breef.Api.http b/tests/SampleRequests/Elzik.Breef.Api.http index a65c9cf..ebc7ef8 100644 --- a/tests/SampleRequests/Elzik.Breef.Api.http +++ b/tests/SampleRequests/Elzik.Breef.Api.http @@ -1,33 +1,36 @@ -@Elzik.Breef.Api_HostAddress = http://localhost:5079 +@host = http://localhost:5079 ### HTML -Post {{Elzik.Breef.Api_HostAddress}}/breefs +POST {{host}}/breefs Content-Type: application/json BREEF-API-KEY: test-key + { - "url":"https://www.positive.news/society/swiping-less-living-more-how-to-take-control-of-our-digital-lives/" + "url": "https://www.positive.news/society/swiping-less-living-more-how-to-take-control-of-our-digital-lives/" } ### Reddit Post -Post {{Elzik.Breef.Api_HostAddress}}/breefs +POST {{host}}/breefs Content-Type: application/json BREEF-API-KEY: test-key + { - "url":"https://www.reddit.com/r/selfhosted/comments/1ojndg6/advice_should_i_buy_a_new_router_or_build_one/" + "url": "https://www.reddit.com/r/selfhosted/comments/1ojndg6/advice_should_i_buy_a_new_router_or_build_one/" } ### Subreddit -Post {{Elzik.Breef.Api_HostAddress}}/breefs +POST {{host}}/breefs Content-Type: application/json BREEF-API-KEY: test-key + { - "url":"https://www.reddit.com/r/dotnet/" + "url": "https://www.reddit.com/r/dotnet/" } ### Health -Get {{Elzik.Breef.Api_HostAddress}}/health -Content-Type: application/json \ No newline at end of file +GET {{host}}/health +Content-Type: application/json