Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
941f61e
Create specific Subreddit instructions
elzik Nov 1, 2025
6ecdb42
Merge branch 'main' of https://github.com/elzik/breef into improve-pe…
elzik Nov 25, 2025
a5ee7c4
Support development in VS Code
elzik Nov 25, 2025
e5506c6
Ensure Subreddit summaries include valid URLs
elzik Nov 25, 2025
9bdb8ef
Fix summarisation instruction typo
elzik Nov 25, 2025
339b0ce
Instruct to use commet URLs for comments
elzik Nov 25, 2025
5784130
Refactor method overloads & ordering
elzik Nov 25, 2025
94365e9
Remove unused overloads
elzik Nov 26, 2025
dd6c7d8
Use configured Reddit base URL
elzik Nov 26, 2025
a6e5da5
Open links externally and increase likelihood of replies being refernced
elzik Nov 26, 2025
134b56e
Add unhappy path & edge case tests
elzik Nov 26, 2025
0550b04
Allow assigning of null literal to non-nullable reference type for te…
elzik Nov 26, 2025
b0f0d7a
Use ArgumentNullException.ThrowIfNull
elzik Nov 26, 2025
1b7a14e
Make extract title & urls unique for subreddits to make Wallabag entr…
elzik Nov 27, 2025
f127eb9
Avoid Docker tests running in parallel to fix hangs
elzik Nov 28, 2025
b59a87f
Code quality fixes
elzik Nov 28, 2025
25fd92e
Fix typos in subreddit instructions
elzik Nov 28, 2025
49ef7db
Improve reddit post instructions
elzik Nov 28, 2025
ad66d25
Update src/Elzik.Breef.Infrastructure/SummarisationInstructions/Reddi…
elzik Nov 28, 2025
7441b5b
Fix grammatical errors in instructions
elzik Nov 29, 2025
4571b59
Clarify reddit instructions
elzik Nov 29, 2025
c5afe8a
Imrpove HTML content instructions
elzik Nov 29, 2025
6f90e5d
Fix markdown indentation
elzik Nov 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Local",
"type": "coreclr",
"request": "launch",
"program": "${workspaceFolder}/src/Elzik.Breef.Api/bin/Debug/net8.0/Elzik.Breef.Api.dll",
"cwd": "${workspaceFolder}/src/Elzik.Breef.Api",
"stopAtEntry": false
}
]
}
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,4 +145,15 @@ Logging is handled by Serilog and configuration is documented [here](https://git
"MinimumLevel": {
"Default": "Debug" // breef_Serilog__MinimumLevel__Default
}
}
}
```

#### Time Zone

By default, the Docker container for the Breef API uses the UTC time zone. If you need to set a specific time zone for the application running in the container, set the `TZ` environment variable when building or running the container. For example:

```sh
docker run -e TZ=Europe/London ...
```

Replace `Europe/London` with your desired time zone identifier. A [comprehensive list can be found on Wikipedia](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones).
2 changes: 1 addition & 1 deletion src/Elzik.Breef.Application/BreefGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public async Task<PublishedBreef> GenerateBreefAsync(string url)

var summary = await contentSummariser.SummariseAsync(extract.Content, instructions);

var breef = new Domain.Breef(url, extract.Title, summary, extract.PreviewImageUrl);
var breef = new Domain.Breef(extract.OriginalUrl, extract.Title, summary, extract.PreviewImageUrl);

var publishedBreef = await breefPublisher.PublishAsync(breef);

Expand Down
7 changes: 6 additions & 1 deletion src/Elzik.Breef.Domain/Extract.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
namespace Elzik.Breef.Domain;

public record Extract(string Title, string Content, string? PreviewImageUrl, string ExtractType);
/// <summary>
/// Content extracted from a source, together with the metadata that accompanies it.
/// </summary>
/// <param name="Title">Title of the extracted content.</param>
/// <param name="Content">The extracted content itself.</param>
/// <param name="OriginalUrl">
/// URL recorded for this extract. NOTE(review): this is not necessarily the URL that was
/// requested - an extractor may supply an instance-specific value; confirm against each extractor.
/// </param>
/// <param name="PreviewImageUrl">URL of a preview image; nullable, so callers must handle its absence.</param>
/// <param name="ExtractType">
/// Label for the kind of extract. NOTE(review): the set of valid values is not visible here - confirm.
/// </param>
public record Extract(
string Title,
string Content,
string OriginalUrl,
string? PreviewImageUrl,
string ExtractType);
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@

namespace Elzik.Breef.Infrastructure
{
public class CallerFixableHttpRequestException : HttpRequestException, ICallerFixableException
public class CallerFixableHttpRequestException(string message, Exception? innerException = null)
: HttpRequestException(message, innerException), ICallerFixableException
{
public CallerFixableHttpRequestException(string message, Exception? innerException = null)
: base(message, innerException)
{
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ protected override async Task<UntypedExtract> CreateUntypedExtractAsync(string w
var title = GetTitle(htmlDocument, webPageUrl);
var largestImageUrl = GetLargestImageUrl(htmlDocument);


return new UntypedExtract(title, content, largestImageUrl);
return new UntypedExtract(title, content, webPageUrl, largestImageUrl);
}

private static string GetContent(HtmlDocument htmlDocument)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,43 +1,21 @@
using System.Text.Json;
using System.Web;
using Microsoft.Extensions.Options;

namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw;

public class RawRedditPostTransformer : IRawRedditPostTransformer
{
public RedditPost Transform(RawRedditPost rawRedditPost)
{
ArgumentNullException.ThrowIfNull(rawRedditPost);
if (rawRedditPost.Count < 2)
throw new ArgumentException("Reddit post must have at least 2 listings (post and comments)", nameof(rawRedditPost));
private readonly RedditOptions _options;

var postListing = rawRedditPost[0];
var commentsListing = rawRedditPost[1];

var postChildren = postListing.Data?.Children;
if (postChildren == null || postChildren.Count == 0)
throw new ArgumentException("Post listing must contain at least one child", nameof(rawRedditPost));
public RawRedditPostTransformer(IOptions<RedditOptions> options)
{
ArgumentNullException.ThrowIfNull(options);

var mainPostData = postChildren[0].Data;
var bestImage = ExtractBestImage(mainPostData);
if (options.Value == null)
throw new InvalidOperationException("RedditOptions configuration is missing or not bound.");

var redditPost = new RedditPost
{
Post = new RedditPostContent
{
Id = mainPostData.Id ?? string.Empty,
Title = mainPostData.Title ?? throw new InvalidOperationException("Reddit post must have a title"),
Author = mainPostData.Author ?? string.Empty,
Subreddit = mainPostData.Subreddit ?? string.Empty,
Score = mainPostData.Score,
Content = mainPostData.Content ?? string.Empty,
CreatedUtc = mainPostData.CreatedUtc,
ImageUrl = bestImage
},
Comments = TransformComments(commentsListing)
};

return redditPost;
_options = options.Value;
}

private static string? ExtractBestImage(RawRedditCommentData postData)
Expand Down Expand Up @@ -91,7 +69,6 @@ public RedditPost Transform(RawRedditPost rawRedditPost)

return null;
}

private static bool IsImageUrl(string? url)
{
if (string.IsNullOrEmpty(url))
Expand All @@ -104,22 +81,24 @@ private static bool IsImageUrl(string? url)
return extension is ".jpg" or ".jpeg" or ".png" or ".gif" or ".webp" or ".bmp" or ".svg";
}

private List<RedditComment> TransformComments(List<RawRedditChild> children)
private List<RedditComment> TransformComments(List<RawRedditChild> children, string subreddit, string postId, string host)
{
var comments = new List<RedditComment>();

foreach (var child in children)
{
if (child.Kind == "t1")
{
var commentUrl = $"https://{host}/r/{subreddit}/comments/{postId}/comment/{child.Data.Id}/";
var comment = new RedditComment
{
Id = child.Data.Id ?? string.Empty,
Author = child.Data.Author ?? string.Empty,
Score = child.Data.Score,
Content = child.Data.Content ?? string.Empty,
CreatedUtc = child.Data.CreatedUtc,
Replies = TransformComments(child.Data.Replies)
PostUrl = commentUrl,
Replies = TransformComments(child.Data.Replies, subreddit, postId, host)
};

comments.Add(comment);
Expand All @@ -129,7 +108,7 @@ private List<RedditComment> TransformComments(List<RawRedditChild> children)
return comments;
}

private List<RedditComment> TransformComments(object? replies)
private List<RedditComment> TransformComments(object? replies, string subreddit, string postId, string host)
{
if (replies == null)
return [];
Expand All @@ -148,7 +127,7 @@ private List<RedditComment> TransformComments(object? replies)
try
{
var deserializedListing = JsonSerializer.Deserialize<RawRedditListing>(jsonElement.GetRawText());
return TransformComments(deserializedListing);
return TransformComments(deserializedListing, subreddit, postId, host);
}
catch
{
Expand All @@ -157,12 +136,12 @@ private List<RedditComment> TransformComments(object? replies)
}

if (replies is RawRedditListing listing)
return TransformComments(listing);
return TransformComments(listing, subreddit, postId, host);

return [];
}

private List<RedditComment> TransformComments(RawRedditListing? replies)
private List<RedditComment> TransformComments(RawRedditListing? replies, string subreddit, string postId, string host)
{
if (replies == null)
return [];
Expand All @@ -173,6 +152,51 @@ private List<RedditComment> TransformComments(RawRedditListing? replies)
if (replies.Data.Children == null)
return [];

return TransformComments(replies.Data.Children);
return TransformComments(replies.Data.Children, subreddit, postId, host);
}

/// <summary>
/// Converts a raw Reddit post payload into a <see cref="RedditPost"/>.
/// </summary>
/// <param name="rawRedditPost">
/// Raw listings for the post. Element 0 is read as the post listing and element 1 as the
/// comments listing.
/// </param>
/// <returns>The post content together with its transformed comments.</returns>
/// <exception cref="ArgumentNullException"><paramref name="rawRedditPost"/> is null.</exception>
/// <exception cref="ArgumentException">
/// Fewer than two listings were supplied, or the post listing has no children.
/// </exception>
/// <exception cref="InvalidOperationException">
/// The configured default base address is not an absolute URI, or the post has no title.
/// </exception>
public RedditPost Transform(RawRedditPost rawRedditPost)
{
    ArgumentNullException.ThrowIfNull(rawRedditPost);

    if (rawRedditPost.Count < 2)
    {
        throw new ArgumentException(
            "Reddit post must have at least 2 listings (post and comments)",
            nameof(rawRedditPost));
    }

    var postListing = rawRedditPost[0];
    var commentsListing = rawRedditPost[1];

    var postChildren = postListing.Data?.Children;
    if ((postChildren?.Count ?? 0) == 0)
    {
        throw new ArgumentException(
            "Post listing must contain at least one child",
            nameof(rawRedditPost));
    }

    // The first child of the post listing is treated as the post itself.
    var post = postChildren![0].Data;
    var imageUrl = ExtractBestImage(post);

    var subredditName = post.Subreddit ?? string.Empty;
    var id = post.Id ?? string.Empty;
    var url = post.Url ?? string.Empty;

    // Only the host of the configured base address is used when building comment URLs.
    if (!Uri.TryCreate(_options.DefaultBaseAddress, UriKind.Absolute, out var baseUri))
    {
        throw new InvalidOperationException("RedditOptions.DefaultBaseAddress is not a valid absolute URI");
    }

    var content = new RedditPostContent
    {
        Id = id,
        Title = post.Title ?? throw new InvalidOperationException("Reddit post must have a title"),
        Author = post.Author ?? string.Empty,
        Subreddit = subredditName,
        Score = post.Score,
        Content = post.Content ?? string.Empty,
        CreatedUtc = post.CreatedUtc,
        ImageUrl = imageUrl,
        PostUrl = url
    };

    // Comments are transformed after the post content, so a missing title fails first.
    var comments = TransformComments(
        commentsListing.Data?.Children ?? [],
        subredditName,
        id,
        baseUri.Host);

    return new RedditPost
    {
        Post = content,
        Comments = comments
    };
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public class RedditPostContent
public string Content { get; set; } = string.Empty;
public DateTime CreatedUtc { get; set; }
public string? ImageUrl { get; set; }
public string PostUrl { get; set; } = string.Empty;
}

public class RedditComment
Expand All @@ -26,4 +27,5 @@ public class RedditComment
public string Content { get; set; } = string.Empty;
public DateTime CreatedUtc { get; set; }
public List<RedditComment> Replies { get; set; } = [];
public string PostUrl { get; set; } = string.Empty;
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ protected override async Task<UntypedExtract> CreateUntypedExtractAsync(string w

var postId = segments[3];
var post = await redditPostClient.GetPost(postId);
post.Post.PostUrl = webPageUrl;

if (string.IsNullOrWhiteSpace(post.Post.ImageUrl))
{
Expand All @@ -69,6 +70,6 @@ protected override async Task<UntypedExtract> CreateUntypedExtractAsync(string w

var postJson = JsonSerializer.Serialize(post);

return new UntypedExtract(post.Post.Title, postJson, post.Post.ImageUrl);
return new UntypedExtract(post.Post.Title, postJson, webPageUrl, post.Post.ImageUrl);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit;
public class SubredditContentExtractor(
ISubredditClient subredditClient,
IHttpClientFactory httpClientFactory,
TimeProvider timeProvider,
IOptions<RedditOptions> redditOptions)
: ContentExtractorBase, ISubredditImageExtractor
{
Expand Down Expand Up @@ -43,7 +44,11 @@ protected override async Task<UntypedExtract> CreateUntypedExtractAsync(string w
var jsonContent = JsonSerializer.Serialize(newInSubreddit);
var imageUrl = await ExtractImageUrlAsync(webPageUri);

return new UntypedExtract($"New in r/{subredditName}", jsonContent, imageUrl);
var dateAndTime = timeProvider.GetLocalNow().ToString("yyyy-MM-dd HH:mm");
var title = $"New in r/{subredditName} as of {dateAndTime}";
var instanceSpecificOriginalUrl = $"{webPageUrl}#{dateAndTime}";

return new UntypedExtract(title, jsonContent, instanceSpecificOriginalUrl, imageUrl);
}

public async Task<string> GetSubredditImageUrlAsync(string subredditName)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

namespace Elzik.Breef.Infrastructure.ContentExtractors;

public record UntypedExtract(string Title, string Content, string? PreviewImageUrl)
public record UntypedExtract(string Title, string Content, string OriginalUrl, string? PreviewImageUrl)
{
public Extract WithType(string extractType)
=> new(Title, Content, PreviewImageUrl, extractType);
=> new(Title, Content, OriginalUrl, PreviewImageUrl, extractType);
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,24 @@
You are an expert summarizer. Your task is to summarize the provided text:
- Summarise text, including HTML entities.
- Limit summaries to 10% of the original length but never more than 200 words.
- Ensure accurate attribution of information to the correct entities.
- Do not include a link to the original article.
- Do not include the title in the response.
- Do not include any metadata in the response.
- Do not include a code block in the response.
# Task

Summarise the provided web page text.

## Input Structure

The provided text is the main content extracted from a web page.

## Requirements

1. **Overview**
- Summarise text, including HTML entities.
- Limit summaries to 10% of the original length but never more than 200 words.
- Ensure accurate attribution of information to the correct entities.
2. **Exclude**
- Links to original article
- Web page title
- Metadata
- Code blocks

## Output Formatting

- Strictly well-formatted HTML output
- Do not include any Markdown notation, and do not put the summary in a code block
Original file line number Diff line number Diff line change
@@ -1,8 +1,30 @@
You are an expert summarizer. Your task is to summarize the provided text:
- Summarise text, including HTML entities.
- Limit summaries to 10% of the original length but never more then 200 words.
- Ensure accurate attribution of information to the correct entities.
- Do not include a link to the original articles.
- Do not include the title in the response.
- Do not include any metadata in the response.
- Do not include a code block in the response.
# Task

Summarise the provided Reddit post JSON data containing a single post and its nested comments.

## Input Structure

JSON with a top-level "Post" item containing:
- Metadata: Title, Author, Subreddit, Score, Content, CreatedUtc, PostUrl
- Comments: Array with Author, Score, Content, CreatedUtc, PostUrl and nested Replies

## Requirements

1. **Overview**: Describe the general state/themes of the Reddit post
2. **Top Level Post**: Summarise the top-level post in detail
3. **Replies**: Summarise only the highest-scoring replies in lower detail
4. **Summaries**:
- Maximum 200 words OR 10% of original length (whichever is shorter)
- For a top-level post, include post title as HTML link to the post URL: `<a href="RedditPostContent.PostUrl" target="_blank" rel="noopener noreferrer">Title</a>`
- For a post's highest-scoring replies, also summarise them and include author attribution with comment links: `<a href="RedditComment.PostUrl" target="_blank" rel="noopener noreferrer">@author</a>`. Link to the author's comment, not to the author's profile.
5. **Exclude**:
- Links to Subreddit
- Root post title
- Metadata timestamps/scores
- Code blocks
- General description of the subreddit itself

## Output Formatting

- Strictly well-formatted HTML output
- Do not include any Markdown notation, and do not put the summary in a code block
Loading
Loading