diff --git a/src/ecosystem_analyzer/diff.py b/src/ecosystem_analyzer/diff.py index 4524be3..e51ad7d 100644 --- a/src/ecosystem_analyzer/diff.py +++ b/src/ecosystem_analyzer/diff.py @@ -54,6 +54,13 @@ class DiagnosticDiff: RAW_DIFF_SAMPLE_SEED = 137 LARGE_TIMING_CHANGE_THRESHOLD = 0.5 + # GitHub's comment body limit is 65,536 characters. We keep a small + # margin so surrounding markup (details/summary tags, etc.) and any + # future additions don't push us over. This margin is effectively a + # safety buffer _on top_ of the calculated size of the summary table. + GITHUB_COMMENT_CHAR_LIMIT = 65_536 + GITHUB_COMMENT_CHAR_MARGIN = 1_024 + def __init__( self, old_file: str, @@ -1164,7 +1171,6 @@ def render_statistics_markdown( self, *, inline_threshold: int = 15, - max_raw_diff_lines: int = 100, ) -> str: statistics = self._calculate_statistics() failed_projects = self.diffs.get("failed_projects", []) @@ -1246,21 +1252,72 @@ def render_statistics_markdown( markdown_content += "\n\n" + # Determine the character budget available for the raw diff block. + # We account for the wrapping markup (details/summary tags, code + # fence, sampling note) so that the final comment stays within + # GitHub's character limit. + char_budget = ( + self.GITHUB_COMMENT_CHAR_LIMIT + - self.GITHUB_COMMENT_CHAR_MARGIN + - len(markdown_content) + ) + + # Reserve space for the static wrapper markup that surrounds the + # diff content. We estimate generously so the budget refers to + # the diff payload itself. + # - code fence: "```diff\n" + "\n```" = 12 chars + # - details/summary (worst case): ~80 chars + # - sampling note (worst case): ~120 chars + # - extra newlines / padding: ~30 chars + _wrapper_overhead = 250 + char_budget -= _wrapper_overhead + displayed_lines = raw_diff_lines sampled = False - if total_raw_diff_changes > max_raw_diff_lines: + displayed_change_count = total_raw_diff_changes + + full_diff_text = "\n".join(raw_diff_lines) + needs_sampling = len(full_diff_text) > char_budget + + if needs_sampling: sampled = True rng = random.Random(self.RAW_DIFF_SAMPLE_SEED) - change_entries = [ - (header, index) - for header, entries in sorted(raw_diff_sections.items()) - for index, (_lines, counts_as_change) in enumerate(entries) - if counts_as_change - ] - selected_entries = { - sampled_entry - for sampled_entry in rng.sample(change_entries, k=max_raw_diff_lines) - } + + # Build a list of (header, index, char_cost) for every + # change entry so we can greedily pick as many as fit. + change_entries: list[tuple[str, int, int]] = [] + for header, entries in sorted(raw_diff_sections.items()): + for index, (lines, counts_as_change) in enumerate(entries): + if counts_as_change: + # +1 for the newline joining + cost = sum(len(line) + 1 for line in lines) + change_entries.append((header, index, cost)) + + # Shuffle deterministically, then greedily pick entries that + # fit within the character budget. + rng.shuffle(change_entries) + + # Account for non-change lines (headers, etc.) that will + # always be included. Compute their cost first. + non_change_cost = 0 + for header, entries in sorted(raw_diff_sections.items()): + # header line + newline + non_change_cost += len(header) + 1 + for _lines, counts_as_change in entries: + if not counts_as_change: + non_change_cost += sum(len(line) + 1 for line in _lines) + # trailing blank line between sections + non_change_cost += 1 + + remaining = char_budget - non_change_cost + selected_entries: set[tuple[str, int]] = set() + for entry_header, entry_index, cost in change_entries: + if cost <= remaining: + selected_entries.add((entry_header, entry_index)) + remaining -= cost + + displayed_change_count = len(selected_entries) + displayed_sections: dict[str, list[tuple[list[str], bool]]] = {} for header, entries in sorted(raw_diff_sections.items()): kept_entries = [] @@ -1276,7 +1333,7 @@ def render_statistics_markdown( if sampled: markdown_content += ( f"_Showing a random sample of " - f"{max_raw_diff_lines} of {total_raw_diff_changes} changes. " + f"{displayed_change_count} of {total_raw_diff_changes} changes. " "See the HTML report for the full diff._\n\n" ) @@ -1288,7 +1345,7 @@ def render_statistics_markdown( else: summary = "Raw diff" if sampled: - summary += f" sample ({max_raw_diff_lines} of {total_raw_diff_changes} changes)" + summary += f" sample ({displayed_change_count} of {total_raw_diff_changes} changes)" else: summary += f" ({total_raw_diff_changes} changes)" markdown_content += f"
\n{summary}\n\n" diff --git a/src/ecosystem_analyzer/main.py b/src/ecosystem_analyzer/main.py index a3b2bb4..245ed18 100644 --- a/src/ecosystem_analyzer/main.py +++ b/src/ecosystem_analyzer/main.py @@ -497,13 +497,6 @@ def generate_timing_diff( show_default=True, help="Show the raw diff inline when it has fewer than this many changes", ) -@click.option( - "--max-raw-diff-lines", - type=int, - default=100, - show_default=True, - help="Maximum number of raw diff changes to include in Markdown before sampling", -) @click.option( "--old-name", type=str, @@ -525,7 +518,6 @@ def generate_diff_statistics( new_file: str, output: str, inline_threshold: int, - max_raw_diff_lines: int, old_name: str | None, new_name: str | None, fail_on_new_abnormal_exits: bool, @@ -539,7 +531,6 @@ def generate_diff_statistics( diff = DiagnosticDiff(old_file, new_file, old_name=old_name, new_name=new_name) markdown_content = diff.render_statistics_markdown( inline_threshold=inline_threshold, - max_raw_diff_lines=max_raw_diff_lines, ) with open(output, "w") as f: