From c040e1fc64157f570277e8821934a61312a01dec Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 22 Mar 2026 23:06:09 +0000 Subject: [PATCH 1/3] Use GitHub comment character limit instead of fixed 100-line cap for PR diff display Instead of truncating the raw diff to a hardcoded 100 entries, compute a character budget based on GitHub's 65,536-character comment limit minus the space already used by the rest of the comment. The sampling logic now greedily fills the budget with randomly-shuffled entries, displaying as many changes as will fit. The old --max-raw-diff-lines flag is preserved (default None) for backward compatibility but is no longer needed in the common case. https://claude.ai/code/session_01AQNDHAwavNHvBNnCQKAkx3 --- src/ecosystem_analyzer/diff.py | 89 ++++++++++++++++++++++++++++------ src/ecosystem_analyzer/main.py | 6 ++- 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/ecosystem_analyzer/diff.py b/src/ecosystem_analyzer/diff.py index 4524be3..ddf72d6 100644 --- a/src/ecosystem_analyzer/diff.py +++ b/src/ecosystem_analyzer/diff.py @@ -53,6 +53,11 @@ class DiagnosticDiff: RAW_DIFF_SAMPLE_SEED = 137 LARGE_TIMING_CHANGE_THRESHOLD = 0.5 + # GitHub comment body limit is 65,536 characters. Keep a small + # margin so surrounding markup (details/summary tags, etc.) and any + # future additions don't push us over. + GITHUB_COMMENT_CHAR_LIMIT = 65_536 + GITHUB_COMMENT_CHAR_MARGIN = 1_024 def __init__( self, @@ -1164,7 +1169,7 @@ def render_statistics_markdown( self, *, inline_threshold: int = 15, - max_raw_diff_lines: int = 100, + max_raw_diff_lines: int | None = None, ) -> str: statistics = self._calculate_statistics() failed_projects = self.diffs.get("failed_projects", []) @@ -1246,21 +1251,77 @@ def render_statistics_markdown( markdown_content += "\n\n" + # Determine the character budget available for the raw diff block. + # We account for the wrapping markup (details/summary tags, code + # fence, sampling note) so that the final comment stays within + # GitHub's character limit. + char_budget = ( + self.GITHUB_COMMENT_CHAR_LIMIT + - self.GITHUB_COMMENT_CHAR_MARGIN + - len(markdown_content) + ) + + # Reserve space for the static wrapper markup that surrounds the + # diff content. We estimate generously so the budget refers to + # the diff payload itself. + # - code fence: "```diff\n" + "\n```" = 12 chars + # - details/summary (worst case): ~80 chars + # - sampling note (worst case): ~120 chars + # - extra newlines / padding: ~30 chars + _wrapper_overhead = 250 + char_budget -= _wrapper_overhead + displayed_lines = raw_diff_lines sampled = False - if total_raw_diff_changes > max_raw_diff_lines: + displayed_change_count = total_raw_diff_changes + + full_diff_text = "\n".join(raw_diff_lines) + if max_raw_diff_lines is not None: + needs_sampling = total_raw_diff_changes > max_raw_diff_lines + else: + needs_sampling = len(full_diff_text) > char_budget + + if needs_sampling: sampled = True rng = random.Random(self.RAW_DIFF_SAMPLE_SEED) - change_entries = [ - (header, index) - for header, entries in sorted(raw_diff_sections.items()) - for index, (_lines, counts_as_change) in enumerate(entries) - if counts_as_change - ] - selected_entries = { - sampled_entry - for sampled_entry in rng.sample(change_entries, k=max_raw_diff_lines) - } + + # Build a list of (header, index, char_cost) for every + # change entry so we can greedily pick as many as fit. + change_entries: list[tuple[str, int, int]] = [] + for header, entries in sorted(raw_diff_sections.items()): + for index, (lines, counts_as_change) in enumerate(entries): + if counts_as_change: + # +1 for the newline joining + cost = sum(len(line) + 1 for line in lines) + change_entries.append((header, index, cost)) + + # Shuffle deterministically, then greedily pick entries that + # fit within the character budget. + rng.shuffle(change_entries) + + # Account for non-change lines (headers, etc.) that will + # always be included. Compute their cost first. + non_change_cost = 0 + for header, entries in sorted(raw_diff_sections.items()): + # header line + newline + non_change_cost += len(header) + 1 + for _lines, counts_as_change in entries: + if not counts_as_change: + non_change_cost += sum(len(line) + 1 for line in _lines) + # trailing blank line between sections + non_change_cost += 1 + + remaining = char_budget - non_change_cost + selected_entries: set[tuple[str, int]] = set() + for entry_header, entry_index, cost in change_entries: + if max_raw_diff_lines is not None and len(selected_entries) >= max_raw_diff_lines: + break + if cost <= remaining: + selected_entries.add((entry_header, entry_index)) + remaining -= cost + + displayed_change_count = len(selected_entries) + displayed_sections: dict[str, list[tuple[list[str], bool]]] = {} for header, entries in sorted(raw_diff_sections.items()): kept_entries = [] @@ -1276,7 +1337,7 @@ def render_statistics_markdown( if sampled: markdown_content += ( f"_Showing a random sample of " - f"{max_raw_diff_lines} of {total_raw_diff_changes} changes. " + f"{displayed_change_count} of {total_raw_diff_changes} changes. " "See the HTML report for the full diff._\n\n" ) @@ -1288,7 +1349,7 @@ def render_statistics_markdown( else: summary = "Raw diff" if sampled: - summary += f" sample ({max_raw_diff_lines} of {total_raw_diff_changes} changes)" + summary += f" sample ({displayed_change_count} of {total_raw_diff_changes} changes)" else: summary += f" ({total_raw_diff_changes} changes)" markdown_content += f"
\n{summary}\n\n" diff --git a/src/ecosystem_analyzer/main.py b/src/ecosystem_analyzer/main.py index a3b2bb4..68c0944 100644 --- a/src/ecosystem_analyzer/main.py +++ b/src/ecosystem_analyzer/main.py @@ -500,9 +500,11 @@ def generate_timing_diff( @click.option( "--max-raw-diff-lines", type=int, - default=100, + default=None, show_default=True, - help="Maximum number of raw diff changes to include in Markdown before sampling", + help="Maximum number of raw diff changes to include in Markdown before sampling. " + "By default, as many changes as possible are included while staying within " + "GitHub's comment character limit.", ) @click.option( "--old-name", From 044414f7c6c38a5a60aceb67af40504479b849ee Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Mon, 23 Mar 2026 12:31:43 +0000 Subject: [PATCH 2/3] remove `--max-raw-diff-lines` --- src/ecosystem_analyzer/diff.py | 8 +------- src/ecosystem_analyzer/main.py | 11 ----------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/src/ecosystem_analyzer/diff.py b/src/ecosystem_analyzer/diff.py index ddf72d6..58d774a 100644 --- a/src/ecosystem_analyzer/diff.py +++ b/src/ecosystem_analyzer/diff.py @@ -1169,7 +1169,6 @@ def render_statistics_markdown( self, *, inline_threshold: int = 15, - max_raw_diff_lines: int | None = None, ) -> str: statistics = self._calculate_statistics() failed_projects = self.diffs.get("failed_projects", []) @@ -1276,10 +1275,7 @@ def render_statistics_markdown( displayed_change_count = total_raw_diff_changes full_diff_text = "\n".join(raw_diff_lines) - if max_raw_diff_lines is not None: - needs_sampling = total_raw_diff_changes > max_raw_diff_lines - else: - needs_sampling = len(full_diff_text) > char_budget + needs_sampling = len(full_diff_text) > char_budget if needs_sampling: sampled = True @@ -1314,8 +1310,6 @@ def render_statistics_markdown( remaining = char_budget - non_change_cost selected_entries: set[tuple[str, int]] = set() for entry_header, entry_index, cost in change_entries: - if max_raw_diff_lines is not None and len(selected_entries) >= max_raw_diff_lines: - break if cost <= remaining: selected_entries.add((entry_header, entry_index)) remaining -= cost diff --git a/src/ecosystem_analyzer/main.py b/src/ecosystem_analyzer/main.py index 68c0944..245ed18 100644 --- a/src/ecosystem_analyzer/main.py +++ b/src/ecosystem_analyzer/main.py @@ -497,15 +497,6 @@ def generate_timing_diff( show_default=True, help="Show the raw diff inline when it has fewer than this many changes", ) -@click.option( - "--max-raw-diff-lines", - type=int, - default=None, - show_default=True, - help="Maximum number of raw diff changes to include in Markdown before sampling. " - "By default, as many changes as possible are included while staying within " - "GitHub's comment character limit.", -) @click.option( "--old-name", type=str, @@ -527,7 +518,6 @@ def generate_diff_statistics( new_file: str, output: str, inline_threshold: int, - max_raw_diff_lines: int, old_name: str | None, new_name: str | None, fail_on_new_abnormal_exits: bool, @@ -541,7 +531,6 @@ def generate_diff_statistics( diff = DiagnosticDiff(old_file, new_file, old_name=old_name, new_name=new_name) markdown_content = diff.render_statistics_markdown( inline_threshold=inline_threshold, - max_raw_diff_lines=max_raw_diff_lines, ) with open(output, "w") as f: From f0e2c431c91ee5471597939272369a7f057ebeac Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Mon, 23 Mar 2026 12:38:28 +0000 Subject: [PATCH 3/3] better comment --- src/ecosystem_analyzer/diff.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ecosystem_analyzer/diff.py b/src/ecosystem_analyzer/diff.py index 58d774a..e51ad7d 100644 --- a/src/ecosystem_analyzer/diff.py +++ b/src/ecosystem_analyzer/diff.py @@ -53,9 +53,11 @@ class DiagnosticDiff: RAW_DIFF_SAMPLE_SEED = 137 LARGE_TIMING_CHANGE_THRESHOLD = 0.5 - # GitHub comment body limit is 65,536 characters. Keep a small + + # GitHub's comment body limit is 65,536 characters. We keep a small # margin so surrounding markup (details/summary tags, etc.) and any - # future additions don't push us over. + # future additions don't push us over. This margin is effectively a + # safety buffer _on top_ of the calculated size of the summary table. GITHUB_COMMENT_CHAR_LIMIT = 65_536 GITHUB_COMMENT_CHAR_MARGIN = 1_024