From c040e1fc64157f570277e8821934a61312a01dec Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 22 Mar 2026 23:06:09 +0000
Subject: [PATCH 1/3] Use GitHub comment character limit instead of fixed
 100-line cap for PR diff display

Instead of randomly sampling the raw diff down to a hardcoded 100
changes, compute a character budget from GitHub's 65,536-character
comment limit minus a safety margin and the space already used by the
rest of the comment.  The sampling logic now shuffles the change
entries deterministically and greedily keeps every entry that still
fits in the budget, so as many changes as possible are displayed.  The
old --max-raw-diff-lines flag is preserved (default None) for backward
compatibility but is no longer needed in the common case.

https://claude.ai/code/session_01AQNDHAwavNHvBNnCQKAkx3
---
 src/ecosystem_analyzer/diff.py | 89 ++++++++++++++++++++++++++++------
 src/ecosystem_analyzer/main.py |  6 ++-
 2 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/src/ecosystem_analyzer/diff.py b/src/ecosystem_analyzer/diff.py
index 4524be3..ddf72d6 100644
--- a/src/ecosystem_analyzer/diff.py
+++ b/src/ecosystem_analyzer/diff.py
@@ -53,6 +53,11 @@ class DiagnosticDiff:
 
     RAW_DIFF_SAMPLE_SEED = 137
     LARGE_TIMING_CHANGE_THRESHOLD = 0.5
+    # GitHub comment body limit is 65,536 characters.  Keep a small
+    # margin so surrounding markup (details/summary tags, etc.) and any
+    # future additions don't push us over.
+    GITHUB_COMMENT_CHAR_LIMIT = 65_536
+    GITHUB_COMMENT_CHAR_MARGIN = 1_024
 
     def __init__(
         self,
@@ -1164,7 +1169,7 @@ def render_statistics_markdown(
         self,
         *,
         inline_threshold: int = 15,
-        max_raw_diff_lines: int = 100,
+        max_raw_diff_lines: int | None = None,
     ) -> str:
         statistics = self._calculate_statistics()
         failed_projects = self.diffs.get("failed_projects", [])
@@ -1246,21 +1251,77 @@ def render_statistics_markdown(
 
         markdown_content += "\n\n"
 
+        # Determine the character budget available for the raw diff block.
+        # We account for the wrapping markup (details/summary tags, code
+        # fence, sampling note) so that the final comment stays within
+        # GitHub's character limit.
+        char_budget = (
+            self.GITHUB_COMMENT_CHAR_LIMIT
+            - self.GITHUB_COMMENT_CHAR_MARGIN
+            - len(markdown_content)
+        )
+
+        # Reserve space for the static wrapper markup that surrounds the
+        # diff content.  We estimate generously so the budget refers to
+        # the diff payload itself.
+        #   - code fence: "```diff\n" + "\n```" = 12 chars
+        #   - details/summary (worst case): ~80 chars
+        #   - sampling note (worst case): ~120 chars
+        #   - extra newlines / padding: ~30 chars
+        _wrapper_overhead = 250
+        char_budget -= _wrapper_overhead
+
         displayed_lines = raw_diff_lines
         sampled = False
-        if total_raw_diff_changes > max_raw_diff_lines:
+        displayed_change_count = total_raw_diff_changes
+
+        full_diff_text = "\n".join(raw_diff_lines)
+        if max_raw_diff_lines is not None:
+            needs_sampling = total_raw_diff_changes > max_raw_diff_lines
+        else:
+            needs_sampling = len(full_diff_text) > char_budget
+
+        if needs_sampling:
             sampled = True
             rng = random.Random(self.RAW_DIFF_SAMPLE_SEED)
-            change_entries = [
-                (header, index)
-                for header, entries in sorted(raw_diff_sections.items())
-                for index, (_lines, counts_as_change) in enumerate(entries)
-                if counts_as_change
-            ]
-            selected_entries = {
-                sampled_entry
-                for sampled_entry in rng.sample(change_entries, k=max_raw_diff_lines)
-            }
+
+            # Build a list of (header, index, char_cost) for every
+            # change entry so we can greedily pick as many as fit.
+            change_entries: list[tuple[str, int, int]] = []
+            for header, entries in sorted(raw_diff_sections.items()):
+                for index, (lines, counts_as_change) in enumerate(entries):
+                    if counts_as_change:
+                        # +1 for the newline joining
+                        cost = sum(len(line) + 1 for line in lines)
+                        change_entries.append((header, index, cost))
+
+            # Shuffle deterministically, then greedily pick entries that
+            # fit within the character budget.
+            rng.shuffle(change_entries)
+
+            # Account for non-change lines (headers, etc.) that will
+            # always be included.  Compute their cost first.
+            non_change_cost = 0
+            for header, entries in sorted(raw_diff_sections.items()):
+                # header line + newline
+                non_change_cost += len(header) + 1
+                for _lines, counts_as_change in entries:
+                    if not counts_as_change:
+                        non_change_cost += sum(len(line) + 1 for line in _lines)
+                # trailing blank line between sections
+                non_change_cost += 1
+
+            remaining = char_budget - non_change_cost
+            selected_entries: set[tuple[str, int]] = set()
+            for entry_header, entry_index, cost in change_entries:
+                if max_raw_diff_lines is not None and len(selected_entries) >= max_raw_diff_lines:
+                    break
+                if cost <= remaining:
+                    selected_entries.add((entry_header, entry_index))
+                    remaining -= cost
+
+            displayed_change_count = len(selected_entries)
+
             displayed_sections: dict[str, list[tuple[list[str], bool]]] = {}
             for header, entries in sorted(raw_diff_sections.items()):
                 kept_entries = []
@@ -1276,7 +1337,7 @@ def render_statistics_markdown(
         if sampled:
             markdown_content += (
                 f"_Showing a random sample of "
-                f"{max_raw_diff_lines} of {total_raw_diff_changes} changes. "
+                f"{displayed_change_count} of {total_raw_diff_changes} changes. "
                 "See the HTML report for the full diff._\n\n"
             )
 
@@ -1288,7 +1349,7 @@ def render_statistics_markdown(
         else:
             summary = "Raw diff"
             if sampled:
-                summary += f" sample ({max_raw_diff_lines} of {total_raw_diff_changes} changes)"
+                summary += f" sample ({displayed_change_count} of {total_raw_diff_changes} changes)"
             else:
                 summary += f" ({total_raw_diff_changes} changes)"
             markdown_content += f"<details>\n<summary>{summary}</summary>\n\n"
diff --git a/src/ecosystem_analyzer/main.py b/src/ecosystem_analyzer/main.py
index a3b2bb4..68c0944 100644
--- a/src/ecosystem_analyzer/main.py
+++ b/src/ecosystem_analyzer/main.py
@@ -500,9 +500,11 @@ def generate_timing_diff(
 @click.option(
     "--max-raw-diff-lines",
     type=int,
-    default=100,
+    default=None,
     show_default=True,
-    help="Maximum number of raw diff changes to include in Markdown before sampling",
+    help="Maximum number of raw diff changes to include in Markdown before sampling. "
+    "By default, as many changes as possible are included while staying within "
+    "GitHub's comment character limit.",
 )
 @click.option(
     "--old-name",

From 044414f7c6c38a5a60aceb67af40504479b849ee Mon Sep 17 00:00:00 2001
From: Alex Waygood <alex.waygood@gmail.com>
Date: Mon, 23 Mar 2026 12:31:43 +0000
Subject: [PATCH 2/3] remove `--max-raw-diff-lines`

---
 src/ecosystem_analyzer/diff.py |  8 +-------
 src/ecosystem_analyzer/main.py | 11 -----------
 2 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/src/ecosystem_analyzer/diff.py b/src/ecosystem_analyzer/diff.py
index ddf72d6..58d774a 100644
--- a/src/ecosystem_analyzer/diff.py
+++ b/src/ecosystem_analyzer/diff.py
@@ -1169,7 +1169,6 @@ def render_statistics_markdown(
         self,
         *,
         inline_threshold: int = 15,
-        max_raw_diff_lines: int | None = None,
     ) -> str:
         statistics = self._calculate_statistics()
         failed_projects = self.diffs.get("failed_projects", [])
@@ -1276,10 +1275,7 @@ def render_statistics_markdown(
         displayed_change_count = total_raw_diff_changes
 
         full_diff_text = "\n".join(raw_diff_lines)
-        if max_raw_diff_lines is not None:
-            needs_sampling = total_raw_diff_changes > max_raw_diff_lines
-        else:
-            needs_sampling = len(full_diff_text) > char_budget
+        needs_sampling = len(full_diff_text) > char_budget
 
         if needs_sampling:
             sampled = True
@@ -1314,8 +1310,6 @@ def render_statistics_markdown(
             remaining = char_budget - non_change_cost
             selected_entries: set[tuple[str, int]] = set()
             for entry_header, entry_index, cost in change_entries:
-                if max_raw_diff_lines is not None and len(selected_entries) >= max_raw_diff_lines:
-                    break
                 if cost <= remaining:
                     selected_entries.add((entry_header, entry_index))
                     remaining -= cost
diff --git a/src/ecosystem_analyzer/main.py b/src/ecosystem_analyzer/main.py
index 68c0944..245ed18 100644
--- a/src/ecosystem_analyzer/main.py
+++ b/src/ecosystem_analyzer/main.py
@@ -497,15 +497,6 @@ def generate_timing_diff(
     show_default=True,
     help="Show the raw diff inline when it has fewer than this many changes",
 )
-@click.option(
-    "--max-raw-diff-lines",
-    type=int,
-    default=None,
-    show_default=True,
-    help="Maximum number of raw diff changes to include in Markdown before sampling. "
-    "By default, as many changes as possible are included while staying within "
-    "GitHub's comment character limit.",
-)
 @click.option(
     "--old-name",
     type=str,
@@ -527,7 +518,6 @@ def generate_diff_statistics(
     new_file: str,
     output: str,
     inline_threshold: int,
-    max_raw_diff_lines: int,
     old_name: str | None,
     new_name: str | None,
     fail_on_new_abnormal_exits: bool,
@@ -541,7 +531,6 @@ def generate_diff_statistics(
     diff = DiagnosticDiff(old_file, new_file, old_name=old_name, new_name=new_name)
     markdown_content = diff.render_statistics_markdown(
         inline_threshold=inline_threshold,
-        max_raw_diff_lines=max_raw_diff_lines,
     )
 
     with open(output, "w") as f:

From f0e2c431c91ee5471597939272369a7f057ebeac Mon Sep 17 00:00:00 2001
From: Alex Waygood <alex.waygood@gmail.com>
Date: Mon, 23 Mar 2026 12:38:28 +0000
Subject: [PATCH 3/3] Clarify comment on the GitHub comment-size margin

---
 src/ecosystem_analyzer/diff.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ecosystem_analyzer/diff.py b/src/ecosystem_analyzer/diff.py
index 58d774a..e51ad7d 100644
--- a/src/ecosystem_analyzer/diff.py
+++ b/src/ecosystem_analyzer/diff.py
@@ -53,9 +53,11 @@ class DiagnosticDiff:
 
     RAW_DIFF_SAMPLE_SEED = 137
     LARGE_TIMING_CHANGE_THRESHOLD = 0.5
-    # GitHub comment body limit is 65,536 characters.  Keep a small
+
+    # GitHub's comment body limit is 65,536 characters. We keep a small
     # margin so surrounding markup (details/summary tags, etc.) and any
-    # future additions don't push us over.
+    # future additions don't push us over. This margin is effectively a
+    # safety buffer _on top_ of the calculated size of the summary table.
     GITHUB_COMMENT_CHAR_LIMIT = 65_536
     GITHUB_COMMENT_CHAR_MARGIN = 1_024