From 4ed916df4dfeb42beb2f5364f9091570267457f2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:13:26 +0000
Subject: [PATCH 1/4] Initial plan


From 5062b645cbbc8a7617220300e69292db8b452cf3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:23:58 +0000
Subject: [PATCH 2/4] fix: improve metrics comparison CI readability

- Fix backwards diff direction in compare_metrics.py: swap unified_diff
  args so - = in baseline but not current (regression), + = newly added
- Update compare_metrics_test.py assertions to reflect correct semantics
- Fix get_raw_diff_sample in metrics_summary.py to interleave - and +
  lines so both sides are visible when truncating modified metrics
- Remove set -x from metrics_summary.sh and add GitHub group markers
  to fold verbose output sections in CI logs
- Add ciRunUrl parameter to formatMetricsDetail in ci-summary-report-
  publish.js to link directly to CI run logs for detailed diff output
- Update ci-summary-report-publish.test.js with new ciRunUrl tests

Agent-Logs-Url: https://github.com/jaegertracing/jaeger/sessions/9a76075a-0fc6-4e8a-a951-4dfe09c297fe

Co-authored-by: yurishkuro <3523016+yurishkuro@users.noreply.github.com>
---
 .github/scripts/ci-summary-report-publish.js  | 15 ++++++---
 .../scripts/ci-summary-report-publish.test.js | 33 +++++++++++++++----
 scripts/e2e/compare_metrics.py                |  7 +++-
 scripts/e2e/compare_metrics_test.py           | 21 ++++++------
 scripts/e2e/metrics_summary.py                | 31 +++++++++++++----
 scripts/e2e/metrics_summary.sh                | 18 ++++++----
 6 files changed, 92 insertions(+), 33 deletions(-)

diff --git a/.github/scripts/ci-summary-report-publish.js b/.github/scripts/ci-summary-report-publish.js
index 31ae50865fc..611a64319d3 100644
--- a/.github/scripts/ci-summary-report-publish.js
+++ b/.github/scripts/ci-summary-report-publish.js
@@ -158,9 +158,10 @@ function computeCoverage(s) {
  * All text is built from trusted templates; metric names have been validated
  * through sanitizeMetricName() and are rendered in backtick-code spans.
  * @param {Array|null} snapshots - Sanitized snapshots from computeMetrics
+ * @param {string} [ciRunUrl] - URL to the CI run for linking to detailed diff logs
  * @returns {string} - Markdown detail block, or empty string if no data
  */
-function formatMetricsDetail(snapshots) {
+function formatMetricsDetail(snapshots, ciRunUrl) {
   if (!snapshots || snapshots.length === 0) return '';
 
   const lines = [
@@ -170,6 +171,11 @@ function formatMetricsDetail(snapshots) {
     '',
   ];
 
+  if (ciRunUrl) {
+    lines.push(`_For label-level diff details, open the [CI run](${ciRunUrl}) and expand the "Compare metrics and generate summary" step logs._`);
+    lines.push('');
+  }
+
   for (const snap of snapshots) {
     lines.push(`**${snap.snapshot}**`);
     const parts = [];
@@ -201,9 +207,10 @@ function formatMetricsDetail(snapshots) {
  * @param {string} footer - links + timestamp line
  * @param {object} [opts]
  * @param {Array|null} [opts.metricsSnapshots] - sanitized snapshot data for detail rendering
+ * @param {string} [opts.ciRunUrl] - URL to the CI run, passed through to formatMetricsDetail
  * @returns {string}
  */
-function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots } = {}) {
+function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots, ciRunUrl } = {}) {
   const parts = [
     COMMENT_TAG,
     '## CI Summary Report',
@@ -212,7 +219,7 @@ function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots
     metricsText,
   ];
 
-  const detail = formatMetricsDetail(metricsSnapshots);
+  const detail = formatMetricsDetail(metricsSnapshots, ciRunUrl);
   if (detail) {
     parts.push(detail);
   }
@@ -349,7 +356,7 @@ async function handler({ github, core, fs, inputs }) {
     // after a green run.  Only create a new comment when there is something to report.
     const hasIssues = metrics.conclusion === 'failure' || coverage.conclusion === 'failure'
                       || metrics.totalChanges > 0;
-    const body = buildCommentBody(metrics.text, coverage.text, footer, { metricsSnapshots: metrics.snapshots });
+    const body = buildCommentBody(metrics.text, coverage.text, footer, { metricsSnapshots: metrics.snapshots, ciRunUrl });
     await postOrUpdateComment(github, owner, repo, prNumber, body, core, { createNew: hasIssues });
   } else {
     core.info('No PR number; skipping PR comment.');
diff --git a/.github/scripts/ci-summary-report-publish.test.js b/.github/scripts/ci-summary-report-publish.test.js
index 8334c131578..1bbd4c118b6 100644
--- a/.github/scripts/ci-summary-report-publish.test.js
+++ b/.github/scripts/ci-summary-report-publish.test.js
@@ -285,14 +285,23 @@ describe('formatMetricsDetail', () => {
     expect(detail).toContain('3 removed');
   });
 
-  test('omits zero counts from summary line', () => {
+  test('includes link to CI run when ciRunUrl provided', () => {
     const detail = formatMetricsDetail([{
-      snapshot: 'test', added: 0, removed: 5, modified: 0,
-      metric_names: [],
+      snapshot: 'cassandra_v2',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
+    }], 'https://github.com/org/repo/actions/runs/123');
+    expect(detail).toContain('[CI run](https://github.com/org/repo/actions/runs/123)');
+    expect(detail).toContain('Compare metrics and generate summary');
+  });
+
+  test('omits CI run link when ciRunUrl not provided', () => {
+    const detail = formatMetricsDetail([{
+      snapshot: 'cassandra_v2',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
     }]);
-    expect(detail).not.toContain('added');
-    expect(detail).toContain('5 removed');
-    expect(detail).not.toContain('modified');
+    expect(detail).not.toContain('[CI run]');
   });
 });
 
@@ -420,6 +429,18 @@ describe('buildCommentBody', () => {
     expect(metricsPos).toBeLessThan(detailsPos);
     expect(detailsPos).toBeLessThan(coveragePos);
   });
+
+  test('includes CI run link in metrics detail when ciRunUrl provided', () => {
+    const snapshots = [{
+      snapshot: 'cassandra_v2',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
+    }];
+    const ciRunUrl = 'https://github.com/org/repo/actions/runs/999';
+    const body = buildCommentBody('❌ 1 metric change(s) detected', coverageText, footer, { metricsSnapshots: snapshots, ciRunUrl });
+    expect(body).toContain(`[CI run](${ciRunUrl})`);
+    expect(body).toContain('Compare metrics and generate summary');
+  });
 });
 
 // ── postCheckRun ──────────────────────────────────────────────────────────────
diff --git a/scripts/e2e/compare_metrics.py b/scripts/e2e/compare_metrics.py
index a64481a2eda..70873905a91 100644
--- a/scripts/e2e/compare_metrics.py
+++ b/scripts/e2e/compare_metrics.py
@@ -132,6 +132,9 @@ def generate_diff(file1_content, file2_content):
     The diff is performed on these sorted, value-free metric strings.  If the two
     snapshots produce the same set of strings the diff is empty and this function
     returns ''.  When there are differences, the return value is a unified diff
+    using the standard convention:
+        - lines  = present in file2 (baseline) but absent from file1 (current) → regression/removed
+        + lines  = present in file1 (current) but absent from file2 (baseline) → newly added
     followed by optional comment lines reporting how many metrics were excluded, e.g.:
         # Metrics excluded from A: 3
         # Metrics excluded from B: 5
@@ -146,7 +149,9 @@ def generate_diff(file1_content, file2_content):
     metrics1,excluded_metrics_count1 = parse_metrics(file1_content)
     metrics2,excluded_metrics_count2 = parse_metrics(file2_content)
 
-    diff = list(unified_diff(metrics1, metrics2, lineterm='', n=0))
+    # unified_diff(baseline, current): - = in baseline but not current (removed/regression),
+    #                                  + = in current but not baseline (newly added).
+    diff = list(unified_diff(metrics2, metrics1, lineterm='', n=0))
 
     # Exclusion counts are informational context appended to the diff output.
     # They must not be written when the diff itself is empty: two snapshots with
diff --git a/scripts/e2e/compare_metrics_test.py b/scripts/e2e/compare_metrics_test.py
index 10068cdba19..7d64739101f 100644
--- a/scripts/e2e/compare_metrics_test.py
+++ b/scripts/e2e/compare_metrics_test.py
@@ -54,11 +54,11 @@ def test_empty_snapshots_returns_empty(self):
 
     def test_regression_detected(self):
         """Metric present in baseline but absent from current snapshot → diff is non-empty."""
-        # current=A only, baseline=A+B → B is missing from current (regression)
+        # current=A only, baseline=A+B → B is missing from current (regression/removed)
         result = generate_diff(_METRIC_A, _METRIC_A + _METRIC_B)
         self.assertNotEqual(result, '', 'Expected a non-empty diff for a regression')
-        # The diff must contain a '+' line for the missing metric (counter_b)
-        self.assertIn('+counter_b', result)
+        # '-' line = in baseline but not in current (regression)
+        self.assertIn('-counter_b', result)
 
     def test_new_metric_in_current_snapshot_produces_diff(self):
         """Metric present in current snapshot but absent from baseline → diff is non-empty.
@@ -68,11 +68,11 @@ def test_new_metric_in_current_snapshot_produces_diff(self):
         Silently ignoring new metrics would mask intermittent behaviour where a
         metric alternates between appearing and disappearing across runs.
         """
-        # current=A+B, baseline=A only → B is new in current
+        # current=A+B, baseline=A only → B is new in current (newly added)
         result = generate_diff(_METRIC_A + _METRIC_B, _METRIC_A)
         self.assertNotEqual(result, '', 'New metrics in current snapshot should produce a diff')
-        # '-' line = in current but not in baseline
-        self.assertIn('-counter_b', result)
+        # '+' line = in current but not in baseline (newly added)
+        self.assertIn('+counter_b', result)
 
     def test_exclusion_count_difference_does_not_produce_diff(self):
         """Snapshots that differ only in excluded-metric counts produce no diff.
@@ -92,12 +92,13 @@ def test_exclusion_count_difference_does_not_produce_diff(self):
 
     def test_mixed_regression_and_new_metric_returns_diff(self):
         """When there is both a regression AND a new metric, the diff is non-empty."""
-        # current=B only, baseline=A only → A is missing (regression), B is new
+        # current=B only, baseline=A only → A is missing (regression/removed), B is new (added)
         result = generate_diff(_METRIC_B, _METRIC_A)
         self.assertNotEqual(result, '')
-        self.assertIn('+counter_a', result)
-        # The new metric should still appear in the raw diff output for visibility
-        self.assertIn('-counter_b', result)
+        # '-' line = in baseline but not in current (regression)
+        self.assertIn('-counter_a', result)
+        # '+' line = in current but not in baseline (newly added)
+        self.assertIn('+counter_b', result)
 
     def test_regression_with_exclusions_includes_exclusion_summary(self):
         """When there is a regression and excluded metrics, the output includes counts."""
diff --git a/scripts/e2e/metrics_summary.py b/scripts/e2e/metrics_summary.py
index f5b1c6e15dc..0ba2fe11119 100644
--- a/scripts/e2e/metrics_summary.py
+++ b/scripts/e2e/metrics_summary.py
@@ -73,19 +73,38 @@ def extract_metric_name(line):
         return line.split('{')[0].strip()
     return line.strip()
 
-def get_raw_diff_sample(raw_lines, max_lines=7):
+def get_raw_diff_sample(raw_lines, max_lines=8):
     """
     Get sample raw diff lines, preserving original diff formatting.
+
+    For modified metrics that have both removed (-) and added (+) lines,
+    interleaves them so both sides are visible when the sample is truncated,
+    avoiding the problem of showing only the removed side.
     """
     if not raw_lines:
         return []
 
-    # Take up to max_lines
-    sample_lines = raw_lines[:max_lines]
-    if len(raw_lines) > max_lines:
-        sample_lines.append("...")
+    removed_lines = [l for l in raw_lines if l.startswith('-')]
+    added_lines = [l for l in raw_lines if l.startswith('+')]
+
+    # If only one direction of change (pure add or pure remove), just truncate.
+    if not removed_lines or not added_lines:
+        sample_lines = raw_lines[:max_lines]
+        if len(raw_lines) > max_lines:
+            sample_lines.append("...")
+        return sample_lines
+
+    # Interleave pairs of (removed, added) lines so both sides are always visible.
+    max_pairs = max(1, max_lines // 2)
+    interleaved = []
+    for i in range(min(max_pairs, len(removed_lines), len(added_lines))):
+        interleaved.append(removed_lines[i])
+        interleaved.append(added_lines[i])
+
+    if len(removed_lines) > max_pairs or len(added_lines) > max_pairs:
+        interleaved.append("...")
 
-    return sample_lines
+    return interleaved
 
 def generate_diff_summary(changes, raw_diff_sections, exclusion_count):
     """
diff --git a/scripts/e2e/metrics_summary.sh b/scripts/e2e/metrics_summary.sh
index 67983ece935..70439e7d05a 100644
--- a/scripts/e2e/metrics_summary.sh
+++ b/scripts/e2e/metrics_summary.sh
@@ -3,22 +3,24 @@
 # Copyright (c) 2025 The Jaeger Authors.
 # SPDX-License-Identifier: Apache-2.0
 
-# Enable debug tracing and exit on error
-set -exo pipefail
+# Exit on error, treat unset variables as errors, fail on pipe errors.
+set -eo pipefail
 
 METRICS_DIR="${METRICS_DIR:-./.artifacts}"
 declare -a summary_files=()
 declare -a json_files=()
 total_changes=0
 
+echo "::group::Metrics diff setup"
 echo "Starting metrics diff processing in directory: $METRICS_DIR"
 echo "Directory structure:"
 ls -la "$METRICS_DIR" || echo "Metrics directory listing failed"
+echo "::endgroup::"
 
 # Verify 1-to-1: every metrics_snapshot_* artifact must have a diff_metrics_snapshot_* artifact.
 # verify-metrics-snapshot always uploads a diff artifact on PRs (empty stub if no baseline),
 # so a missing diff dir means that action never ran for that snapshot — an infra failure.
-echo "=== Checking for missing diff artifacts ==="
+echo "::group::Checking for missing diff artifacts"
 declare -a missing_diffs=()
 found_any_snapshot=false
 for snapshot_dir in "$METRICS_DIR"/metrics_snapshot_*/; do
@@ -41,12 +43,14 @@ if [ ${#missing_diffs[@]} -gt 0 ]; then
 else
     echo "INFRA_ERRORS=false" >> "$GITHUB_OUTPUT"
 fi
+echo "::endgroup::"
 
 # Debug: List all diff files found
-echo "=== Searching for diff files ==="
+echo "::group::Searching for diff files"
 find "$METRICS_DIR" -type f -name "diff_*.txt" | while read -r file; do
     echo "Found diff file: $file"
 done
+echo "::endgroup::"
 
 # Process all non-empty diff files.
 # Empty diff files are stubs uploaded by verify-metrics-snapshot when there is no
@@ -74,11 +78,12 @@ while IFS= read -r -d '' diff_file; do
     summary_file="$dir/summary_$snapshot_name.md"
     json_file="$dir/changes_$snapshot_name.json"
 
-    echo "Generating summary for $snapshot_name"
+    echo "::group::Generating summary for $snapshot_name"
     python3 ./scripts/e2e/metrics_summary.py \
         --diff "$diff_file" \
         --output "$summary_file" \
         --json-output "$json_file"
+    echo "::endgroup::"
 
     summary_files+=("$summary_file")
     json_files+=("$json_file")
@@ -152,10 +157,11 @@ if [ "$total_changes" -gt 0 ]; then
     echo ""
     for summary_file in "${summary_files[@]}"; do
         file_name=$(basename "$summary_file" .md)
-        echo "--- ${file_name} ---"
+        echo "::group::${file_name}"
         echo ""
         cat "$summary_file"
         echo ""
+        echo "::endgroup::"
     done
 elif [ ${#missing_diffs[@]} -gt 0 ]; then
     echo "No metric changes in available diffs, but some diff artifacts were missing (see above)."

From abc5eab190040bb9bf64c50d62b4fa16296ecd2c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:31:44 +0000
Subject: [PATCH 3/4] fix: address review feedback - rename terminology, add
 tests, add artifact links

- Rename A/B/1/2 terminology to baseline/current throughout:
  - compare_metrics.py: rename function args, vars, CLI args (--current/
    --baseline), and exclusion count labels (current:/baseline:)
  - metrics_summary.py: update exclusion label parsing
  - compare_metrics_test.py: update assertions
  - action.yaml: update --file1/--file2 to --current/--baseline
- Add metrics_summary_test.py with 25 tests covering get_raw_diff_sample,
  parse_diff_file, extract_metric_name, generate_diff_summary,
  generate_structured_json
- Add per-snapshot diff artifact download links in PR comment:
  - metrics_summary.sh: query GitHub API for artifact IDs; include in
    metrics_snapshots.json via merge Python snippet
  - ci-summary-report.yml: add GH_TOKEN to compare-metrics step
  - ci-summary-report-publish.yml: pass sourceRunId to handler
  - ci-summary-report-publish.js: sanitize artifact_id; construct per-
    snapshot download URLs from trusted pieces; add artifactUrlPrefix
    parameter to formatMetricsDetail and buildCommentBody
  - ci-summary-report-publish.test.js: add tests for artifact_id
    sanitization and download link rendering

Agent-Logs-Url: https://github.com/jaegertracing/jaeger/sessions/a021a645-ff42-4ce9-89f1-6a2918708a39

Co-authored-by: yurishkuro <3523016+yurishkuro@users.noreply.github.com>
---
 .../verify-metrics-snapshot/action.yaml       |   2 +-
 .github/scripts/ci-summary-report-publish.js  |  42 ++-
 .../scripts/ci-summary-report-publish.test.js |  83 ++++-
 .../workflows/ci-summary-report-publish.yml   |  13 +-
 .github/workflows/ci-summary-report.yml       |   2 +
 scripts/e2e/compare_metrics.py                |  50 +--
 scripts/e2e/compare_metrics_test.py           |   4 +-
 scripts/e2e/metrics_summary.py                |   2 +-
 scripts/e2e/metrics_summary.sh                |  40 ++-
 scripts/e2e/metrics_summary_test.py           | 288 ++++++++++++++++++
 10 files changed, 482 insertions(+), 44 deletions(-)
 create mode 100644 scripts/e2e/metrics_summary_test.py

diff --git a/.github/actions/verify-metrics-snapshot/action.yaml b/.github/actions/verify-metrics-snapshot/action.yaml
index 326a2861674..e0612a3242d 100644
--- a/.github/actions/verify-metrics-snapshot/action.yaml
+++ b/.github/actions/verify-metrics-snapshot/action.yaml
@@ -94,7 +94,7 @@ runs:
       shell: bash
       run: |
         python3 -m pip install prometheus-client
-        if python3 ./scripts/e2e/compare_metrics.py --file1 ./.metrics/${{ inputs.snapshot }}.txt --file2 ./.metrics/baseline_${{ inputs.snapshot }}.txt --output ./.metrics/diff_${{ inputs.snapshot }}.txt; then
+        if python3 ./scripts/e2e/compare_metrics.py --current ./.metrics/${{ inputs.snapshot }}.txt --baseline ./.metrics/baseline_${{ inputs.snapshot }}.txt --output ./.metrics/diff_${{ inputs.snapshot }}.txt; then
           echo "No differences found in metrics"
         else
           echo "🛑 Differences found in metrics"
diff --git a/.github/scripts/ci-summary-report-publish.js b/.github/scripts/ci-summary-report-publish.js
index 611a64319d3..06c20e92de9 100644
--- a/.github/scripts/ci-summary-report-publish.js
+++ b/.github/scripts/ci-summary-report-publish.js
@@ -97,7 +97,12 @@ function sanitizeSnapshots(raw) {
         if (clean) names.push(clean);
       }
     }
-    result.push({ snapshot, added, removed, modified, metric_names: names });
+    // Validate artifact_id (optional, non-negative integer for the diff artifact download link)
+    const artifactId = safeNum(entry.artifact_id);
+    // artifact_id must be a positive integer (GitHub artifact IDs are always > 0)
+    const sanitizedArtifactId = (artifactId !== null && Number.isInteger(artifactId) && artifactId > 0)
+      ? artifactId : null;
+    result.push({ snapshot, added, removed, modified, metric_names: names, artifact_id: sanitizedArtifactId });
   }
   return result.length > 0 ? result : null;
 }
@@ -158,10 +163,14 @@ function computeCoverage(s) {
  * All text is built from trusted templates; metric names have been validated
  * through sanitizeMetricName() and are rendered in backtick-code spans.
  * @param {Array|null} snapshots - Sanitized snapshots from computeMetrics
- * @param {string} [ciRunUrl] - URL to the CI run for linking to detailed diff logs
+ * @param {string} [ciRunUrl] - URL to the CI run for linking to step logs
+ * @param {string} [artifactUrlPrefix] - Base URL for artifact downloads:
+ *   `https://github.com/{owner}/{repo}/actions/runs/{runId}/artifacts`.
+ *   When provided, each snapshot with a valid artifact_id gets a direct
+ *   download link appended to its header line.
  * @returns {string} - Markdown detail block, or empty string if no data
  */
-function formatMetricsDetail(snapshots, ciRunUrl) {
+function formatMetricsDetail(snapshots, { ciRunUrl, artifactUrlPrefix } = {}) {
   if (!snapshots || snapshots.length === 0) return '';
 
   const lines = [
@@ -177,7 +186,12 @@ function formatMetricsDetail(snapshots, ciRunUrl) {
   }
 
   for (const snap of snapshots) {
-    lines.push(`**${snap.snapshot}**`);
+    // Build the snapshot header, optionally with a direct diff artifact download link.
+    let snapHeader = `**${snap.snapshot}**`;
+    if (artifactUrlPrefix && snap.artifact_id !== null && snap.artifact_id !== undefined) {
+      snapHeader += ` — [⬇️ download diff](${artifactUrlPrefix}/${snap.artifact_id})`;
+    }
+    lines.push(snapHeader);
     const parts = [];
     if (snap.added    !== null && snap.added    > 0) parts.push(`${snap.added} added`);
     if (snap.removed  !== null && snap.removed  > 0) parts.push(`${snap.removed} removed`);
@@ -208,9 +222,10 @@ function formatMetricsDetail(snapshots, ciRunUrl) {
  * @param {object} [opts]
  * @param {Array|null} [opts.metricsSnapshots] - sanitized snapshot data for detail rendering
  * @param {string} [opts.ciRunUrl] - URL to the CI run, passed through to formatMetricsDetail
+ * @param {string} [opts.artifactUrlPrefix] - Base URL for artifact downloads, passed through
  * @returns {string}
  */
-function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots, ciRunUrl } = {}) {
+function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots, ciRunUrl, artifactUrlPrefix } = {}) {
   const parts = [
     COMMENT_TAG,
     '## CI Summary Report',
@@ -219,7 +234,7 @@ function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots,
     metricsText,
   ];
 
-  const detail = formatMetricsDetail(metricsSnapshots, ciRunUrl);
+  const detail = formatMetricsDetail(metricsSnapshots, { ciRunUrl, artifactUrlPrefix });
   if (detail) {
     parts.push(detail);
   }
@@ -308,6 +323,8 @@ async function postOrUpdateComment(github, owner, repo, prNumber, body, core, {
  * @param {string} opts.inputs.prNumber  - raw string from step output
  * @param {string} opts.inputs.ciRunUrl
  * @param {string} opts.inputs.publishUrl
+ * @param {string} [opts.inputs.sourceRunId] - numeric run ID of the CI Orchestrator run;
+ *   used to construct direct artifact download URLs (e.g. .../artifacts/{id})
  */
 async function handler({ github, core, fs, inputs }) {
   const { owner, repo, headSha, ciRunUrl, publishUrl } = inputs;
@@ -317,6 +334,13 @@ async function handler({ github, core, fs, inputs }) {
   const ts     = new Date().toISOString().replace('T', ' ').replace(/\.\d+Z$/, ' UTC');
   const footer = `${links}\n_${ts}_`;
 
+  // Construct the artifact URL prefix for per-snapshot diff download links.
+  // All parts come from trusted sources: owner/repo from context, runId validated as integer.
+  const sourceRunId = parseInt(inputs.sourceRunId, 10) || null;
+  const artifactUrlPrefix = (owner && repo && sourceRunId)
+    ? `https://github.com/${owner}/${repo}/actions/runs/${sourceRunId}/artifacts`
+    : null;
+
   // Read structured data written by ci-summary-report.yml.
   // All fields are primitives (enums, numbers, booleans) — no free-form text.
   let s;
@@ -356,7 +380,11 @@ async function handler({ github, core, fs, inputs }) {
     // after a green run.  Only create a new comment when there is something to report.
     const hasIssues = metrics.conclusion === 'failure' || coverage.conclusion === 'failure'
                       || metrics.totalChanges > 0;
-    const body = buildCommentBody(metrics.text, coverage.text, footer, { metricsSnapshots: metrics.snapshots, ciRunUrl });
+    const body = buildCommentBody(metrics.text, coverage.text, footer, {
+      metricsSnapshots: metrics.snapshots,
+      ciRunUrl,
+      artifactUrlPrefix,
+    });
     await postOrUpdateComment(github, owner, repo, prNumber, body, core, { createNew: hasIssues });
   } else {
     core.info('No PR number; skipping PR comment.');
diff --git a/.github/scripts/ci-summary-report-publish.test.js b/.github/scripts/ci-summary-report-publish.test.js
index 1bbd4c118b6..4b2ab570552 100644
--- a/.github/scripts/ci-summary-report-publish.test.js
+++ b/.github/scripts/ci-summary-report-publish.test.js
@@ -175,6 +175,44 @@ describe('sanitizeSnapshots', () => {
     expect(result).toHaveLength(1);
     expect(result[0].snapshot).toBe('valid');
   });
+
+  test('sanitizes artifact_id as a positive integer', () => {
+    const input = [{
+      snapshot: 'test_snapshot',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
+      artifact_id: 6359406281,
+    }];
+    const result = sanitizeSnapshots(input);
+    expect(result[0].artifact_id).toBe(6359406281);
+  });
+
+  test('rejects non-integer artifact_id', () => {
+    const input = [{
+      snapshot: 'test_snapshot',
+      added: 1, removed: 0, modified: 0,
+      metric_names: [],
+      artifact_id: 'evil_string',
+    }];
+    const result = sanitizeSnapshots(input);
+    expect(result[0].artifact_id).toBeNull();
+  });
+
+  test('rejects zero and negative artifact_id', () => {
+    const input = [
+      { snapshot: 'snap_a', added: 0, removed: 0, modified: 0, metric_names: [], artifact_id: 0 },
+      { snapshot: 'snap_b', added: 0, removed: 0, modified: 0, metric_names: [], artifact_id: -1 },
+    ];
+    const result = sanitizeSnapshots(input);
+    expect(result[0].artifact_id).toBeNull();
+    expect(result[1].artifact_id).toBeNull();
+  });
+
+  test('sets artifact_id to null when absent', () => {
+    const input = [{ snapshot: 'test_snapshot', added: 1, removed: 0, modified: 0, metric_names: [] }];
+    const result = sanitizeSnapshots(input);
+    expect(result[0].artifact_id).toBeNull();
+  });
 });
 
 // ── computeMetrics ────────────────────────────────────────────────────────────
@@ -290,7 +328,7 @@ describe('formatMetricsDetail', () => {
       snapshot: 'cassandra_v2',
       added: 1, removed: 0, modified: 0,
       metric_names: ['http_server_duration'],
-    }], 'https://github.com/org/repo/actions/runs/123');
+    }], { ciRunUrl: 'https://github.com/org/repo/actions/runs/123' });
     expect(detail).toContain('[CI run](https://github.com/org/repo/actions/runs/123)');
     expect(detail).toContain('Compare metrics and generate summary');
   });
@@ -303,6 +341,36 @@ describe('formatMetricsDetail', () => {
     }]);
     expect(detail).not.toContain('[CI run]');
   });
+
+  test('renders per-snapshot artifact download link when artifactUrlPrefix and artifact_id provided', () => {
+    const detail = formatMetricsDetail([{
+      snapshot: 'cassandra_v2',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
+      artifact_id: 6359406281,
+    }], { artifactUrlPrefix: 'https://github.com/org/repo/actions/runs/123/artifacts' });
+    expect(detail).toContain('[⬇️ download diff](https://github.com/org/repo/actions/runs/123/artifacts/6359406281)');
+  });
+
+  test('omits artifact download link when artifact_id is null', () => {
+    const detail = formatMetricsDetail([{
+      snapshot: 'cassandra_v2',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
+      artifact_id: null,
+    }], { artifactUrlPrefix: 'https://github.com/org/repo/actions/runs/123/artifacts' });
+    expect(detail).not.toContain('download diff');
+  });
+
+  test('omits artifact download link when artifactUrlPrefix not provided', () => {
+    const detail = formatMetricsDetail([{
+      snapshot: 'cassandra_v2',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
+      artifact_id: 6359406281,
+    }]);
+    expect(detail).not.toContain('download diff');
+  });
 });
 
 // ── computeCoverage ───────────────────────────────────────────────────────────
@@ -435,12 +503,25 @@ describe('buildCommentBody', () => {
       snapshot: 'cassandra_v2',
       added: 1, removed: 0, modified: 0,
       metric_names: ['http_server_duration'],
+      artifact_id: null,
     }];
     const ciRunUrl = 'https://github.com/org/repo/actions/runs/999';
     const body = buildCommentBody('❌ 1 metric change(s) detected', coverageText, footer, { metricsSnapshots: snapshots, ciRunUrl });
     expect(body).toContain(`[CI run](${ciRunUrl})`);
     expect(body).toContain('Compare metrics and generate summary');
   });
+
+  test('includes per-snapshot artifact download link when artifactUrlPrefix and artifact_id provided', () => {
+    const snapshots = [{
+      snapshot: 'cassandra_v2',
+      added: 1, removed: 0, modified: 0,
+      metric_names: ['http_server_duration'],
+      artifact_id: 6359406281,
+    }];
+    const artifactUrlPrefix = 'https://github.com/org/repo/actions/runs/999/artifacts';
+    const body = buildCommentBody('❌ 1 metric change(s) detected', coverageText, footer, { metricsSnapshots: snapshots, artifactUrlPrefix });
+    expect(body).toContain(`[⬇️ download diff](${artifactUrlPrefix}/6359406281)`);
+  });
 });
 
 // ── postCheckRun ──────────────────────────────────────────────────────────────
diff --git a/.github/workflows/ci-summary-report-publish.yml b/.github/workflows/ci-summary-report-publish.yml
index 88ad0559e48..cc532d2e821 100644
--- a/.github/workflows/ci-summary-report-publish.yml
+++ b/.github/workflows/ci-summary-report-publish.yml
@@ -116,12 +116,13 @@ jobs:
             await handler({
               github, core, fs: require('fs'),
               inputs: {
-                owner:      context.repo.owner,
-                repo:       context.repo.repo,
-                headSha:    '${{ steps.source-run.outputs.head_sha }}',
-                prNumber:   '${{ steps.source-run.outputs.pr_number || steps.pr.outputs.pr_number }}',
-                ciRunUrl:   '${{ steps.source-run.outputs.source_run_url }}',
-                publishUrl: '${{ steps.source-run.outputs.summary_run_url }}',
+                owner:       context.repo.owner,
+                repo:        context.repo.repo,
+                headSha:     '${{ steps.source-run.outputs.head_sha }}',
+                prNumber:    '${{ steps.source-run.outputs.pr_number || steps.pr.outputs.pr_number }}',
+                ciRunUrl:    '${{ steps.source-run.outputs.source_run_url }}',
+                publishUrl:  '${{ steps.source-run.outputs.summary_run_url }}',
+                sourceRunId: '${{ steps.source-run.outputs.run_id }}',
               },
             });
         env:
diff --git a/.github/workflows/ci-summary-report.yml b/.github/workflows/ci-summary-report.yml
index 1d26cb75d3f..b83bfbf1e66 100644
--- a/.github/workflows/ci-summary-report.yml
+++ b/.github/workflows/ci-summary-report.yml
@@ -40,6 +40,8 @@ jobs:
 
       - name: Compare metrics and generate summary
         id: compare-metrics
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         shell: bash
         run: bash ./scripts/e2e/metrics_summary.sh
 
diff --git a/scripts/e2e/compare_metrics.py b/scripts/e2e/compare_metrics.py
index 70873905a91..8e3d454715c 100644
--- a/scripts/e2e/compare_metrics.py
+++ b/scripts/e2e/compare_metrics.py
@@ -111,7 +111,7 @@ def parse_metrics(content):
     return metrics,metrics_exclusion_count
 
 
-def generate_diff(file1_content, file2_content):
+def generate_diff(current_content, baseline_content):
     """Compare two Prometheus metrics snapshots and return a unified diff of metric names.
 
     The input files are raw Prometheus text exposition format, scraped directly from
@@ -133,25 +133,25 @@ def generate_diff(file1_content, file2_content):
     snapshots produce the same set of strings the diff is empty and this function
     returns ''.  When there are differences, the return value is a unified diff
     using the standard convention:
-        - lines  = present in file2 (baseline) but absent from file1 (current) → regression/removed
-        + lines  = present in file1 (current) but absent from file2 (baseline) → newly added
+        - lines  = present in baseline but absent from current → regression/removed
+        + lines  = present in current but absent from baseline → newly added
     followed by optional comment lines reporting how many metrics were excluded, e.g.:
-        # Metrics excluded from A: 3
-        # Metrics excluded from B: 5
+        # Metrics excluded from current: 3
+        # Metrics excluded from baseline: 5
     These comment lines (prefixed with `# `) are appended only when the diff is
     non-empty; they are informational context, not metric differences themselves.
     """
-    if isinstance(file1_content, list):
-        file1_content = ''.join(file1_content)
-    if isinstance(file2_content, list):
-        file2_content = ''.join(file2_content)
+    if isinstance(current_content, list):
+        current_content = ''.join(current_content)
+    if isinstance(baseline_content, list):
+        baseline_content = ''.join(baseline_content)
 
-    metrics1,excluded_metrics_count1 = parse_metrics(file1_content)
-    metrics2,excluded_metrics_count2 = parse_metrics(file2_content)
+    current_metrics, excluded_count_current = parse_metrics(current_content)
+    baseline_metrics, excluded_count_baseline = parse_metrics(baseline_content)
 
     # unified_diff(baseline, current): - = in baseline but not current (removed/regression),
     #                                  + = in current but not baseline (newly added).
-    diff = list(unified_diff(metrics2, metrics1, lineterm='', n=0))
+    diff = list(unified_diff(baseline_metrics, current_metrics, lineterm='', n=0))
 
     # Exclusion counts are informational context appended to the diff output.
     # They must not be written when the diff itself is empty: two snapshots with
@@ -160,12 +160,12 @@ def generate_diff(file1_content, file2_content):
     if len(diff) == 0:
         return ''
 
-    total_excluded = excluded_metrics_count1 + excluded_metrics_count2
-    
+    total_excluded = excluded_count_current + excluded_count_baseline
+
     exclusion_lines = ''
     if total_excluded > 0:
-        exclusion_lines = f'\n# Metrics excluded from A: {excluded_metrics_count1}\n# Metrics excluded from B: {excluded_metrics_count2}'
-    
+        exclusion_lines = f'\n# Metrics excluded from current: {excluded_count_current}\n# Metrics excluded from baseline: {excluded_count_baseline}'
+
     return '\n'.join(diff) + exclusion_lines
 
 def write_diff_file(diff_lines, output_path):
@@ -177,20 +177,20 @@ def write_diff_file(diff_lines, output_path):
 
 def main():
     parser = argparse.ArgumentParser(description='Generate diff between two Jaeger metric files')
-    parser.add_argument('--file1', help='Path to first metric file')
-    parser.add_argument('--file2', help='Path to second metric file')
+    parser.add_argument('--current', help='Path to the current metric file (e.g. from the PR)')
+    parser.add_argument('--baseline', help='Path to the baseline metric file (e.g. from main branch)')
     parser.add_argument('--output', '-o', default='metrics_diff.txt',
                        help='Output diff file path (default: metrics_diff.txt)')
-    
+
     args = parser.parse_args()
-    
+
     # Read input files
-    file1_lines = read_metric_file(args.file1)
-    file2_lines = read_metric_file(args.file2)
-    
+    current_lines = read_metric_file(args.current)
+    baseline_lines = read_metric_file(args.baseline)
+
     # Generate diff
-    diff_lines = generate_diff(file1_lines, file2_lines)
-    
+    diff_lines = generate_diff(current_lines, baseline_lines)
+
     # Check if there are any differences
     if diff_lines:
         print("differences found between the metric files.")
diff --git a/scripts/e2e/compare_metrics_test.py b/scripts/e2e/compare_metrics_test.py
index 7d64739101f..9750808932c 100644
--- a/scripts/e2e/compare_metrics_test.py
+++ b/scripts/e2e/compare_metrics_test.py
@@ -105,8 +105,8 @@ def test_regression_with_exclusions_includes_exclusion_summary(self):
         # current=excluded only, baseline=A+excluded → A is missing (regression)
         result = generate_diff(_METRIC_EXCLUDED_5XX, _METRIC_A + _METRIC_EXCLUDED_5XX)
         self.assertNotEqual(result, '')
-        self.assertIn('# Metrics excluded from A:', result)
-        self.assertIn('# Metrics excluded from B:', result)
+        self.assertIn('# Metrics excluded from current:', result)
+        self.assertIn('# Metrics excluded from baseline:', result)
 
     def test_no_exclusions_means_no_exclusion_summary(self):
         """When there are no excluded metrics, the exclusion summary is omitted."""
diff --git a/scripts/e2e/metrics_summary.py b/scripts/e2e/metrics_summary.py
index 0ba2fe11119..6e4ed9cd655 100644
--- a/scripts/e2e/metrics_summary.py
+++ b/scripts/e2e/metrics_summary.py
@@ -28,7 +28,7 @@ def parse_diff_file(diff_path):
         original_line = line.rstrip('\n')
         stripped = original_line.strip()
 
-        if stripped.startswith('# Metrics excluded from A: ') or stripped.startswith('# Metrics excluded from B: '):
+        if stripped.startswith('# Metrics excluded from current: ') or stripped.startswith('# Metrics excluded from baseline: '):
             count_str = stripped.split(': ')[1]
             exclusion_count += int(count_str)
             continue
diff --git a/scripts/e2e/metrics_summary.sh b/scripts/e2e/metrics_summary.sh
index 70439e7d05a..2edc95e59e0 100644
--- a/scripts/e2e/metrics_summary.sh
+++ b/scripts/e2e/metrics_summary.sh
@@ -52,6 +52,27 @@ find "$METRICS_DIR" -type f -name "diff_*.txt" | while read -r file; do
 done
 echo "::endgroup::"
 
+# Query artifact IDs for all diff_metrics_snapshot_* artifacts in this run.
+# These IDs are used by the publish workflow to render direct download links
+# in the PR comment. The query is optional: if it fails (e.g. when running
+# locally without a token) we write an empty map and the publish workflow
+# degrades gracefully (no download links, but the rest of the summary still works).
+echo "::group::Querying diff artifact IDs"
+if [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then
+    if gh api "repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/artifacts?per_page=100" \
+        --jq '[.artifacts[] | select(.name | startswith("diff_metrics_snapshot_")) | {key: .name, value: .id}] | from_entries' \
+        > "$METRICS_DIR/artifact_ids.json" 2>/dev/null; then
+        echo "Artifact IDs written to $METRICS_DIR/artifact_ids.json"
+    else
+        echo "Could not query artifact IDs (OK for local runs without a token)"
+        echo '{}' > "$METRICS_DIR/artifact_ids.json"
+    fi
+else
+    echo "GITHUB_RUN_ID or GITHUB_REPOSITORY not set; skipping artifact ID query"
+    echo '{}' > "$METRICS_DIR/artifact_ids.json"
+fi
+echo "::endgroup::"
+
 # Process all non-empty diff files.
 # Empty diff files are stubs uploaded by verify-metrics-snapshot when there is no
 # baseline or when compare_metrics.py found no differences (it only writes to the
@@ -115,13 +136,24 @@ else
 fi
 
 # Merge per-snapshot JSON files into a single metrics_snapshots.json.
-# Each entry gets a "snapshot" field with the snapshot name.
+# Each entry gets a "snapshot" field with the snapshot name and an optional
+# "artifact_id" field (integer) for the corresponding diff artifact download link.
 # Capped at 50 entries to match the publish workflow's MAX_SNAPSHOTS limit.
 # The trusted publish workflow validates this data before rendering.
 python3 - "$METRICS_DIR" "${json_files[@]}" <<'PYEOF'
 import json, os, sys
 metrics_dir = sys.argv[1]
 MAX_SNAPSHOTS = 50
+
+# Load artifact IDs if available (produced by the gh api query above).
+# artifact_ids maps artifact name → artifact ID (integer).
+artifact_ids = {}
+try:
+    with open(os.path.join(metrics_dir, 'artifact_ids.json')) as f:
+        artifact_ids = json.load(f)
+except (FileNotFoundError, json.JSONDecodeError):
+    pass
+
 snapshots = []
 for path in sys.argv[2:]:
     if len(snapshots) >= MAX_SNAPSHOTS:
@@ -133,6 +165,12 @@ for path in sys.argv[2:]:
         basename = os.path.basename(path)
         name = basename.removeprefix('changes_').removesuffix('.json')
         data['snapshot'] = name
+        # Look up the artifact ID for this snapshot's diff artifact so the
+        # publish workflow can render a direct download link.
+        diff_artifact_name = f"diff_{name}"
+        artifact_id = artifact_ids.get(diff_artifact_name)
+        if isinstance(artifact_id, int):
+            data['artifact_id'] = artifact_id
         snapshots.append(data)
     except Exception as e:
         print(f"Warning: could not read {path}: {e}", file=sys.stderr)
diff --git a/scripts/e2e/metrics_summary_test.py b/scripts/e2e/metrics_summary_test.py
new file mode 100644
index 00000000000..cd8d54a3459
--- /dev/null
+++ b/scripts/e2e/metrics_summary_test.py
@@ -0,0 +1,288 @@
+# Copyright (c) 2025 The Jaeger Authors.
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import tempfile
+import unittest
+from metrics_summary import (
+    extract_metric_name,
+    get_raw_diff_sample,
+    parse_diff_file,
+    generate_diff_summary,
+    generate_structured_json,
+)
+
+
+# A minimal unified diff that exercises added/removed/modified cases and exclusion counts.
+_DIFF_WITH_ALL_CATEGORIES = """\
+--- 
++++ 
+@@ -1,3 +1,3 @@
+-baseline_only{job="a"}
+-http_server_duration{le="+Inf"}
++current_only{job="b"}
++http_server_duration{http_route="/status",le="+Inf"}
+# Metrics excluded from current: 2
+# Metrics excluded from baseline: 3
+"""
+
+# A diff with only added metrics (present in current, absent from baseline).
+_DIFF_ADDED_ONLY = """\
+--- 
++++ 
+@@ -1 +2 @@
++new_metric{job="a"}
+"""
+
+# A diff with only removed metrics (present in baseline, absent from current).
+_DIFF_REMOVED_ONLY = """\
+--- 
++++ 
+@@ -2 +1 @@
+-old_metric{job="b"}
+"""
+
+
+def _write_tmp(content):
+    """Write content to a temp file and return its path."""
+    f = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
+    f.write(content)
+    f.close()
+    return f.name
+
+
+class TestExtractMetricName(unittest.TestCase):
+    """Tests for extract_metric_name()."""
+
+    def test_extracts_name_before_braces(self):
+        self.assertEqual(extract_metric_name('http_server_duration{le="+Inf"}'), 'http_server_duration')
+
+    def test_returns_bare_name_when_no_braces(self):
+        self.assertEqual(extract_metric_name('my_metric'), 'my_metric')
+
+    def test_strips_whitespace(self):
+        self.assertEqual(extract_metric_name('  my_metric  '), 'my_metric')
+
+
+class TestGetRawDiffSample(unittest.TestCase):
+    """Tests for get_raw_diff_sample()."""
+
+    def test_empty_input_returns_empty(self):
+        self.assertEqual(get_raw_diff_sample([]), [])
+
+    def test_pure_removals_are_truncated(self):
+        lines = [f'-metric_{i}' for i in range(10)]
+        result = get_raw_diff_sample(lines, max_lines=4)
+        self.assertEqual(result[:4], lines[:4])
+        self.assertEqual(result[-1], '...')
+
+    def test_pure_additions_are_truncated(self):
+        lines = [f'+metric_{i}' for i in range(10)]
+        result = get_raw_diff_sample(lines, max_lines=4)
+        self.assertEqual(result[:4], lines[:4])
+        self.assertEqual(result[-1], '...')
+
+    def test_interleaves_removed_and_added_lines(self):
+        """Modified metrics with both - and + lines are interleaved."""
+        removed = [f'-metric{{a="{i}"}}' for i in range(4)]
+        added   = [f'+metric{{a="{i}",b="x"}}' for i in range(4)]
+        result = get_raw_diff_sample(removed + added, max_lines=4)
+        # Pairs: (-, +, -, +, ...)
+        self.assertTrue(result[0].startswith('-'))
+        self.assertTrue(result[1].startswith('+'))
+        self.assertTrue(result[2].startswith('-'))
+        self.assertTrue(result[3].startswith('+'))
+
+    def test_interleaved_truncation_adds_ellipsis(self):
+        removed = [f'-metric{{a="{i}"}}' for i in range(10)]
+        added   = [f'+metric{{a="{i}",b="x"}}' for i in range(10)]
+        result = get_raw_diff_sample(removed + added, max_lines=4)
+        self.assertIn('...', result)
+        # Should only show max_lines//2 = 2 pairs
+        non_ellipsis = [l for l in result if l != '...']
+        self.assertEqual(len(non_ellipsis), 4)  # 2 pairs × 2 lines each
+
+    def test_no_ellipsis_when_within_limit(self):
+        lines = ['-m1', '+m1_v2']
+        result = get_raw_diff_sample(lines, max_lines=8)
+        self.assertNotIn('...', result)
+
+
+class TestParseDiffFile(unittest.TestCase):
+    """Tests for parse_diff_file()."""
+
+    def test_parses_added_removed_and_modified(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, raw_sections, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+
+        # baseline_only is in baseline but not current → removed
+        self.assertIn('baseline_only', changes['removed'])
+        # current_only is in current but not baseline → added
+        self.assertIn('current_only', changes['added'])
+        # http_server_duration appears in both → modified
+        self.assertIn('http_server_duration', changes['modified'])
+        # Not in added or removed after reclassification
+        self.assertNotIn('http_server_duration', changes['added'])
+        self.assertNotIn('http_server_duration', changes['removed'])
+
+    def test_accumulates_exclusion_counts(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            _, _, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        self.assertEqual(excl_count, 5)  # 2 + 3
+
+    def test_zero_exclusion_count_when_no_exclusion_lines(self):
+        path = _write_tmp(_DIFF_ADDED_ONLY)
+        try:
+            _, _, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        self.assertEqual(excl_count, 0)
+
+    def test_raw_diff_sections_populated(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            _, raw_sections, _ = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        self.assertIn('http_server_duration', raw_sections)
+        # The section should have at least one - line and one + line
+        lines = raw_sections['http_server_duration']
+        self.assertTrue(any(l.startswith('-') for l in lines))
+        self.assertTrue(any(l.startswith('+') for l in lines))
+
+    def test_added_only_diff(self):
+        path = _write_tmp(_DIFF_ADDED_ONLY)
+        try:
+            changes, _, _ = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        self.assertIn('new_metric', changes['added'])
+        self.assertEqual(len(changes['removed']), 0)
+        self.assertEqual(len(changes['modified']), 0)
+
+    def test_removed_only_diff(self):
+        path = _write_tmp(_DIFF_REMOVED_ONLY)
+        try:
+            changes, _, _ = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        self.assertIn('old_metric', changes['removed'])
+        self.assertEqual(len(changes['added']), 0)
+        self.assertEqual(len(changes['modified']), 0)
+
+
+class TestGenerateDiffSummary(unittest.TestCase):
+    """Tests for generate_diff_summary()."""
+
+    def test_total_changes_header_present(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, raw_sections, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        summary = generate_diff_summary(changes, raw_sections, excl_count)
+        self.assertIn('**Total Changes:**', summary)
+
+    def test_added_section_rendered(self):
+        path = _write_tmp(_DIFF_ADDED_ONLY)
+        try:
+            changes, raw_sections, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        summary = generate_diff_summary(changes, raw_sections, excl_count)
+        self.assertIn('Added Metrics', summary)
+        self.assertIn('new_metric', summary)
+
+    def test_removed_section_rendered(self):
+        path = _write_tmp(_DIFF_REMOVED_ONLY)
+        try:
+            changes, raw_sections, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        summary = generate_diff_summary(changes, raw_sections, excl_count)
+        self.assertIn('Removed Metrics', summary)
+        self.assertIn('old_metric', summary)
+
+    def test_modified_section_rendered(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, raw_sections, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        summary = generate_diff_summary(changes, raw_sections, excl_count)
+        self.assertIn('Modified Metrics', summary)
+        self.assertIn('http_server_duration', summary)
+
+    def test_diff_sample_block_present_for_changed_metric(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, raw_sections, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        summary = generate_diff_summary(changes, raw_sections, excl_count)
+        self.assertIn('```diff', summary)
+
+    def test_exclusion_count_shown(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, raw_sections, excl_count = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        summary = generate_diff_summary(changes, raw_sections, excl_count)
+        self.assertIn('Excluded: 5', summary)
+
+
+class TestGenerateStructuredJson(unittest.TestCase):
+    """Tests for generate_structured_json()."""
+
+    def test_counts_are_correct(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, _, _ = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        data = generate_structured_json(changes)
+        self.assertEqual(data['added'], 1)    # current_only
+        self.assertEqual(data['removed'], 1)  # baseline_only
+        self.assertEqual(data['modified'], 1) # http_server_duration
+
+    def test_metric_names_list_sorted(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, _, _ = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        data = generate_structured_json(changes)
+        names = data['metric_names']
+        self.assertEqual(names, sorted(names))
+
+    def test_metric_names_deduped(self):
+        path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES)
+        try:
+            changes, _, _ = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        data = generate_structured_json(changes)
+        self.assertEqual(len(data['metric_names']), len(set(data['metric_names'])))
+
+    def test_added_only_produces_correct_output(self):
+        path = _write_tmp(_DIFF_ADDED_ONLY)
+        try:
+            changes, _, _ = parse_diff_file(path)
+        finally:
+            os.unlink(path)
+        data = generate_structured_json(changes)
+        self.assertEqual(data['added'], 1)
+        self.assertEqual(data['removed'], 0)
+        self.assertEqual(data['modified'], 0)
+        self.assertIn('new_metric', data['metric_names'])
+
+
+if __name__ == '__main__':
+    unittest.main()

From 2a74cfcf712e7fce9603176552c91123713c4d21 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 17:34:39 +0000
Subject: [PATCH 4/4] fix: use baseline-first ordering consistently throughout
 diff pipeline

- compare_metrics.py: rename generate_diff signature to (baseline_content,
  current_content) matching diff command conventions; parse baseline before
  current; emit '# Metrics excluded from baseline:' before '# Metrics
  excluded from current:' in output; fix main() call order
- compare_metrics_test.py: swap all generate_diff() call arguments to
  (baseline, current) order; update comments to say baseline first
- metrics_summary.py: check '# Metrics excluded from baseline:' before
  '# Metrics excluded from current:' in condition
- metrics_summary_test.py: update _DIFF_WITH_ALL_CATEGORIES fixture to
  list baseline exclusion count before current
- metrics_summary.sh: expand artifact ID query section with detailed
  troubleshooting logs (show API URL, capture stderr, log jq errors,
  print artifact count and full map on success)

Agent-Logs-Url: https://github.com/jaegertracing/jaeger/sessions/04585a94-2aa5-4d3e-8859-8a15d5ae435a

Co-authored-by: yurishkuro <3523016+yurishkuro@users.noreply.github.com>
---
 scripts/e2e/compare_metrics.py      | 20 +++++++-------
 scripts/e2e/compare_metrics_test.py | 24 ++++++++--------
 scripts/e2e/metrics_summary.py      |  2 +-
 scripts/e2e/metrics_summary.sh      | 43 ++++++++++++++++++++++-------
 scripts/e2e/metrics_summary_test.py |  2 +-
 5 files changed, 57 insertions(+), 34 deletions(-)

diff --git a/scripts/e2e/compare_metrics.py b/scripts/e2e/compare_metrics.py
index 8e3d454715c..0633f0b63d2 100644
--- a/scripts/e2e/compare_metrics.py
+++ b/scripts/e2e/compare_metrics.py
@@ -111,7 +111,7 @@ def parse_metrics(content):
     return metrics,metrics_exclusion_count
 
 
-def generate_diff(current_content, baseline_content):
+def generate_diff(baseline_content, current_content):
     """Compare two Prometheus metrics snapshots and return a unified diff of metric names.
 
     The input files are raw Prometheus text exposition format, scraped directly from
@@ -136,18 +136,18 @@ def generate_diff(current_content, baseline_content):
         - lines  = present in baseline but absent from current → regression/removed
         + lines  = present in current but absent from baseline → newly added
     followed by optional comment lines reporting how many metrics were excluded, e.g.:
-        # Metrics excluded from current: 3
-        # Metrics excluded from baseline: 5
+        # Metrics excluded from baseline: 3
+        # Metrics excluded from current: 5
     These comment lines (prefixed with `# `) are appended only when the diff is
     non-empty; they are informational context, not metric differences themselves.
     """
-    if isinstance(current_content, list):
-        current_content = ''.join(current_content)
     if isinstance(baseline_content, list):
         baseline_content = ''.join(baseline_content)
+    if isinstance(current_content, list):
+        current_content = ''.join(current_content)
 
-    current_metrics, excluded_count_current = parse_metrics(current_content)
     baseline_metrics, excluded_count_baseline = parse_metrics(baseline_content)
+    current_metrics, excluded_count_current = parse_metrics(current_content)
 
     # unified_diff(baseline, current): - = in baseline but not current (removed/regression),
     #                                  + = in current but not baseline (newly added).
@@ -160,11 +160,11 @@ def generate_diff(current_content, baseline_content):
     if len(diff) == 0:
         return ''
 
-    total_excluded = excluded_count_current + excluded_count_baseline
+    total_excluded = excluded_count_baseline + excluded_count_current
 
     exclusion_lines = ''
     if total_excluded > 0:
-        exclusion_lines = f'\n# Metrics excluded from current: {excluded_count_current}\n# Metrics excluded from baseline: {excluded_count_baseline}'
+        exclusion_lines = f'\n# Metrics excluded from baseline: {excluded_count_baseline}\n# Metrics excluded from current: {excluded_count_current}'
 
     return '\n'.join(diff) + exclusion_lines
 
@@ -185,11 +185,11 @@ def main():
     args = parser.parse_args()
 
     # Read input files
-    current_lines = read_metric_file(args.current)
     baseline_lines = read_metric_file(args.baseline)
+    current_lines = read_metric_file(args.current)
 
     # Generate diff
-    diff_lines = generate_diff(current_lines, baseline_lines)
+    diff_lines = generate_diff(baseline_lines, current_lines)
 
     # Check if there are any differences
     if diff_lines:
diff --git a/scripts/e2e/compare_metrics_test.py b/scripts/e2e/compare_metrics_test.py
index 9750808932c..fd6e9ef37a6 100644
--- a/scripts/e2e/compare_metrics_test.py
+++ b/scripts/e2e/compare_metrics_test.py
@@ -54,8 +54,8 @@ def test_empty_snapshots_returns_empty(self):
 
     def test_regression_detected(self):
         """Metric present in baseline but absent from current snapshot → diff is non-empty."""
-        # current=A only, baseline=A+B → B is missing from current (regression/removed)
-        result = generate_diff(_METRIC_A, _METRIC_A + _METRIC_B)
+        # baseline=A+B, current=A only → B is missing from current (regression/removed)
+        result = generate_diff(_METRIC_A + _METRIC_B, _METRIC_A)
         self.assertNotEqual(result, '', 'Expected a non-empty diff for a regression')
         # '-' line = in baseline but not in current (regression)
         self.assertIn('-counter_b', result)
@@ -68,8 +68,8 @@ def test_new_metric_in_current_snapshot_produces_diff(self):
         Silently ignoring new metrics would mask intermittent behaviour where a
         metric alternates between appearing and disappearing across runs.
         """
-        # current=A+B, baseline=A only → B is new in current (newly added)
-        result = generate_diff(_METRIC_A + _METRIC_B, _METRIC_A)
+        # baseline=A only, current=A+B → B is new in current (newly added)
+        result = generate_diff(_METRIC_A, _METRIC_A + _METRIC_B)
         self.assertNotEqual(result, '', 'New metrics in current snapshot should produce a diff')
         # '+' line = in current but not in baseline (newly added)
         self.assertIn('+counter_b', result)
@@ -82,8 +82,8 @@ def test_exclusion_count_difference_does_not_produce_diff(self):
         other), the exclusion-count lines are informational metadata and must not make
         the diff non-empty on their own.
         """
-        # current has metric_a + one 5xx (excluded), baseline has metric_a + zero 5xx
-        result = generate_diff(_METRIC_A_AND_EXCLUDED, _METRIC_A)
+        # baseline has metric_a + zero 5xx, current has metric_a + one 5xx (excluded)
+        result = generate_diff(_METRIC_A, _METRIC_A_AND_EXCLUDED)
         self.assertEqual(
             result,
             '',
@@ -92,8 +92,8 @@ def test_exclusion_count_difference_does_not_produce_diff(self):
 
     def test_mixed_regression_and_new_metric_returns_diff(self):
         """When there is both a regression AND a new metric, the diff is non-empty."""
-        # current=B only, baseline=A only → A is missing (regression/removed), B is new (added)
-        result = generate_diff(_METRIC_B, _METRIC_A)
+        # baseline=A only, current=B only → A is missing (regression/removed), B is new (added)
+        result = generate_diff(_METRIC_A, _METRIC_B)
         self.assertNotEqual(result, '')
         # '-' line = in baseline but not in current (regression)
         self.assertIn('-counter_a', result)
@@ -102,15 +102,15 @@ def test_mixed_regression_and_new_metric_returns_diff(self):
 
     def test_regression_with_exclusions_includes_exclusion_summary(self):
         """When there is a regression and excluded metrics, the output includes counts."""
-        # current=excluded only, baseline=A+excluded → A is missing (regression)
-        result = generate_diff(_METRIC_EXCLUDED_5XX, _METRIC_A + _METRIC_EXCLUDED_5XX)
+        # baseline=A+excluded, current=excluded only → A is missing (regression)
+        result = generate_diff(_METRIC_A + _METRIC_EXCLUDED_5XX, _METRIC_EXCLUDED_5XX)
         self.assertNotEqual(result, '')
-        self.assertIn('# Metrics excluded from current:', result)
         self.assertIn('# Metrics excluded from baseline:', result)
+        self.assertIn('# Metrics excluded from current:', result)
 
     def test_no_exclusions_means_no_exclusion_summary(self):
         """When there are no excluded metrics, the exclusion summary is omitted."""
-        result = generate_diff(_METRIC_A, _METRIC_A + _METRIC_B)
+        result = generate_diff(_METRIC_A + _METRIC_B, _METRIC_A)
         self.assertNotIn('Metrics excluded from', result)
 
 
diff --git a/scripts/e2e/metrics_summary.py b/scripts/e2e/metrics_summary.py
index 6e4ed9cd655..315fed8ca8d 100644
--- a/scripts/e2e/metrics_summary.py
+++ b/scripts/e2e/metrics_summary.py
@@ -28,7 +28,7 @@ def parse_diff_file(diff_path):
         original_line = line.rstrip('\n')
         stripped = original_line.strip()
 
-        if stripped.startswith('# Metrics excluded from current: ') or stripped.startswith('# Metrics excluded from baseline: '):
+        if stripped.startswith('# Metrics excluded from baseline: ') or stripped.startswith('# Metrics excluded from current: '):
             count_str = stripped.split(': ')[1]
             exclusion_count += int(count_str)
             continue
diff --git a/scripts/e2e/metrics_summary.sh b/scripts/e2e/metrics_summary.sh
index 2edc95e59e0..cc7d71777c4 100644
--- a/scripts/e2e/metrics_summary.sh
+++ b/scripts/e2e/metrics_summary.sh
@@ -58,18 +58,41 @@ echo "::endgroup::"
 # locally without a token) we write an empty map and the publish workflow
 # degrades gracefully (no download links, but the rest of the summary still works).
 echo "::group::Querying diff artifact IDs"
-if [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then
-    if gh api "repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/artifacts?per_page=100" \
-        --jq '[.artifacts[] | select(.name | startswith("diff_metrics_snapshot_")) | {key: .name, value: .id}] | from_entries' \
-        > "$METRICS_DIR/artifact_ids.json" 2>/dev/null; then
-        echo "Artifact IDs written to $METRICS_DIR/artifact_ids.json"
-    else
-        echo "Could not query artifact IDs (OK for local runs without a token)"
-        echo '{}' > "$METRICS_DIR/artifact_ids.json"
-    fi
-else
+if [ -z "${GITHUB_RUN_ID:-}" ] || [ -z "${GITHUB_REPOSITORY:-}" ]; then
     echo "GITHUB_RUN_ID or GITHUB_REPOSITORY not set; skipping artifact ID query"
     echo '{}' > "$METRICS_DIR/artifact_ids.json"
+else
+    echo "Querying artifacts for run ${GITHUB_RUN_ID} in ${GITHUB_REPOSITORY}"
+    api_url="repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/artifacts?per_page=100"
+    echo "API URL: https://api.github.com/${api_url}"
+
+    api_output=$(gh api "$api_url" 2>&1)
+    api_exit=$?
+    if [ $api_exit -ne 0 ]; then
+        echo "::warning::gh api call failed (exit $api_exit) — no artifact download links will be rendered"
+        echo "gh api error output:"
+        echo "$api_output"
+        echo '{}' > "$METRICS_DIR/artifact_ids.json"
+    else
+        echo "API call succeeded; filtering diff_metrics_snapshot_* artifacts"
+        jq_output=$(echo "$api_output" | \
+            jq '[.artifacts[] | select(.name | startswith("diff_metrics_snapshot_")) | {key: .name, value: .id}] | from_entries' 2>&1)
+        jq_exit=$?
+        if [ $jq_exit -ne 0 ]; then
+            echo "::warning::jq processing failed (exit $jq_exit) — no artifact download links will be rendered"
+            echo "jq error output:"
+            echo "$jq_output"
+            echo '{}' > "$METRICS_DIR/artifact_ids.json"
+        else
+            echo "$jq_output" > "$METRICS_DIR/artifact_ids.json"
+            artifact_count=$(echo "$jq_output" | jq 'length')
+            echo "Artifact IDs written to $METRICS_DIR/artifact_ids.json (${artifact_count} diff artifact(s) found)"
+            if [ "${artifact_count}" -gt 0 ]; then
+                echo "Artifact ID map:"
+                echo "$jq_output" | jq '.'
+            fi
+        fi
+    fi
 fi
 echo "::endgroup::"
 
diff --git a/scripts/e2e/metrics_summary_test.py b/scripts/e2e/metrics_summary_test.py
index cd8d54a3459..79d5e2dfdec 100644
--- a/scripts/e2e/metrics_summary_test.py
+++ b/scripts/e2e/metrics_summary_test.py
@@ -22,8 +22,8 @@
 -http_server_duration{le="+Inf"}
 +current_only{job="b"}
 +http_server_duration{http_route="/status",le="+Inf"}
-# Metrics excluded from current: 2
 # Metrics excluded from baseline: 3
+# Metrics excluded from current: 2
 """
 
 # A diff with only added metrics (present in current, absent from baseline).