diff --git a/.github/actions/verify-metrics-snapshot/action.yaml b/.github/actions/verify-metrics-snapshot/action.yaml
index 326a2861674..e0612a3242d 100644
--- a/.github/actions/verify-metrics-snapshot/action.yaml
+++ b/.github/actions/verify-metrics-snapshot/action.yaml
@@ -94,7 +94,7 @@ runs:
       shell: bash
       run: |
         python3 -m pip install prometheus-client
-        if python3 ./scripts/e2e/compare_metrics.py --file1 ./.metrics/${{ inputs.snapshot }}.txt --file2 ./.metrics/baseline_${{ inputs.snapshot }}.txt --output ./.metrics/diff_${{ inputs.snapshot }}.txt; then
+        if python3 ./scripts/e2e/compare_metrics.py --current ./.metrics/${{ inputs.snapshot }}.txt --baseline ./.metrics/baseline_${{ inputs.snapshot }}.txt --output ./.metrics/diff_${{ inputs.snapshot }}.txt; then
           echo "No differences found in metrics"
         else
           echo "🛑 Differences found in metrics"
diff --git a/.github/scripts/ci-summary-report-publish.js b/.github/scripts/ci-summary-report-publish.js
index 31ae50865fc..06c20e92de9 100644
--- a/.github/scripts/ci-summary-report-publish.js
+++ b/.github/scripts/ci-summary-report-publish.js
@@ -97,7 +97,12 @@ function sanitizeSnapshots(raw) {
         if (clean) names.push(clean);
       }
     }
-    result.push({ snapshot, added, removed, modified, metric_names: names });
+    // Validate artifact_id (optional, used for the diff artifact download link).
+    const artifactId = safeNum(entry.artifact_id);
+    // artifact_id must be a positive integer (GitHub artifact IDs are always > 0)
+    const sanitizedArtifactId = (artifactId !== null && Number.isInteger(artifactId) && artifactId > 0)
+      ? artifactId : null;
+    result.push({ snapshot, added, removed, modified, metric_names: names, artifact_id: sanitizedArtifactId });
   }
   return result.length > 0 ? result : null;
 }
@@ -158,9 +163,14 @@ function computeCoverage(s) {
  * All text is built from trusted templates; metric names have been validated
  * through sanitizeMetricName() and are rendered in backtick-code spans.
  * @param {Array|null} snapshots - Sanitized snapshots from computeMetrics
+ * @param {string} [ciRunUrl] - URL to the CI run for linking to step logs
+ * @param {string} [artifactUrlPrefix] - Base URL for artifact downloads:
+ *   `https://github.com/{owner}/{repo}/actions/runs/{runId}/artifacts`.
+ *   When provided, each snapshot with a valid artifact_id gets a direct
+ *   download link appended to its header line.
  * @returns {string} - Markdown detail block, or empty string if no data
  */
-function formatMetricsDetail(snapshots) {
+function formatMetricsDetail(snapshots, { ciRunUrl, artifactUrlPrefix } = {}) {
   if (!snapshots || snapshots.length === 0) return '';
 
   const lines = [
@@ -170,8 +180,18 @@
     '',
   ];
 
+  if (ciRunUrl) {
+    lines.push(`_For label-level diff details, open the [CI run](${ciRunUrl}) and expand the "Compare metrics and generate summary" step logs._`);
+    lines.push('');
+  }
+
   for (const snap of snapshots) {
-    lines.push(`**${snap.snapshot}**`);
+    // Build the snapshot header, optionally with a direct diff artifact download link.
+ let snapHeader = `**${snap.snapshot}**`; + if (artifactUrlPrefix && snap.artifact_id !== null && snap.artifact_id !== undefined) { + snapHeader += ` — [⬇️ download diff](${artifactUrlPrefix}/${snap.artifact_id})`; + } + lines.push(snapHeader); const parts = []; if (snap.added !== null && snap.added > 0) parts.push(`${snap.added} added`); if (snap.removed !== null && snap.removed > 0) parts.push(`${snap.removed} removed`); @@ -201,9 +221,11 @@ function formatMetricsDetail(snapshots) { * @param {string} footer - links + timestamp line * @param {object} [opts] * @param {Array|null} [opts.metricsSnapshots] - sanitized snapshot data for detail rendering + * @param {string} [opts.ciRunUrl] - URL to the CI run, passed through to formatMetricsDetail + * @param {string} [opts.artifactUrlPrefix] - Base URL for artifact downloads, passed through * @returns {string} */ -function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots } = {}) { +function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots, ciRunUrl, artifactUrlPrefix } = {}) { const parts = [ COMMENT_TAG, '## CI Summary Report', @@ -212,7 +234,7 @@ function buildCommentBody(metricsText, coverageText, footer, { metricsSnapshots metricsText, ]; - const detail = formatMetricsDetail(metricsSnapshots); + const detail = formatMetricsDetail(metricsSnapshots, { ciRunUrl, artifactUrlPrefix }); if (detail) { parts.push(detail); } @@ -301,6 +323,8 @@ async function postOrUpdateComment(github, owner, repo, prNumber, body, core, { * @param {string} opts.inputs.prNumber - raw string from step output * @param {string} opts.inputs.ciRunUrl * @param {string} opts.inputs.publishUrl + * @param {string} [opts.inputs.sourceRunId] - numeric run ID of the CI Orchestrator run; + * used to construct direct artifact download URLs (e.g. .../artifacts/{id}) */ async function handler({ github, core, fs, inputs }) { const { owner, repo, headSha, ciRunUrl, publishUrl } = inputs; @@ -310,6 +334,13 @@ async function handler({ github, core, fs, inputs }) { const ts = new Date().toISOString().replace('T', ' ').replace(/\.\d+Z$/, ' UTC'); const footer = `${links}\n_${ts}_`; + // Construct the artifact URL prefix for per-snapshot diff download links. + // All parts come from trusted sources: owner/repo from context, runId validated as integer. + const sourceRunId = parseInt(inputs.sourceRunId, 10) || null; + const artifactUrlPrefix = (owner && repo && sourceRunId) + ? `https://github.com/${owner}/${repo}/actions/runs/${sourceRunId}/artifacts` + : null; + // Read structured data written by ci-summary-report.yml. // All fields are primitives (enums, numbers, booleans) — no free-form text. let s; @@ -349,7 +380,11 @@ async function handler({ github, core, fs, inputs }) { // after a green run. Only create a new comment when there is something to report. 
const hasIssues = metrics.conclusion === 'failure' || coverage.conclusion === 'failure' || metrics.totalChanges > 0; - const body = buildCommentBody(metrics.text, coverage.text, footer, { metricsSnapshots: metrics.snapshots }); + const body = buildCommentBody(metrics.text, coverage.text, footer, { + metricsSnapshots: metrics.snapshots, + ciRunUrl, + artifactUrlPrefix, + }); await postOrUpdateComment(github, owner, repo, prNumber, body, core, { createNew: hasIssues }); } else { core.info('No PR number; skipping PR comment.'); diff --git a/.github/scripts/ci-summary-report-publish.test.js b/.github/scripts/ci-summary-report-publish.test.js index 8334c131578..4b2ab570552 100644 --- a/.github/scripts/ci-summary-report-publish.test.js +++ b/.github/scripts/ci-summary-report-publish.test.js @@ -175,6 +175,44 @@ describe('sanitizeSnapshots', () => { expect(result).toHaveLength(1); expect(result[0].snapshot).toBe('valid'); }); + + test('sanitizes artifact_id as a positive integer', () => { + const input = [{ + snapshot: 'test_snapshot', + added: 1, removed: 0, modified: 0, + metric_names: ['http_server_duration'], + artifact_id: 6359406281, + }]; + const result = sanitizeSnapshots(input); + expect(result[0].artifact_id).toBe(6359406281); + }); + + test('rejects non-integer artifact_id', () => { + const input = [{ + snapshot: 'test_snapshot', + added: 1, removed: 0, modified: 0, + metric_names: [], + artifact_id: 'evil_string', + }]; + const result = sanitizeSnapshots(input); + expect(result[0].artifact_id).toBeNull(); + }); + + test('rejects zero and negative artifact_id', () => { + const input = [ + { snapshot: 'snap_a', added: 0, removed: 0, modified: 0, metric_names: [], artifact_id: 0 }, + { snapshot: 'snap_b', added: 0, removed: 0, modified: 0, metric_names: [], artifact_id: -1 }, + ]; + const result = sanitizeSnapshots(input); + expect(result[0].artifact_id).toBeNull(); + expect(result[1].artifact_id).toBeNull(); + }); + + test('sets artifact_id to null when absent', () => { + const input = [{ snapshot: 'test_snapshot', added: 1, removed: 0, modified: 0, metric_names: [] }]; + const result = sanitizeSnapshots(input); + expect(result[0].artifact_id).toBeNull(); + }); }); // ── computeMetrics ──────────────────────────────────────────────────────────── @@ -285,14 +323,53 @@ describe('formatMetricsDetail', () => { expect(detail).toContain('3 removed'); }); - test('omits zero counts from summary line', () => { + test('includes link to CI run when ciRunUrl provided', () => { const detail = formatMetricsDetail([{ - snapshot: 'test', added: 0, removed: 5, modified: 0, - metric_names: [], + snapshot: 'cassandra_v2', + added: 1, removed: 0, modified: 0, + metric_names: ['http_server_duration'], + }], { ciRunUrl: 'https://github.com/org/repo/actions/runs/123' }); + expect(detail).toContain('[CI run](https://github.com/org/repo/actions/runs/123)'); + expect(detail).toContain('Compare metrics and generate summary'); + }); + + test('omits CI run link when ciRunUrl not provided', () => { + const detail = formatMetricsDetail([{ + snapshot: 'cassandra_v2', + added: 1, removed: 0, modified: 0, + metric_names: ['http_server_duration'], }]); - expect(detail).not.toContain('added'); - expect(detail).toContain('5 removed'); - expect(detail).not.toContain('modified'); + expect(detail).not.toContain('[CI run]'); + }); + + test('renders per-snapshot artifact download link when artifactUrlPrefix and artifact_id provided', () => { + const detail = formatMetricsDetail([{ + snapshot: 'cassandra_v2', + added: 1, 
removed: 0, modified: 0, + metric_names: ['http_server_duration'], + artifact_id: 6359406281, + }], { artifactUrlPrefix: 'https://github.com/org/repo/actions/runs/123/artifacts' }); + expect(detail).toContain('[⬇️ download diff](https://github.com/org/repo/actions/runs/123/artifacts/6359406281)'); + }); + + test('omits artifact download link when artifact_id is null', () => { + const detail = formatMetricsDetail([{ + snapshot: 'cassandra_v2', + added: 1, removed: 0, modified: 0, + metric_names: ['http_server_duration'], + artifact_id: null, + }], { artifactUrlPrefix: 'https://github.com/org/repo/actions/runs/123/artifacts' }); + expect(detail).not.toContain('download diff'); + }); + + test('omits artifact download link when artifactUrlPrefix not provided', () => { + const detail = formatMetricsDetail([{ + snapshot: 'cassandra_v2', + added: 1, removed: 0, modified: 0, + metric_names: ['http_server_duration'], + artifact_id: 6359406281, + }]); + expect(detail).not.toContain('download diff'); }); }); @@ -420,6 +497,31 @@ describe('buildCommentBody', () => { expect(metricsPos).toBeLessThan(detailsPos); expect(detailsPos).toBeLessThan(coveragePos); }); + + test('includes CI run link in metrics detail when ciRunUrl provided', () => { + const snapshots = [{ + snapshot: 'cassandra_v2', + added: 1, removed: 0, modified: 0, + metric_names: ['http_server_duration'], + artifact_id: null, + }]; + const ciRunUrl = 'https://github.com/org/repo/actions/runs/999'; + const body = buildCommentBody('❌ 1 metric change(s) detected', coverageText, footer, { metricsSnapshots: snapshots, ciRunUrl }); + expect(body).toContain(`[CI run](${ciRunUrl})`); + expect(body).toContain('Compare metrics and generate summary'); + }); + + test('includes per-snapshot artifact download link when artifactUrlPrefix and artifact_id provided', () => { + const snapshots = [{ + snapshot: 'cassandra_v2', + added: 1, removed: 0, modified: 0, + metric_names: ['http_server_duration'], + artifact_id: 6359406281, + }]; + const artifactUrlPrefix = 'https://github.com/org/repo/actions/runs/999/artifacts'; + const body = buildCommentBody('❌ 1 metric change(s) detected', coverageText, footer, { metricsSnapshots: snapshots, artifactUrlPrefix }); + expect(body).toContain(`[⬇️ download diff](${artifactUrlPrefix}/6359406281)`); + }); }); // ── postCheckRun ────────────────────────────────────────────────────────────── diff --git a/.github/workflows/ci-summary-report-publish.yml b/.github/workflows/ci-summary-report-publish.yml index 88ad0559e48..cc532d2e821 100644 --- a/.github/workflows/ci-summary-report-publish.yml +++ b/.github/workflows/ci-summary-report-publish.yml @@ -116,12 +116,13 @@ jobs: await handler({ github, core, fs: require('fs'), inputs: { - owner: context.repo.owner, - repo: context.repo.repo, - headSha: '${{ steps.source-run.outputs.head_sha }}', - prNumber: '${{ steps.source-run.outputs.pr_number || steps.pr.outputs.pr_number }}', - ciRunUrl: '${{ steps.source-run.outputs.source_run_url }}', - publishUrl: '${{ steps.source-run.outputs.summary_run_url }}', + owner: context.repo.owner, + repo: context.repo.repo, + headSha: '${{ steps.source-run.outputs.head_sha }}', + prNumber: '${{ steps.source-run.outputs.pr_number || steps.pr.outputs.pr_number }}', + ciRunUrl: '${{ steps.source-run.outputs.source_run_url }}', + publishUrl: '${{ steps.source-run.outputs.summary_run_url }}', + sourceRunId: '${{ steps.source-run.outputs.run_id }}', }, }); env: diff --git a/.github/workflows/ci-summary-report.yml 
b/.github/workflows/ci-summary-report.yml index 1d26cb75d3f..b83bfbf1e66 100644 --- a/.github/workflows/ci-summary-report.yml +++ b/.github/workflows/ci-summary-report.yml @@ -40,6 +40,8 @@ jobs: - name: Compare metrics and generate summary id: compare-metrics + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash run: bash ./scripts/e2e/metrics_summary.sh diff --git a/scripts/e2e/compare_metrics.py b/scripts/e2e/compare_metrics.py index a64481a2eda..0633f0b63d2 100644 --- a/scripts/e2e/compare_metrics.py +++ b/scripts/e2e/compare_metrics.py @@ -111,7 +111,7 @@ def parse_metrics(content): return metrics,metrics_exclusion_count -def generate_diff(file1_content, file2_content): +def generate_diff(baseline_content, current_content): """Compare two Prometheus metrics snapshots and return a unified diff of metric names. The input files are raw Prometheus text exposition format, scraped directly from @@ -132,21 +132,26 @@ def generate_diff(file1_content, file2_content): The diff is performed on these sorted, value-free metric strings. If the two snapshots produce the same set of strings the diff is empty and this function returns ''. When there are differences, the return value is a unified diff + using the standard convention: + - lines = present in baseline but absent from current → regression/removed + + lines = present in current but absent from baseline → newly added followed by optional comment lines reporting how many metrics were excluded, e.g.: - # Metrics excluded from A: 3 - # Metrics excluded from B: 5 + # Metrics excluded from baseline: 3 + # Metrics excluded from current: 5 These comment lines (prefixed with `# `) are appended only when the diff is non-empty; they are informational context, not metric differences themselves. """ - if isinstance(file1_content, list): - file1_content = ''.join(file1_content) - if isinstance(file2_content, list): - file2_content = ''.join(file2_content) + if isinstance(baseline_content, list): + baseline_content = ''.join(baseline_content) + if isinstance(current_content, list): + current_content = ''.join(current_content) - metrics1,excluded_metrics_count1 = parse_metrics(file1_content) - metrics2,excluded_metrics_count2 = parse_metrics(file2_content) + baseline_metrics, excluded_count_baseline = parse_metrics(baseline_content) + current_metrics, excluded_count_current = parse_metrics(current_content) - diff = list(unified_diff(metrics1, metrics2, lineterm='', n=0)) + # unified_diff(baseline, current): - = in baseline but not current (removed/regression), + # + = in current but not baseline (newly added). + diff = list(unified_diff(baseline_metrics, current_metrics, lineterm='', n=0)) # Exclusion counts are informational context appended to the diff output. 
# They must not be written when the diff itself is empty: two snapshots with @@ -155,12 +160,12 @@ def generate_diff(file1_content, file2_content): if len(diff) == 0: return '' - total_excluded = excluded_metrics_count1 + excluded_metrics_count2 - + total_excluded = excluded_count_baseline + excluded_count_current + exclusion_lines = '' if total_excluded > 0: - exclusion_lines = f'\n# Metrics excluded from A: {excluded_metrics_count1}\n# Metrics excluded from B: {excluded_metrics_count2}' - + exclusion_lines = f'\n# Metrics excluded from baseline: {excluded_count_baseline}\n# Metrics excluded from current: {excluded_count_current}' + return '\n'.join(diff) + exclusion_lines def write_diff_file(diff_lines, output_path): @@ -172,20 +177,20 @@ def write_diff_file(diff_lines, output_path): def main(): parser = argparse.ArgumentParser(description='Generate diff between two Jaeger metric files') - parser.add_argument('--file1', help='Path to first metric file') - parser.add_argument('--file2', help='Path to second metric file') + parser.add_argument('--current', help='Path to the current metric file (e.g. from the PR)') + parser.add_argument('--baseline', help='Path to the baseline metric file (e.g. from main branch)') parser.add_argument('--output', '-o', default='metrics_diff.txt', help='Output diff file path (default: metrics_diff.txt)') - + args = parser.parse_args() - + # Read input files - file1_lines = read_metric_file(args.file1) - file2_lines = read_metric_file(args.file2) - + baseline_lines = read_metric_file(args.baseline) + current_lines = read_metric_file(args.current) + # Generate diff - diff_lines = generate_diff(file1_lines, file2_lines) - + diff_lines = generate_diff(baseline_lines, current_lines) + # Check if there are any differences if diff_lines: print("differences found between the metric files.") diff --git a/scripts/e2e/compare_metrics_test.py b/scripts/e2e/compare_metrics_test.py index 10068cdba19..fd6e9ef37a6 100644 --- a/scripts/e2e/compare_metrics_test.py +++ b/scripts/e2e/compare_metrics_test.py @@ -54,11 +54,11 @@ def test_empty_snapshots_returns_empty(self): def test_regression_detected(self): """Metric present in baseline but absent from current snapshot → diff is non-empty.""" - # current=A only, baseline=A+B → B is missing from current (regression) - result = generate_diff(_METRIC_A, _METRIC_A + _METRIC_B) + # baseline=A+B, current=A only → B is missing from current (regression/removed) + result = generate_diff(_METRIC_A + _METRIC_B, _METRIC_A) self.assertNotEqual(result, '', 'Expected a non-empty diff for a regression') - # The diff must contain a '+' line for the missing metric (counter_b) - self.assertIn('+counter_b', result) + # '-' line = in baseline but not in current (regression) + self.assertIn('-counter_b', result) def test_new_metric_in_current_snapshot_produces_diff(self): """Metric present in current snapshot but absent from baseline → diff is non-empty. @@ -68,11 +68,11 @@ def test_new_metric_in_current_snapshot_produces_diff(self): Silently ignoring new metrics would mask intermittent behaviour where a metric alternates between appearing and disappearing across runs. 
""" - # current=A+B, baseline=A only → B is new in current - result = generate_diff(_METRIC_A + _METRIC_B, _METRIC_A) + # baseline=A only, current=A+B → B is new in current (newly added) + result = generate_diff(_METRIC_A, _METRIC_A + _METRIC_B) self.assertNotEqual(result, '', 'New metrics in current snapshot should produce a diff') - # '-' line = in current but not in baseline - self.assertIn('-counter_b', result) + # '+' line = in current but not in baseline (newly added) + self.assertIn('+counter_b', result) def test_exclusion_count_difference_does_not_produce_diff(self): """Snapshots that differ only in excluded-metric counts produce no diff. @@ -82,8 +82,8 @@ def test_exclusion_count_difference_does_not_produce_diff(self): other), the exclusion-count lines are informational metadata and must not make the diff non-empty on their own. """ - # current has metric_a + one 5xx (excluded), baseline has metric_a + zero 5xx - result = generate_diff(_METRIC_A_AND_EXCLUDED, _METRIC_A) + # baseline has metric_a + zero 5xx, current has metric_a + one 5xx (excluded) + result = generate_diff(_METRIC_A, _METRIC_A_AND_EXCLUDED) self.assertEqual( result, '', @@ -92,24 +92,25 @@ def test_exclusion_count_difference_does_not_produce_diff(self): def test_mixed_regression_and_new_metric_returns_diff(self): """When there is both a regression AND a new metric, the diff is non-empty.""" - # current=B only, baseline=A only → A is missing (regression), B is new - result = generate_diff(_METRIC_B, _METRIC_A) + # baseline=A only, current=B only → A is missing (regression/removed), B is new (added) + result = generate_diff(_METRIC_A, _METRIC_B) self.assertNotEqual(result, '') - self.assertIn('+counter_a', result) - # The new metric should still appear in the raw diff output for visibility - self.assertIn('-counter_b', result) + # '-' line = in baseline but not in current (regression) + self.assertIn('-counter_a', result) + # '+' line = in current but not in baseline (newly added) + self.assertIn('+counter_b', result) def test_regression_with_exclusions_includes_exclusion_summary(self): """When there is a regression and excluded metrics, the output includes counts.""" - # current=excluded only, baseline=A+excluded → A is missing (regression) - result = generate_diff(_METRIC_EXCLUDED_5XX, _METRIC_A + _METRIC_EXCLUDED_5XX) + # baseline=A+excluded, current=excluded only → A is missing (regression) + result = generate_diff(_METRIC_A + _METRIC_EXCLUDED_5XX, _METRIC_EXCLUDED_5XX) self.assertNotEqual(result, '') - self.assertIn('# Metrics excluded from A:', result) - self.assertIn('# Metrics excluded from B:', result) + self.assertIn('# Metrics excluded from baseline:', result) + self.assertIn('# Metrics excluded from current:', result) def test_no_exclusions_means_no_exclusion_summary(self): """When there are no excluded metrics, the exclusion summary is omitted.""" - result = generate_diff(_METRIC_A, _METRIC_A + _METRIC_B) + result = generate_diff(_METRIC_A + _METRIC_B, _METRIC_A) self.assertNotIn('Metrics excluded from', result) diff --git a/scripts/e2e/metrics_summary.py b/scripts/e2e/metrics_summary.py index f5b1c6e15dc..315fed8ca8d 100644 --- a/scripts/e2e/metrics_summary.py +++ b/scripts/e2e/metrics_summary.py @@ -28,7 +28,7 @@ def parse_diff_file(diff_path): original_line = line.rstrip('\n') stripped = original_line.strip() - if stripped.startswith('# Metrics excluded from A: ') or stripped.startswith('# Metrics excluded from B: '): + if stripped.startswith('# Metrics excluded from baseline: ') or 
stripped.startswith('# Metrics excluded from current: '):
             count_str = stripped.split(': ')[1]
             exclusion_count += int(count_str)
             continue
@@ -73,19 +73,38 @@ def extract_metric_name(line):
         return line.split('{')[0].strip()
     return line.strip()
 
-def get_raw_diff_sample(raw_lines, max_lines=7):
+def get_raw_diff_sample(raw_lines, max_lines=8):
     """
     Get sample raw diff lines, preserving original diff formatting.
+
+    For modified metrics that have both removed (-) and added (+) lines,
+    interleaves them so both sides are visible when the sample is truncated,
+    avoiding the problem of showing only the removed side.
     """
     if not raw_lines:
         return []
 
-    # Take up to max_lines
-    sample_lines = raw_lines[:max_lines]
-    if len(raw_lines) > max_lines:
-        sample_lines.append("...")
+    removed_lines = [l for l in raw_lines if l.startswith('-')]
+    added_lines = [l for l in raw_lines if l.startswith('+')]
+
+    # If only one direction of change (pure add or pure remove), just truncate.
+    if not removed_lines or not added_lines:
+        sample_lines = raw_lines[:max_lines]
+        if len(raw_lines) > max_lines:
+            sample_lines.append("...")
+        return sample_lines
+
+    # Interleave pairs of (removed, added) lines so both sides are always visible.
+    shown_pairs = min(max(1, max_lines // 2), len(removed_lines), len(added_lines))
+    interleaved = []
+    for i in range(shown_pairs):
+        interleaved.append(removed_lines[i])
+        interleaved.append(added_lines[i])
+
+    if len(removed_lines) > shown_pairs or len(added_lines) > shown_pairs:
+        interleaved.append("...")
 
-    return sample_lines
+    return interleaved
 
 def generate_diff_summary(changes, raw_diff_sections, exclusion_count):
     """
diff --git a/scripts/e2e/metrics_summary.sh b/scripts/e2e/metrics_summary.sh
index 67983ece935..cc7d71777c4 100644
--- a/scripts/e2e/metrics_summary.sh
+++ b/scripts/e2e/metrics_summary.sh
@@ -3,22 +3,24 @@
 # Copyright (c) 2025 The Jaeger Authors.
 # SPDX-License-Identifier: Apache-2.0
 
-# Enable debug tracing and exit on error
-set -exo pipefail
+# Exit on error, treat unset variables as errors, fail on pipe errors.
+set -euo pipefail
 
 METRICS_DIR="${METRICS_DIR:-./.artifacts}"
 declare -a summary_files=()
 declare -a json_files=()
 total_changes=0
 
+echo "::group::Metrics diff setup"
 echo "Starting metrics diff processing in directory: $METRICS_DIR"
 echo "Directory structure:"
 ls -la "$METRICS_DIR" || echo "Metrics directory listing failed"
+echo "::endgroup::"
 
 # Verify 1-to-1: every metrics_snapshot_* artifact must have a diff_metrics_snapshot_* artifact.
 # verify-metrics-snapshot always uploads a diff artifact on PRs (empty stub if no baseline),
 # so a missing diff dir means that action never ran for that snapshot — an infra failure.
-echo "=== Checking for missing diff artifacts ==="
+echo "::group::Checking for missing diff artifacts"
 declare -a missing_diffs=()
 found_any_snapshot=false
 for snapshot_dir in "$METRICS_DIR"/metrics_snapshot_*/; do
@@ -41,12 +43,58 @@ if [ ${#missing_diffs[@]} -gt 0 ]; then
 else
   echo "INFRA_ERRORS=false" >> "$GITHUB_OUTPUT"
 fi
+echo "::endgroup::"
 
 # Debug: List all diff files found
-echo "=== Searching for diff files ==="
+echo "::group::Searching for diff files"
 find "$METRICS_DIR" -type f -name "diff_*.txt" | while read -r file; do
   echo "Found diff file: $file"
 done
+echo "::endgroup::"
+
+# Query artifact IDs for all diff_metrics_snapshot_* artifacts in this run.
+# These IDs are used by the publish workflow to render direct download links
+# in the PR comment. The query is optional: if it fails (e.g. when running
+# locally without a token) we write an empty map and the publish workflow
+# degrades gracefully (no download links, but the rest of the summary still works).
+echo "::group::Querying diff artifact IDs"
+if [ -z "${GITHUB_RUN_ID:-}" ] || [ -z "${GITHUB_REPOSITORY:-}" ]; then
+  echo "GITHUB_RUN_ID or GITHUB_REPOSITORY not set; skipping artifact ID query"
+  echo '{}' > "$METRICS_DIR/artifact_ids.json"
+else
+  echo "Querying artifacts for run ${GITHUB_RUN_ID} in ${GITHUB_REPOSITORY}"
+  api_url="repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/artifacts?per_page=100"
+  echo "API URL: https://api.github.com/${api_url}"
+  # "|| var=$?" keeps a failed command substitution from exiting the script under "set -e".
+  api_exit=0
+  api_output=$(gh api "$api_url" 2>&1) || api_exit=$?
+  if [ $api_exit -ne 0 ]; then
+    echo "::warning::gh api call failed (exit $api_exit) — no artifact download links will be rendered"
+    echo "gh api error output:"
+    echo "$api_output"
+    echo '{}' > "$METRICS_DIR/artifact_ids.json"
+  else
+    echo "API call succeeded; filtering diff_metrics_snapshot_* artifacts"
+    jq_exit=0
+    jq_output=$(echo "$api_output" | \
+      jq '[.artifacts[] | select(.name | startswith("diff_metrics_snapshot_")) | {key: .name, value: .id}] | from_entries' 2>&1) || jq_exit=$?
+    if [ $jq_exit -ne 0 ]; then
+      echo "::warning::jq processing failed (exit $jq_exit) — no artifact download links will be rendered"
+      echo "jq error output:"
+      echo "$jq_output"
+      echo '{}' > "$METRICS_DIR/artifact_ids.json"
+    else
+      echo "$jq_output" > "$METRICS_DIR/artifact_ids.json"
+      artifact_count=$(echo "$jq_output" | jq 'length')
+      echo "Artifact IDs written to $METRICS_DIR/artifact_ids.json (${artifact_count} diff artifact(s) found)"
+      if [ "${artifact_count}" -gt 0 ]; then
+        echo "Artifact ID map:"
+        echo "$jq_output" | jq '.'
+      fi
+    fi
+  fi
+fi
+echo "::endgroup::"
 
 # Process all non-empty diff files.
 # Empty diff files are stubs uploaded by verify-metrics-snapshot when there is no
@@ -74,11 +122,12 @@ while IFS= read -r -d '' diff_file; do
     summary_file="$dir/summary_$snapshot_name.md"
     json_file="$dir/changes_$snapshot_name.json"
 
-    echo "Generating summary for $snapshot_name"
+    echo "::group::Generating summary for $snapshot_name"
     python3 ./scripts/e2e/metrics_summary.py \
      --diff "$diff_file" \
      --output "$summary_file" \
      --json-output "$json_file"
+    echo "::endgroup::"
 
     summary_files+=("$summary_file")
     json_files+=("$json_file")
@@ -110,13 +159,24 @@ else
 fi
 
 # Merge per-snapshot JSON files into a single metrics_snapshots.json.
-# Each entry gets a "snapshot" field with the snapshot name.
+# Each entry gets a "snapshot" field with the snapshot name and an optional
+# "artifact_id" field (integer) for the corresponding diff artifact download link.
 # Capped at 50 entries to match the publish workflow's MAX_SNAPSHOTS limit.
 # The trusted publish workflow validates this data before rendering.
 python3 - "$METRICS_DIR" "${json_files[@]}" <<'PYEOF'
 import json, os, sys
 metrics_dir = sys.argv[1]
 MAX_SNAPSHOTS = 50
+
+# Load artifact IDs if available (produced by the gh api query above).
+# artifact_ids maps artifact name → artifact ID (integer).
+artifact_ids = {} +try: + with open(os.path.join(metrics_dir, 'artifact_ids.json')) as f: + artifact_ids = json.load(f) +except (FileNotFoundError, json.JSONDecodeError): + pass + snapshots = [] for path in sys.argv[2:]: if len(snapshots) >= MAX_SNAPSHOTS: @@ -128,6 +188,12 @@ for path in sys.argv[2:]: basename = os.path.basename(path) name = basename.removeprefix('changes_').removesuffix('.json') data['snapshot'] = name + # Look up the artifact ID for this snapshot's diff artifact so the + # publish workflow can render a direct download link. + diff_artifact_name = f"diff_{name}" + artifact_id = artifact_ids.get(diff_artifact_name) + if isinstance(artifact_id, int): + data['artifact_id'] = artifact_id snapshots.append(data) except Exception as e: print(f"Warning: could not read {path}: {e}", file=sys.stderr) @@ -152,10 +218,11 @@ if [ "$total_changes" -gt 0 ]; then echo "" for summary_file in "${summary_files[@]}"; do file_name=$(basename "$summary_file" .md) - echo "--- ${file_name} ---" + echo "::group::${file_name}" echo "" cat "$summary_file" echo "" + echo "::endgroup::" done elif [ ${#missing_diffs[@]} -gt 0 ]; then echo "No metric changes in available diffs, but some diff artifacts were missing (see above)." diff --git a/scripts/e2e/metrics_summary_test.py b/scripts/e2e/metrics_summary_test.py new file mode 100644 index 00000000000..79d5e2dfdec --- /dev/null +++ b/scripts/e2e/metrics_summary_test.py @@ -0,0 +1,288 @@ +# Copyright (c) 2025 The Jaeger Authors. +# SPDX-License-Identifier: Apache-2.0 + +import os +import tempfile +import unittest +from metrics_summary import ( + extract_metric_name, + get_raw_diff_sample, + parse_diff_file, + generate_diff_summary, + generate_structured_json, +) + + +# A minimal unified diff that exercises added/removed/modified cases and exclusion counts. +_DIFF_WITH_ALL_CATEGORIES = """\ +--- ++++ +@@ -1,3 +1,3 @@ +-baseline_only{job="a"} +-http_server_duration{le="+Inf"} ++current_only{job="b"} ++http_server_duration{http_route="/status",le="+Inf"} +# Metrics excluded from baseline: 3 +# Metrics excluded from current: 2 +""" + +# A diff with only added metrics (present in current, absent from baseline). +_DIFF_ADDED_ONLY = """\ +--- ++++ +@@ -1 +2 @@ ++new_metric{job="a"} +""" + +# A diff with only removed metrics (present in baseline, absent from current). 
+_DIFF_REMOVED_ONLY = """\ +--- ++++ +@@ -2 +1 @@ +-old_metric{job="b"} +""" + + +def _write_tmp(content): + """Write content to a temp file and return its path.""" + f = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) + f.write(content) + f.close() + return f.name + + +class TestExtractMetricName(unittest.TestCase): + """Tests for extract_metric_name().""" + + def test_extracts_name_before_braces(self): + self.assertEqual(extract_metric_name('http_server_duration{le="+Inf"}'), 'http_server_duration') + + def test_returns_bare_name_when_no_braces(self): + self.assertEqual(extract_metric_name('my_metric'), 'my_metric') + + def test_strips_whitespace(self): + self.assertEqual(extract_metric_name(' my_metric '), 'my_metric') + + +class TestGetRawDiffSample(unittest.TestCase): + """Tests for get_raw_diff_sample().""" + + def test_empty_input_returns_empty(self): + self.assertEqual(get_raw_diff_sample([]), []) + + def test_pure_removals_are_truncated(self): + lines = [f'-metric_{i}' for i in range(10)] + result = get_raw_diff_sample(lines, max_lines=4) + self.assertEqual(result[:4], lines[:4]) + self.assertEqual(result[-1], '...') + + def test_pure_additions_are_truncated(self): + lines = [f'+metric_{i}' for i in range(10)] + result = get_raw_diff_sample(lines, max_lines=4) + self.assertEqual(result[:4], lines[:4]) + self.assertEqual(result[-1], '...') + + def test_interleaves_removed_and_added_lines(self): + """Modified metrics with both - and + lines are interleaved.""" + removed = [f'-metric{{a="{i}"}}' for i in range(4)] + added = [f'+metric{{a="{i}",b="x"}}' for i in range(4)] + result = get_raw_diff_sample(removed + added, max_lines=4) + # Pairs: (-, +, -, +, ...) + self.assertTrue(result[0].startswith('-')) + self.assertTrue(result[1].startswith('+')) + self.assertTrue(result[2].startswith('-')) + self.assertTrue(result[3].startswith('+')) + + def test_interleaved_truncation_adds_ellipsis(self): + removed = [f'-metric{{a="{i}"}}' for i in range(10)] + added = [f'+metric{{a="{i}",b="x"}}' for i in range(10)] + result = get_raw_diff_sample(removed + added, max_lines=4) + self.assertIn('...', result) + # Should only show max_lines//2 = 2 pairs + non_ellipsis = [l for l in result if l != '...'] + self.assertEqual(len(non_ellipsis), 4) # 2 pairs × 2 lines each + + def test_no_ellipsis_when_within_limit(self): + lines = ['-m1', '+m1_v2'] + result = get_raw_diff_sample(lines, max_lines=8) + self.assertNotIn('...', result) + + +class TestParseDiffFile(unittest.TestCase): + """Tests for parse_diff_file().""" + + def test_parses_added_removed_and_modified(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, raw_sections, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + + # baseline_only is in baseline but not current → removed + self.assertIn('baseline_only', changes['removed']) + # current_only is in current but not baseline → added + self.assertIn('current_only', changes['added']) + # http_server_duration appears in both → modified + self.assertIn('http_server_duration', changes['modified']) + # Not in added or removed after reclassification + self.assertNotIn('http_server_duration', changes['added']) + self.assertNotIn('http_server_duration', changes['removed']) + + def test_accumulates_exclusion_counts(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + _, _, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + self.assertEqual(excl_count, 5) # 2 + 3 + + def test_zero_exclusion_count_when_no_exclusion_lines(self): + 
path = _write_tmp(_DIFF_ADDED_ONLY) + try: + _, _, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + self.assertEqual(excl_count, 0) + + def test_raw_diff_sections_populated(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + _, raw_sections, _ = parse_diff_file(path) + finally: + os.unlink(path) + self.assertIn('http_server_duration', raw_sections) + # The section should have at least one - line and one + line + lines = raw_sections['http_server_duration'] + self.assertTrue(any(l.startswith('-') for l in lines)) + self.assertTrue(any(l.startswith('+') for l in lines)) + + def test_added_only_diff(self): + path = _write_tmp(_DIFF_ADDED_ONLY) + try: + changes, _, _ = parse_diff_file(path) + finally: + os.unlink(path) + self.assertIn('new_metric', changes['added']) + self.assertEqual(len(changes['removed']), 0) + self.assertEqual(len(changes['modified']), 0) + + def test_removed_only_diff(self): + path = _write_tmp(_DIFF_REMOVED_ONLY) + try: + changes, _, _ = parse_diff_file(path) + finally: + os.unlink(path) + self.assertIn('old_metric', changes['removed']) + self.assertEqual(len(changes['added']), 0) + self.assertEqual(len(changes['modified']), 0) + + +class TestGenerateDiffSummary(unittest.TestCase): + """Tests for generate_diff_summary().""" + + def test_total_changes_header_present(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, raw_sections, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + summary = generate_diff_summary(changes, raw_sections, excl_count) + self.assertIn('**Total Changes:**', summary) + + def test_added_section_rendered(self): + path = _write_tmp(_DIFF_ADDED_ONLY) + try: + changes, raw_sections, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + summary = generate_diff_summary(changes, raw_sections, excl_count) + self.assertIn('Added Metrics', summary) + self.assertIn('new_metric', summary) + + def test_removed_section_rendered(self): + path = _write_tmp(_DIFF_REMOVED_ONLY) + try: + changes, raw_sections, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + summary = generate_diff_summary(changes, raw_sections, excl_count) + self.assertIn('Removed Metrics', summary) + self.assertIn('old_metric', summary) + + def test_modified_section_rendered(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, raw_sections, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + summary = generate_diff_summary(changes, raw_sections, excl_count) + self.assertIn('Modified Metrics', summary) + self.assertIn('http_server_duration', summary) + + def test_diff_sample_block_present_for_changed_metric(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, raw_sections, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + summary = generate_diff_summary(changes, raw_sections, excl_count) + self.assertIn('```diff', summary) + + def test_exclusion_count_shown(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, raw_sections, excl_count = parse_diff_file(path) + finally: + os.unlink(path) + summary = generate_diff_summary(changes, raw_sections, excl_count) + self.assertIn('Excluded: 5', summary) + + +class TestGenerateStructuredJson(unittest.TestCase): + """Tests for generate_structured_json().""" + + def test_counts_are_correct(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, _, _ = parse_diff_file(path) + finally: + os.unlink(path) + data = generate_structured_json(changes) + 
self.assertEqual(data['added'], 1) # current_only + self.assertEqual(data['removed'], 1) # baseline_only + self.assertEqual(data['modified'], 1) # http_server_duration + + def test_metric_names_list_sorted(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, _, _ = parse_diff_file(path) + finally: + os.unlink(path) + data = generate_structured_json(changes) + names = data['metric_names'] + self.assertEqual(names, sorted(names)) + + def test_metric_names_deduped(self): + path = _write_tmp(_DIFF_WITH_ALL_CATEGORIES) + try: + changes, _, _ = parse_diff_file(path) + finally: + os.unlink(path) + data = generate_structured_json(changes) + self.assertEqual(len(data['metric_names']), len(set(data['metric_names']))) + + def test_added_only_produces_correct_output(self): + path = _write_tmp(_DIFF_ADDED_ONLY) + try: + changes, _, _ = parse_diff_file(path) + finally: + os.unlink(path) + data = generate_structured_json(changes) + self.assertEqual(data['added'], 1) + self.assertEqual(data['removed'], 0) + self.assertEqual(data['modified'], 0) + self.assertIn('new_metric', data['metric_names']) + + +if __name__ == '__main__': + unittest.main()