From 4031562f6b24d1fafada843efa39be20da3e6c84 Mon Sep 17 00:00:00 2001
From: Zander Raycraft <zanderjraycraft@gmail.com>
Date: Sat, 21 Mar 2026 23:21:23 -0500
Subject: [PATCH] Redo auto-labeling using z-scores instead of clustering;
 z-scoring is the preferred method

---
 .github/scripts/triage/embedding_utils.py     | 131 ++++++--
 .github/scripts/triage/sweep.py               | 229 +++++++++++---
 .../scripts/triage/test_embedding_utils.py    | 233 ++++++++++----
 .github/scripts/triage/test_sweep.py          | 286 +++++++++++++++---
 .github/workflows/triage-sweep.yml            |  23 +-
 5 files changed, 727 insertions(+), 175 deletions(-)

diff --git a/.github/scripts/triage/embedding_utils.py b/.github/scripts/triage/embedding_utils.py
index ede06d61cb..a93b063ec5 100644
--- a/.github/scripts/triage/embedding_utils.py
+++ b/.github/scripts/triage/embedding_utils.py
@@ -8,7 +8,6 @@
 import numpy as np
 from numpy.typing import NDArray
 from fastembed import TextEmbedding
-from scipy.stats import chi2
 from sklearn.decomposition import PCA
 from sklearn.covariance import EllipticEnvelope
 from sklearn.metrics.pairwise import cosine_similarity
@@ -78,19 +77,25 @@ def reduce_dimensions(
 
 def detect_outliers(
     matrix: NDArray[np.float32],
-    percentile: float = 0.997,
     contamination: float = 0.1,
+    iqr_multiplier: float = 3.0,
+    max_outlier_pct: float = 0.05,
 ) -> list[tuple[int, float]]:
-    """Flag items whose Mahalanobis distance exceeds a dimension-aware cutoff.
+    """Flag items whose Mahalanobis distance exceeds an IQR-based cutoff.
 
     Uses EllipticEnvelope (robust covariance via MCD) to estimate the
     multivariate Gaussian, then computes sqrt(squared Mahalanobis distance)
-    for each sample. The cutoff is derived from the chi2 distribution with
-    k degrees of freedom (k = number of features), so it scales correctly
-    regardless of dimensionality. Returns (index, distance) tuples sorted
-    by index ascending.
+    for each sample. The cutoff is Q75 + iqr_multiplier * IQR, which
+    adapts to the actual distribution of distances.
+
+    A hard cap ensures no more than max_outlier_pct * n items are flagged;
+    when the cap is hit, only the most extreme items (sorted by distance
+    descending) are kept.
+
+    Returns (index, distance) tuples sorted by index ascending; the cutoff
+    is stored as a `cutoff` attribute on the list (plain [] when n < 2).
     """
-    n, k = matrix.shape
+    n = matrix.shape[0]
     if n < 2:
         return []
 
@@ -100,11 +105,34 @@ def detect_outliers(
     # .mahalanobis() returns squared Mahalanobis distances
     distances = np.sqrt(envelope.mahalanobis(matrix))
 
-    # Dimension-aware cutoff: sqrt(chi2.ppf(percentile, df=k))
-    cutoff = np.sqrt(chi2.ppf(percentile, df=k))
+    # IQR-based cutoff
+    q25, q75 = np.percentile(distances, [25, 75])
+    iqr = q75 - q25
+    cutoff = q75 + iqr_multiplier * iqr
+
     outlier_mask = distances > cutoff
     indices = np.where(outlier_mask)[0]
-    return [(int(idx), float(distances[idx])) for idx in indices]
+
+    # Hard cap: keep at most max_outlier_pct * n items
+    max_count = max(1, int(max_outlier_pct * n))
+    if len(indices) > max_count:
+        # Sort by distance descending, take the most extreme
+        sorted_by_dist = sorted(indices, key=lambda i: distances[i], reverse=True)
+        indices = np.array(sorted_by_dist[:max_count])
+
+    # Sort by index ascending for stable output
+    indices = np.sort(indices)
+    result = [(int(idx), float(distances[idx])) for idx in indices]
+
+    # Attach cutoff as metadata so the report can use it
+    result = _OutlierResult(result)  # type: ignore[assignment]
+    result.cutoff = float(cutoff)  # type: ignore[attr-defined]
+    return result  # type: ignore[return-value]
+
+
+class _OutlierResult(list):
+    """A list subclass that carries metadata (cutoff) from outlier detection."""
+    cutoff: float = 0.0
 
 
 def find_duplicate_pairs(
@@ -134,11 +162,22 @@ def find_duplicate_pairs(
     return pairs
 
 
-# ── Label suggestion via embedding similarity ────────────────────────
+# ── Label suggestion via z-score normalized embedding similarity ──────
+
+# Z-score threshold: a label must be this many standard deviations above
+# the column mean to be considered a match.
+LABEL_Z_THRESHOLD: float = 1.5
 
-# Minimum similarity between an item and a label to suggest it.
-# 0.4 is intentionally permissive — the report is for human review.
-LABEL_SIMILARITY_THRESHOLD: float = 0.4
+# Margin gate: the top-1 label must beat the second-best by this many
+# z-score units, or the item gets no suggestions (later labels need no margin).
+LABEL_Z_MARGIN: float = 0.5
+
+# Floor for per-column standard deviation to avoid division by near-zero.
+LABEL_Z_STD_FLOOR: float = 0.01
+
+# Minimum raw cosine similarity required even if z-score is high.
+# Prevents suggesting labels that are "relatively best" but still poor.
+MIN_RAW_SIMILARITY: float = 0.3
 
 # Maximum number of labels to suggest per item.
 MAX_LABELS_PER_ITEM: int = 3
@@ -148,37 +187,71 @@ def suggest_labels(
     item_embeddings: NDArray[np.float32],
     label_embeddings: NDArray[np.float32],
     label_names: list[str],
-    threshold: float = LABEL_SIMILARITY_THRESHOLD,
+    z_threshold: float = LABEL_Z_THRESHOLD,
+    z_margin: float = LABEL_Z_MARGIN,
+    std_floor: float = LABEL_Z_STD_FLOOR,
+    min_raw_sim: float = MIN_RAW_SIMILARITY,
     max_per_item: int = MAX_LABELS_PER_ITEM,
 ) -> list[list[tuple[str, float]]]:
-    """Suggest labels for each item based on embedding similarity.
+    """Suggest labels for each item using z-score normalized similarity.
 
-    Computes cosine similarity between item embeddings (n, 384) and
-    label embeddings (m, 384). For each item, returns the top-k labels
-    whose similarity exceeds the threshold, sorted by similarity descending.
+    1. Compute raw cosine similarity matrix (n items x m labels).
+    2. Column-wise z-score: for each label j, normalize across all items.
+    3. For each item, rank labels by z-score descending.
+    4. Accept a label only if z >= z_threshold AND raw_sim >= min_raw_sim.
+    5. Margin gate: top-1 must beat #2 by z_margin or the item gets no
+       suggestions; subsequent labels don't need a margin.
+    6. Cap at max_per_item.
 
     Returns a list of length n, where each element is a list of
-    (label_name, similarity) tuples. Empty list if no label exceeds threshold.
+    (label_name, raw_similarity) tuples. Empty list if nothing qualifies.
     """
     n = item_embeddings.shape[0]
     m = label_embeddings.shape[0]
     if n == 0 or m == 0:
         return [[] for _ in range(n)]
 
-    # (n, m) similarity matrix: each row is one item vs all labels
+    # (n, m) raw similarity matrix
     sim_matrix = cosine_similarity(item_embeddings, label_embeddings)
 
+    # Column-wise z-score normalization
+    col_means = sim_matrix.mean(axis=0)  # shape (m,)
+    col_stds = sim_matrix.std(axis=0)    # shape (m,)
+    col_stds = np.maximum(col_stds, std_floor)
+    z_matrix = (sim_matrix - col_means) / col_stds
+
     suggestions: list[list[tuple[str, float]]] = []
     for i in range(n):
-        row = sim_matrix[i]
-        # Indices sorted by similarity descending
-        ranked = np.argsort(row)[::-1]
+        z_row = z_matrix[i]
+        raw_row = sim_matrix[i]
+
+        # Rank labels by z-score descending
+        ranked = np.argsort(z_row)[::-1]
+
         item_labels: list[tuple[str, float]] = []
-        for idx in ranked[:max_per_item]:
-            score = float(row[idx])
-            if score < threshold:
+
+        # Margin gate: top-1 z-score must beat #2 by z_margin.
+        # If not, the assignment is ambiguous — skip this item entirely.
+        if len(ranked) > 1:
+            top1_z = float(z_row[ranked[0]])
+            top2_z = float(z_row[ranked[1]])
+            if top1_z - top2_z < z_margin:
+                suggestions.append(item_labels)
+                continue
+
+        for rank_pos, idx in enumerate(ranked):
+            if len(item_labels) >= max_per_item:
                 break
-            item_labels.append((label_names[idx], score))
+
+            z_val = float(z_row[idx])
+            raw_val = float(raw_row[idx])
+
+            # Must pass both z-threshold and raw similarity floor
+            if z_val < z_threshold or raw_val < min_raw_sim:
+                continue
+
+            item_labels.append((label_names[idx], raw_val))
+
         suggestions.append(item_labels)
 
     return suggestions
diff --git a/.github/scripts/triage/sweep.py b/.github/scripts/triage/sweep.py
index 8e0f70d6d7..97892300d1 100644
--- a/.github/scripts/triage/sweep.py
+++ b/.github/scripts/triage/sweep.py
@@ -21,13 +21,20 @@
     detect_outliers,
     find_duplicate_pairs,
     suggest_labels,
+    LABEL_Z_THRESHOLD,
+    LABEL_Z_MARGIN,
+    LABEL_Z_STD_FLOOR,
+    MIN_RAW_SIMILARITY,
+    MAX_LABELS_PER_ITEM,
 )
 
 # ── Thresholds (overridable via workflow_dispatch inputs) ──────────────
 
-# Chi2 percentile for the dimension-aware outlier cutoff.
-# 0.997 is the multivariate equivalent of the 3-sigma rule.
-OUTLIER_PERCENTILE: float = float(os.environ.get("INPUT_OUTLIER_PERCENTILE", "0.997"))
+# IQR multiplier for outlier cutoff: cutoff = Q75 + IQR_MULTIPLIER * IQR.
+IQR_MULTIPLIER: float = float(os.environ.get("INPUT_IQR_MULTIPLIER", "3.0"))
+
+# Hard cap: at most this fraction of items can be flagged as outliers.
+MAX_OUTLIER_PCT: float = float(os.environ.get("INPUT_MAX_OUTLIER_PCT", "0.05"))
 
 # EllipticEnvelope contamination: expected fraction of outliers in the data.
 # Governs how aggressively the robust covariance downweights extreme points.
@@ -48,7 +55,7 @@
 # Minimum number of samples required for EllipticEnvelope to fit
 # a Gaussian reliably. Must be >= 3 * PCA_MAX_COMPONENTS so the
 # covariance matrix is estimated from enough data points.
-PCA_MAX_COMPONENTS: int = 33
+PCA_MAX_COMPONENTS: int = 20
 MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 100
 
 # Max character length for embedding input text. bge-small-en-v1.5 has a
@@ -211,6 +218,40 @@ def apply_labels_to_item(item_number: int, labels: list[str]) -> None:
         print(f"::warning::Failed to label #{item_number}: {e.code} {body}")
 
 
+def _item_age(created_at: str) -> str:
+    """Compute a human-readable age string from an ISO 8601 created_at timestamp."""
+    try:
+        created = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
+        delta = datetime.now(timezone.utc) - created
+        days = delta.days
+        if days < 1:
+            return "<1d"
+        if days < 30:
+            return f"{days}d"
+        if days < 365:
+            return f"{days // 30}mo"
+        return f"{days // 365}y"
+    except (ValueError, TypeError):
+        return "?"
+
+
+def _suggested_action(a: TriageItem, b: TriageItem) -> str:
+    """Determine a suggested action for a duplicate pair based on types and age."""
+    if a["is_pr"] and b["is_pr"]:
+        return "Review for overlap"
+    if not a["is_pr"] and not b["is_pr"]:
+        # Both issues — close the newer one
+        try:
+            a_dt = datetime.fromisoformat(a["created_at"].replace("Z", "+00:00"))
+            b_dt = datetime.fromisoformat(b["created_at"].replace("Z", "+00:00"))
+            newer = b if b_dt > a_dt else a
+        except (ValueError, TypeError):
+            newer = b
+        return f"Close #{newer['number']} as duplicate"
+    # One issue, one PR
+    return "Link PR to issue"
+
+
 def generate_report(
     items: list[TriageItem],
     outlier_results: list[tuple[int, float]],
@@ -221,33 +262,89 @@ def generate_report(
     now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
     repo = os.environ.get("GITHUB_REPOSITORY", "unknown/repo")
 
+    # Compute label suggestion counts early for the health table
+    outlier_set = {idx for idx, _ in outlier_results}
+    suggested_count = 0
+    if label_suggestions is not None:
+        suggested_count = sum(
+            1 for i, s in enumerate(label_suggestions)
+            if s and not items[i]["labels"] and i not in outlier_set
+        )
+
+    # ── Health summary table at the top ──────────────────────────────
     lines: list[str] = [
         "## Triage Sweep Report",
         "",
         f"**Run:** {now} UTC",
         f"**Items analyzed:** {len(items)}",
-        f"**Thresholds:** Outlier percentile {OUTLIER_PERCENTILE}, Cosine > {COSINE_THRESHOLD}",
+        f"**Thresholds:** IQR multiplier {IQR_MULTIPLIER}, Cosine > {COSINE_THRESHOLD}",
+        "",
+        "### Health Summary",
         "",
+        "| Metric | Value |",
+        "|--------|-------|",
+        f"| Items analyzed | {len(items)} |",
+        f"| Outliers flagged | {len(outlier_results)} |",
+        f"| Duplicate pairs | {len(duplicate_pairs)} |",
+        f"| Label suggestions | {suggested_count} |",
+        "",
+    ]
+
+    # ── Outlier section ──────────────────────────────────────────────
+    # Determine cutoff for high-confidence split
+    cutoff = getattr(outlier_results, "cutoff", 0.0)
+    high_conf_cutoff = 2 * cutoff if cutoff > 0 else float("inf")
+
+    high_conf = [(idx, d) for idx, d in outlier_results if d > high_conf_cutoff]
+    borderline = [(idx, d) for idx, d in outlier_results if d <= high_conf_cutoff]
+
+    lines.extend([
         f"### Potential Outliers / Spam ({len(outlier_results)})",
         "",
         "Items with unusually high Mahalanobis distance from the distribution center.",
         "These may be spam, off-topic, or poorly described.",
         "",
-    ]
+    ])
 
-    if outlier_results:
-        lines.append("| # | Type | Title | Distance |")
-        lines.append("|---|------|-------|----------|")
-        for idx, distance in outlier_results:
+    if high_conf:
+        lines.append(f"**High Confidence** ({len(high_conf)} items, distance > 2x cutoff)")
+        lines.append("")
+        lines.append("| # | Type | Title | Distance | Age |")
+        lines.append("|---|------|-------|----------|-----|")
+        for idx, distance in high_conf:
             item = items[idx]
             kind = "PR" if item["is_pr"] else "Issue"
+            age = _item_age(item["created_at"])
+            title = item["title"][:80] + ("..." if len(item["title"]) > 80 else "")
             lines.append(
                 f"| [#{item['number']}]({item['html_url']}) "
-                f"| {kind} | {item['title']} | {distance:.2f} |"
+                f"| {kind} | {title} | {distance:.2f} | {age} |"
             )
-    else:
+        lines.append("")
+
+    if borderline:
+        lines.append("<details>")
+        lines.append(f"<summary>Borderline ({len(borderline)} items)</summary>")
+        lines.append("")
+        lines.append("| # | Type | Title | Distance | Age |")
+        lines.append("|---|------|-------|----------|-----|")
+        for idx, distance in borderline:
+            item = items[idx]
+            kind = "PR" if item["is_pr"] else "Issue"
+            age = _item_age(item["created_at"])
+            title = item["title"][:80] + ("..." if len(item["title"]) > 80 else "")
+            lines.append(
+                f"| [#{item['number']}]({item['html_url']}) "
+                f"| {kind} | {title} | {distance:.2f} | {age} |"
+            )
+        lines.append("")
+        lines.append("</details>")
+        lines.append("")
+
+    if not outlier_results:
         lines.append("None found.")
 
+    # ── Duplicate pairs section ──────────────────────────────────────
     lines.extend([
         "",
         f"### Potential Duplicates ({len(duplicate_pairs)} pairs)",
@@ -257,43 +354,82 @@ def generate_report(
     ])
 
     if duplicate_pairs:
-        lines.append("| Item A | Item B | Similarity |")
-        lines.append("|--------|--------|------------|")
+        lines.append("| Item A | Item B | Similarity | Suggested Action |")
+        lines.append("|--------|--------|------------|------------------|")
         for i, j, sim in duplicate_pairs:
             a = items[i]
             b = items[j]
             kind_a = "PR" if a["is_pr"] else "Issue"
             kind_b = "PR" if b["is_pr"] else "Issue"
+            action = _suggested_action(a, b)
             lines.append(
                 f"| [#{a['number']}]({a['html_url']}) {kind_a}: {a['title']} "
                 f"| [#{b['number']}]({b['html_url']}) {kind_b}: {b['title']} "
-                f"| {sim:.3f} |"
+                f"| {sim:.3f} | {action} |"
             )
     else:
         lines.append("None found.")
 
     # ── Label suggestions section ────────────────────────────────────
-    outlier_set = {idx for idx, _ in outlier_results}
     if label_suggestions is not None:
-        # Only unlabeled, non-outlier items — spam shouldn't get categorized
-        # Show only the top-1 label to match what actually gets applied
-        items_with_suggestions = [
-            (i, sugs[:1]) for i, sugs in enumerate(label_suggestions)
-            if sugs and not items[i]["labels"] and i not in outlier_set
-        ]
+        # High confidence: top-1 label with raw_sim >= 0.5
+        # Low confidence: top-1 label with raw_sim < 0.5
+        high_conf_labels: list[tuple[int, list[tuple[str, float]]]] = []
+        low_conf_labels: list[tuple[int, list[tuple[str, float]]]] = []
+        for i, sugs in enumerate(label_suggestions):
+            if sugs and not items[i]["labels"] and i not in outlier_set:
+                top1 = sugs[:1]
+                if top1[0][1] >= 0.5:
+                    high_conf_labels.append((i, top1))
+                else:
+                    low_conf_labels.append((i, top1))
+
+        total_suggestions = len(high_conf_labels) + len(low_conf_labels)
         lines.extend([
             "",
-            f"### Suggested Labels ({len(items_with_suggestions)} unlabeled items)",
+            f"### Suggested Labels ({total_suggestions} unlabeled items)",
             "",
-            "Labels suggested by embedding similarity against repo label descriptions.",
+            "Labels suggested by z-score normalized embedding similarity against repo label descriptions.",
             "Only shown for unlabeled items that were not flagged as outliers.",
             "",
         ])
 
-        if items_with_suggestions:
+        # Label concentration warning
+        if total_suggestions > 0:
+            label_counts: dict[str, int] = {}
+            for _, sugs in high_conf_labels + low_conf_labels:
+                for name, _ in sugs:
+                    label_counts[name] = label_counts.get(name, 0) + 1
+            for name, count in label_counts.items():
+                if count > total_suggestions * 0.5:
+                    lines.append(
+                        f"> **Warning:** Label `{name}` accounts for "
+                        f"{count}/{total_suggestions} suggestions "
+                        f"({count * 100 // total_suggestions}%). "
+                        f"Consider reviewing label descriptions for specificity."
+                    )
+                    lines.append("")
+
+        if high_conf_labels:
+            lines.append("| # | Type | Title | Suggested Label |")
+            lines.append("|---|------|-------|--------------------|")
+            for idx, sugs in high_conf_labels:
+                item = items[idx]
+                kind = "PR" if item["is_pr"] else "Issue"
+                label_strs = [f"`{name}` ({score:.2f})" for name, score in sugs]
+                lines.append(
+                    f"| [#{item['number']}]({item['html_url']}) "
+                    f"| {kind} | {item['title']} | {', '.join(label_strs)} |"
+                )
+
+        if low_conf_labels:
+            lines.append("")
+            lines.append("<details>")
+            lines.append(f"<summary>Low-confidence suggestions ({len(low_conf_labels)} items)</summary>")
+            lines.append("")
             lines.append("| # | Type | Title | Suggested Label |")
             lines.append("|---|------|-------|--------------------|")
-            for idx, sugs in items_with_suggestions:
+            for idx, sugs in low_conf_labels:
                 item = items[idx]
                 kind = "PR" if item["is_pr"] else "Issue"
                 label_strs = [f"`{name}` ({score:.2f})" for name, score in sugs]
@@ -301,7 +437,10 @@ def generate_report(
                     f"| [#{item['number']}]({item['html_url']}) "
                     f"| {kind} | {item['title']} | {', '.join(label_strs)} |"
                 )
-        else:
+            lines.append("")
+            lines.append("</details>")
+
+        if not high_conf_labels and not low_conf_labels:
             lines.append("No unlabeled items need suggestions.")
 
     lines.extend([
@@ -314,11 +453,7 @@ def generate_report(
     ])
 
     if label_suggestions is not None:
-        applied = sum(
-            1 for i, s in enumerate(label_suggestions)
-            if s and not items[i]["labels"] and i not in outlier_set
-        )
-        lines.append(f"- {applied} items suggested for labeling")
+        lines.append(f"- {suggested_count} items suggested for labeling")
 
     lines.extend([
         "",
@@ -404,7 +539,10 @@ def main() -> None:
     if len(items) >= MIN_SAMPLES_FOR_OUTLIER_DETECTION:
         reduced = reduce_dimensions(embeddings, PCA_MAX_COMPONENTS)
         outlier_results = detect_outliers(
-            reduced, percentile=OUTLIER_PERCENTILE, contamination=CONTAMINATION,
+            reduced,
+            contamination=CONTAMINATION,
+            iqr_multiplier=IQR_MULTIPLIER,
+            max_outlier_pct=MAX_OUTLIER_PCT,
         )
     else:
         print(
@@ -426,17 +564,20 @@ def main() -> None:
         label_suggestions = suggest_labels(embeddings, label_embeddings, label_names)
         print(f"Computed label suggestions against {len(repo_labels)} repo labels")
 
-        # Apply top label to unlabeled items (unless dry run)
-        # Skip outliers — flagged items shouldn't get categorized
-        outlier_set = {idx for idx, _ in outlier_results}
-        if not DRY_RUN:
-            applied_count = 0
-            for i, sugs in enumerate(label_suggestions):
-                if sugs and not items[i]["labels"] and i not in outlier_set:
-                    # Apply only the top-1 label (highest confidence)
-                    apply_labels_to_item(items[i]["number"], [sugs[0][0]])
-                    applied_count += 1
-            print(f"Applied labels to {applied_count} unlabeled items")
+        # NOTE: Auto-labeling is disabled. The report shows suggestions for
+        # human review. To re-enable, uncomment the block below.
+        #
+        # # Apply top label to unlabeled items (unless dry run)
+        # # Skip outliers — flagged items shouldn't get categorized
+        # outlier_set = {idx for idx, _ in outlier_results}
+        # if not DRY_RUN:
+        #     applied_count = 0
+        #     for i, sugs in enumerate(label_suggestions):
+        #         if sugs and not items[i]["labels"] and i not in outlier_set:
+        #             # Apply only the top-1 label (highest confidence)
+        #             apply_labels_to_item(items[i]["number"], [sugs[0][0]])
+        #             applied_count += 1
+        #     print(f"Applied labels to {applied_count} unlabeled items")
     else:
         print("No repo labels found — skipping label suggestions")
 
diff --git a/.github/scripts/triage/test_embedding_utils.py b/.github/scripts/triage/test_embedding_utils.py
index 6fbfe83afa..f4a723c6f9 100644
--- a/.github/scripts/triage/test_embedding_utils.py
+++ b/.github/scripts/triage/test_embedding_utils.py
@@ -20,7 +20,10 @@
     EMBEDDING_DIM,
     EMBEDDING_MODEL,
     EMBEDDING_BATCH_SIZE,
-    LABEL_SIMILARITY_THRESHOLD,
+    LABEL_Z_THRESHOLD,
+    LABEL_Z_MARGIN,
+    LABEL_Z_STD_FLOOR,
+    MIN_RAW_SIMILARITY,
     MAX_LABELS_PER_ITEM,
 )
 
@@ -139,17 +142,17 @@ def test_max_components_respected(self):
 
 
 class TestDetectOutliers:
-    """Tests for Mahalanobis-based outlier detection."""
+    """Tests for IQR-based outlier detection."""
 
     def test_single_sample_returns_empty(self):
         m = np.random.randn(1, 5).astype(np.float32)
-        result = detect_outliers(m, percentile=0.997)
+        result = detect_outliers(m)
         assert result == []
 
     def test_empty_returns_empty(self):
         # n < 2 case
         m = np.empty((0, 5), dtype=np.float32)
-        result = detect_outliers(m, percentile=0.997)
+        result = detect_outliers(m)
         assert result == []
 
     def test_finds_outliers_in_synthetic_data(self):
@@ -158,7 +161,7 @@ def test_finds_outliers_in_synthetic_data(self):
         cluster = rng.standard_normal((50, 3)).astype(np.float32) * 0.1
         outlier = np.array([[100.0, 100.0, 100.0]], dtype=np.float32)
         m = np.vstack([cluster, outlier])
-        result = detect_outliers(m, percentile=0.997)
+        result = detect_outliers(m)
         # The outlier (index 50) should be detected
         outlier_indices = [idx for idx, _ in result]
         assert 50 in outlier_indices
@@ -169,7 +172,7 @@ def test_returns_list_of_index_distance_tuples(self):
         cluster = rng.standard_normal((20, 3)).astype(np.float32) * 0.1
         far_point = np.array([[50.0, 50.0, 50.0]], dtype=np.float32)
         m = np.vstack([cluster, far_point])
-        result = detect_outliers(m, percentile=0.997)
+        result = detect_outliers(m)
         assert isinstance(result, list)
         for item in result:
             assert isinstance(item, tuple)
@@ -179,20 +182,21 @@ def test_returns_list_of_index_distance_tuples(self):
             assert isinstance(dist, float)
             assert dist > 0
 
-    def test_low_percentile_flags_more(self):
+    def test_iqr_cutoff_behavior(self):
+        """Lower IQR multiplier should flag more items than higher multiplier."""
         rng = np.random.default_rng(42)
-        m = rng.standard_normal((30, 3)).astype(np.float32)
-        low = detect_outliers(m, percentile=0.5)
-        high = detect_outliers(m, percentile=0.999)
+        m = rng.standard_normal((100, 3)).astype(np.float32)
+        low = detect_outliers(m, iqr_multiplier=1.0, max_outlier_pct=0.5)
+        high = detect_outliers(m, iqr_multiplier=5.0, max_outlier_pct=0.5)
         assert len(low) >= len(high)
 
-    def test_dimension_aware_cutoff(self):
-        """High-dimensional data should not flag everything with default percentile."""
+    def test_dimension_aware_no_mass_flagging(self):
+        """High-dimensional clean Gaussian data should not flag everything."""
         rng = np.random.default_rng(42)
         # 500 samples, 10 dims — well-conditioned for robust covariance
         m = rng.standard_normal((500, 10)).astype(np.float32)
-        result = detect_outliers(m, percentile=0.997)
-        # With a proper dimension-aware cutoff on clean Gaussian data,
+        result = detect_outliers(m)
+        # With IQR-based cutoff on clean Gaussian data,
         # only a small fraction should be flagged (well under 50%)
         assert len(result) < 250
 
@@ -200,9 +204,48 @@ def test_contamination_parameter(self):
         rng = np.random.default_rng(42)
         m = rng.standard_normal((50, 3)).astype(np.float32)
         # Should not raise with different contamination values
-        result = detect_outliers(m, percentile=0.997, contamination=0.05)
+        result = detect_outliers(m, contamination=0.05)
         assert isinstance(result, list)
 
+    def test_max_outlier_pct_hard_cap(self):
+        """The hard cap should limit outlier count to max_outlier_pct * n."""
+        rng = np.random.default_rng(42)
+        # Create data with many potential outliers (tight cluster + wide scatter)
+        cluster = rng.standard_normal((80, 3)).astype(np.float32) * 0.1
+        outliers = rng.standard_normal((20, 3)).astype(np.float32) * 50.0
+        m = np.vstack([cluster, outliers])
+        # Very low IQR multiplier to flag a lot, but cap at 5%
+        result = detect_outliers(m, iqr_multiplier=0.5, max_outlier_pct=0.05)
+        max_allowed = max(1, int(0.05 * 100))  # 5
+        assert len(result) <= max_allowed
+
+    def test_hard_cap_keeps_most_extreme(self):
+        """When capped, the most extreme items (highest distance) should be kept."""
+        rng = np.random.default_rng(42)
+        cluster = rng.standard_normal((90, 3)).astype(np.float32) * 0.1
+        # Create outliers with increasing extremity
+        outliers = np.array([
+            [10.0, 10.0, 10.0],
+            [20.0, 20.0, 20.0],
+            [50.0, 50.0, 50.0],
+        ], dtype=np.float32)
+        m = np.vstack([cluster, outliers])
+        # Cap at 1 item: int(0.02 * 93) = 1
+        result = detect_outliers(m, iqr_multiplier=0.5, max_outlier_pct=0.02)
+        if len(result) > 0:
+            # The most extreme (index 92, distance for [50,50,50]) should be kept
+            indices = [idx for idx, _ in result]
+            assert 92 in indices
+
+    def test_cutoff_attribute(self):
+        """Returned result should carry a cutoff attribute."""
+        rng = np.random.default_rng(42)
+        m = rng.standard_normal((50, 3)).astype(np.float32)
+        result = detect_outliers(m)
+        assert hasattr(result, "cutoff")
+        assert isinstance(result.cutoff, float)
+        assert result.cutoff > 0
+
 
 class TestFindDuplicatePairs:
     """Tests for cosine similarity duplicate detection."""
@@ -265,7 +308,7 @@ def test_high_threshold_fewer_pairs(self):
 
 
 class TestSuggestLabels:
-    """Tests for embedding-based label suggestion."""
+    """Tests for z-score normalized label suggestion."""
 
     def test_empty_items_returns_empty_lists(self):
         items = np.empty((0, 10), dtype=np.float32)
@@ -281,57 +324,145 @@ def test_empty_labels_returns_empty_per_item(self):
         assert all(s == [] for s in result)
 
     def test_identical_embedding_gets_that_label(self):
-        """If an item embedding equals a label embedding, it should suggest that label."""
-        vec = np.array([1.0, 0.0, 0.0], dtype=np.float32)
-        items = np.array([vec], dtype=np.float32)
-        labels = np.array([vec, [0, 1, 0], [0, 0, 1]], dtype=np.float32)
-        result = suggest_labels(items, labels, ["bug", "feature", "docs"], threshold=0.5)
-        assert len(result) == 1
-        assert result[0][0][0] == "bug"
-        assert result[0][0][1] > 0.99
-
-    def test_threshold_filters_low_similarity(self):
-        """With a high threshold, orthogonal vectors should get no suggestions."""
-        items = np.eye(3, dtype=np.float32)
-        labels = np.eye(3, dtype=np.float32)
-        # threshold=0.99 means only near-exact matches
-        result = suggest_labels(items, labels, ["a", "b", "c"], threshold=0.99)
-        # Each item should match exactly one label (itself)
+        """If an item embedding strongly matches one label, z-score should highlight it."""
+        # Create multiple items so z-score normalization is meaningful
+        rng = np.random.default_rng(42)
+        # 10 random items + 1 item that matches label "bug" exactly
+        random_items = rng.standard_normal((10, 3)).astype(np.float32)
+        bug_vec = np.array([[1.0, 0.0, 0.0]], dtype=np.float32)
+        items = np.vstack([random_items, bug_vec])
+        labels = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], dtype=np.float32)
+        result = suggest_labels(
+            items, labels, ["bug", "feature", "docs"],
+            z_threshold=0.5, z_margin=0.0, min_raw_sim=0.1,
+        )
+        # The last item (matching bug_vec) should get "bug" as top suggestion
+        last_item_sugs = result[-1]
+        if last_item_sugs:
+            assert last_item_sugs[0][0] == "bug"
+
+    def test_z_score_suppresses_dominant_label(self):
+        """When all items are similar to one label, z-scores should be low
+        (none stands out) and that label should not be blindly suggested."""
+        # All items identical — z-score for every item on every label is 0
+        items = np.ones((10, 3), dtype=np.float32)
+        labels = np.array([[1.0, 1.0, 1.0], [0.0, 1.0, 0.0]], dtype=np.float32)
+        result = suggest_labels(
+            items, labels, ["catch-all", "specific"],
+            z_threshold=1.5, min_raw_sim=0.3,
+        )
+        # With identical items, std=0 -> z-scores are all 0 -> nothing passes z_threshold
+        for sugs in result:
+            assert sugs == []
+
+    def test_margin_gate_blocks_top1(self):
+        """Top-1 label must beat #2 by z_margin to be accepted as position 0."""
+        rng = np.random.default_rng(99)
+        # 20 items, each slightly different, 2 labels
+        items = rng.standard_normal((20, 5)).astype(np.float32)
+        # Two labels that are nearly identical -> margin gate should block top-1
+        labels = np.array([[1.0, 0.5, 0.0, 0.0, 0.0],
+                           [1.0, 0.5, 0.01, 0.0, 0.0]], dtype=np.float32)
+        result = suggest_labels(
+            items, labels, ["label-a", "label-b"],
+            z_threshold=0.0, z_margin=10.0, min_raw_sim=0.0, max_per_item=1,
+        )
+        # With a huge margin requirement and max_per_item=1, nothing should pass
+        # because the only candidate (top-1) is blocked by margin gate,
+        # and max_per_item=1 prevents falling through to position 2
         for sugs in result:
-            assert len(sugs) == 1
+            assert sugs == []
+
+    def test_margin_gate_passes_when_clear_winner(self):
+        """When top-1 clearly beats #2, it should pass the margin gate."""
+        # Create items where one strongly matches label 0 vs label 1
+        items = np.array([
+            [1.0, 0.0, 0.0, 0.0, 0.0],   # strongly matches label-a
+            [0.0, 0.0, 0.0, 0.0, 1.0],   # matches neither well
+        ] * 5, dtype=np.float32)  # 10 items for stable z-scores
+        labels = np.array([
+            [1.0, 0.0, 0.0, 0.0, 0.0],   # label-a
+            [0.0, 1.0, 0.0, 0.0, 0.0],   # label-b (orthogonal)
+        ], dtype=np.float32)
+        result = suggest_labels(
+            items, labels, ["label-a", "label-b"],
+            z_threshold=0.5, z_margin=0.3, min_raw_sim=0.1,
+        )
+        # Items matching label-a should get it suggested (clear z-score advantage)
+        got_label_a = sum(1 for sugs in result if sugs and sugs[0][0] == "label-a")
+        assert got_label_a > 0
+
+    def test_min_raw_similarity_filter(self):
+        """Raw similarity below min_raw_sim should be filtered out, even with z_threshold and z_margin at 0."""
+        # The single item is orthogonal to the single label -> raw cosine similarity is 0
+        items = np.array([[1.0, 0.0, 0.0]], dtype=np.float32)
+        labels = np.array([[0.0, 0.0, 1.0]], dtype=np.float32)
+        result = suggest_labels(
+            items, labels, ["irrelevant"],
+            z_threshold=0.0, z_margin=0.0, min_raw_sim=0.9,
+        )
+        # Raw similarity is 0, which is below min_raw_sim=0.9
+        assert result[0] == []
 
     def test_max_per_item_respected(self):
-        """Even if all labels are similar, max_per_item caps the results."""
+        """Even if many labels qualify, max_per_item caps the results."""
         rng = np.random.default_rng(42)
-        base = rng.standard_normal(10).astype(np.float32)
-        items = np.array([base])
-        # All labels very similar to item
+        # Create items with some variance so z-scores differentiate
+        items = rng.standard_normal((20, 10)).astype(np.float32)
+        base = items[0]
+        # All labels very similar to item 0
         labels = np.array([base + rng.standard_normal(10) * 0.01 for _ in range(10)])
         names = [f"label-{i}" for i in range(10)]
-        result = suggest_labels(items, labels, names, threshold=0.1, max_per_item=2)
-        assert len(result[0]) <= 2
+        result = suggest_labels(
+            items, labels, names,
+            z_threshold=0.0, z_margin=0.0, min_raw_sim=0.0, max_per_item=2,
+        )
+        for sugs in result:
+            assert len(sugs) <= 2
 
-    def test_returns_sorted_by_similarity_descending(self):
-        """Suggestions should be ordered highest similarity first."""
-        items = np.array([[1.0, 0.5, 0.0]], dtype=np.float32)
-        labels = np.array([
-            [1.0, 0.0, 0.0],  # decent match
-            [1.0, 0.5, 0.0],  # exact match
-            [0.0, 0.0, 1.0],  # poor match
-        ], dtype=np.float32)
-        result = suggest_labels(items, labels, ["a", "b", "c"], threshold=0.1)
-        scores = [s for _, s in result[0]]
-        assert scores == sorted(scores, reverse=True)
+    def test_returns_raw_similarity_not_z_score(self):
+        """Returned scores should be raw cosine similarity, not z-scores."""
+        rng = np.random.default_rng(42)
+        items = rng.standard_normal((15, 5)).astype(np.float32)
+        labels = rng.standard_normal((3, 5)).astype(np.float32)
+        names = ["bug", "feature", "docs"]
+        result = suggest_labels(
+            items, labels, names,
+            z_threshold=0.0, z_margin=0.0, min_raw_sim=-1.0,
+        )
+        # Raw cosine similarity should be in [-1, 1] range
+        for sugs in result:
+            for name, score in sugs:
+                assert -1.0 <= score <= 1.0 + 1e-5
+                assert isinstance(name, str)
+                assert isinstance(score, float)
 
     def test_returns_correct_format(self):
         rng = np.random.default_rng(42)
         items = rng.standard_normal((3, 10)).astype(np.float32)
         labels = rng.standard_normal((5, 10)).astype(np.float32)
         names = ["bug", "feature", "docs", "ci", "test"]
-        result = suggest_labels(items, labels, names, threshold=0.0)
+        result = suggest_labels(
+            items, labels, names,
+            z_threshold=0.0, z_margin=0.0, min_raw_sim=-1.0,
+        )
         assert len(result) == 3
         for sugs in result:
             for name, score in sugs:
                 assert isinstance(name, str)
                 assert isinstance(score, float)
                 assert name in names
+
+    def test_text_truncation_in_labels(self):
+        """Very long label names should be returned as-is, i.e. NOT truncated."""
+        rng = np.random.default_rng(42)
+        items = rng.standard_normal((10, 5)).astype(np.float32)
+        long_name = "a" * 200
+        labels = rng.standard_normal((1, 5)).astype(np.float32)
+        result = suggest_labels(
+            items, labels, [long_name],
+            z_threshold=0.0, z_margin=0.0, min_raw_sim=-1.0,
+        )
+        for sugs in result:
+            if sugs:
+                assert sugs[0][0] == long_name
diff --git a/.github/scripts/triage/test_sweep.py b/.github/scripts/triage/test_sweep.py
index feded34fa0..39c8bef3af 100644
--- a/.github/scripts/triage/test_sweep.py
+++ b/.github/scripts/triage/test_sweep.py
@@ -35,18 +35,23 @@
     MIN_SAMPLES_FOR_OUTLIER_DETECTION,
     PCA_MAX_COMPONENTS,
     MAX_EMBED_CHARS,
+    IQR_MULTIPLIER,
+    MAX_OUTLIER_PCT,
+    _item_age,
+    _suggested_action,
 )
 
 
 def _make_api_issue(number: int, title: str = "Test issue", is_pr: bool = False,
-                    body: str = "Issue body", labels: list[str] | None = None) -> dict:
+                    body: str = "Issue body", labels: list[str] | None = None,
+                    created_at: str = "2026-03-21T00:00:00Z") -> dict:
     """Helper to build a mock GitHub API issue response object."""
     result: dict = {
         "number": number,
         "title": title,
         "html_url": f"https://github.com/owner/repo/issues/{number}",
         "body": body,
-        "created_at": "2026-03-21T00:00:00Z",
+        "created_at": created_at,
         "labels": [{"name": lbl} for lbl in (labels or [])],
     }
     if is_pr:
@@ -94,6 +99,15 @@ def test_min_samples_is_at_least_3x_pca_max(self):
     def test_min_samples_is_100(self):
         assert MIN_SAMPLES_FOR_OUTLIER_DETECTION == 100
 
+    def test_pca_max_components_is_20(self):
+        assert PCA_MAX_COMPONENTS == 20
+
+    def test_iqr_multiplier_default(self):
+        assert IQR_MULTIPLIER == 3.0
+
+    def test_max_outlier_pct_default(self):
+        assert MAX_OUTLIER_PCT == 0.05
+
 
 class TestFetchAllOpenItems:
     """Tests for fetch_all_open_items."""
@@ -179,6 +193,71 @@ def test_pagination(self, mock_get):
         assert mock_get.call_count == 2
 
 
+class TestItemAge:
+    """Tests for _item_age helper."""
+
+    def test_recent_item(self):
+        from datetime import datetime, timezone, timedelta
+        recent = (datetime.now(timezone.utc) - timedelta(hours=12)).isoformat()
+        assert _item_age(recent) == "<1d"
+
+    def test_days_old(self):
+        from datetime import datetime, timezone, timedelta
+        old = (datetime.now(timezone.utc) - timedelta(days=15)).isoformat()
+        assert _item_age(old) == "15d"
+
+    def test_months_old(self):
+        from datetime import datetime, timezone, timedelta
+        old = (datetime.now(timezone.utc) - timedelta(days=90)).isoformat()
+        assert _item_age(old) == "3mo"
+
+    def test_years_old(self):
+        from datetime import datetime, timezone, timedelta
+        old = (datetime.now(timezone.utc) - timedelta(days=400)).isoformat()
+        assert _item_age(old) == "1y"
+
+    def test_invalid_date(self):
+        assert _item_age("not-a-date") == "?"
+
+
+class TestSuggestedAction:
+    """Tests for _suggested_action helper."""
+
+    def test_both_issues_close_newer(self):
+        a = TriageItem(
+            number=1, title="A", html_url="u", is_pr=False, labels=[],
+            created_at="2026-01-01T00:00:00Z", text="t",
+        )
+        b = TriageItem(
+            number=2, title="B", html_url="u", is_pr=False, labels=[],
+            created_at="2026-02-01T00:00:00Z", text="t",
+        )
+        result = _suggested_action(a, b)
+        assert "Close #2 as duplicate" in result
+
+    def test_both_prs_review(self):
+        a = TriageItem(
+            number=1, title="A", html_url="u", is_pr=True, labels=[],
+            created_at="2026-01-01T00:00:00Z", text="t",
+        )
+        b = TriageItem(
+            number=2, title="B", html_url="u", is_pr=True, labels=[],
+            created_at="2026-01-01T00:00:00Z", text="t",
+        )
+        assert _suggested_action(a, b) == "Review for overlap"
+
+    def test_issue_pr_link(self):
+        a = TriageItem(
+            number=1, title="A", html_url="u", is_pr=False, labels=[],
+            created_at="2026-01-01T00:00:00Z", text="t",
+        )
+        b = TriageItem(
+            number=2, title="B", html_url="u", is_pr=True, labels=[],
+            created_at="2026-01-01T00:00:00Z", text="t",
+        )
+        assert _suggested_action(a, b) == "Link PR to issue"
+
+
 class TestGenerateReport:
     """Tests for the markdown report generator."""
 
@@ -186,7 +265,7 @@ def test_no_findings(self):
         items = [
             TriageItem(
                 number=1, title="Test", html_url="https://example.com/1",
-                is_pr=False, labels=[], created_at="2026-01-01", text="Test",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="Test",
             ),
         ]
         report = generate_report(items, [], [])
@@ -196,15 +275,39 @@ def test_no_findings(self):
         assert "0 outliers flagged" in report
         assert "0 duplicate pairs found" in report
 
-    def test_with_outliers_shows_distance(self):
+    def test_health_summary_table(self):
+        items = [
+            TriageItem(
+                number=1, title="Test", html_url="https://example.com/1",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="Test",
+            ),
+        ]
+        report = generate_report(items, [], [])
+        assert "### Health Summary" in report
+        assert "| Metric | Value |" in report
+        assert "| Items analyzed | 1 |" in report
+
+    def test_iqr_multiplier_in_thresholds(self):
+        """Report should show IQR multiplier, not percentile."""
+        items = [
+            TriageItem(
+                number=1, title="Test", html_url="https://example.com/1",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="Test",
+            ),
+        ]
+        report = generate_report(items, [], [])
+        assert "IQR multiplier" in report
+        assert "percentile" not in report.lower().split("thresholds")[0]  # NOTE(review): [0] is the text BEFORE the first "thresholds", not the thresholds line itself - confirm intent
+
+    def test_with_outliers_shows_distance_and_age(self):
         items = [
             TriageItem(
                 number=10, title="Spam Issue", html_url="https://example.com/10",
-                is_pr=False, labels=[], created_at="2026-01-01", text="spam",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam",
             ),
             TriageItem(
                 number=20, title="Good Issue", html_url="https://example.com/20",
-                is_pr=False, labels=[], created_at="2026-01-01", text="good",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="good",
             ),
         ]
         report = generate_report(items, [(0, 12.34)], [])
@@ -212,16 +315,48 @@ def test_with_outliers_shows_distance(self):
         assert "Spam Issue" in report
         assert "12.34" in report
         assert "1 outliers flagged" in report
+        # Age column should be present
+        assert "| Age |" in report
+
+    def test_outlier_borderline_in_details(self):
+        """Borderline outliers should be in a <details> section."""
+        from embedding_utils import _OutlierResult
+        items = [
+            TriageItem(
+                number=10, title="Borderline", html_url="https://example.com/10",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam",
+            ),
+        ]
+        # Create outlier results with cutoff=10.0, distance=12.0 (< 2*cutoff=20)
+        outlier_results = _OutlierResult([(0, 12.0)])
+        outlier_results.cutoff = 10.0
+        report = generate_report(items, outlier_results, [])
+        assert "<details>" in report
+        assert "Borderline" in report
+
+    def test_outlier_high_confidence(self):
+        """Items with distance > 2x cutoff should be in high confidence section."""
+        from embedding_utils import _OutlierResult
+        items = [
+            TriageItem(
+                number=10, title="Definite Spam", html_url="https://example.com/10",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam",
+            ),
+        ]
+        outlier_results = _OutlierResult([(0, 25.0)])
+        outlier_results.cutoff = 10.0
+        report = generate_report(items, outlier_results, [])
+        assert "High Confidence" in report
 
-    def test_with_duplicates(self):
+    def test_with_duplicates_suggested_action(self):
         items = [
             TriageItem(
                 number=1, title="First", html_url="https://example.com/1",
-                is_pr=False, labels=[], created_at="2026-01-01", text="a",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="a",
             ),
             TriageItem(
                 number=2, title="Second", html_url="https://example.com/2",
-                is_pr=True, labels=[], created_at="2026-01-01", text="b",
+                is_pr=True, labels=[], created_at="2026-02-01T00:00:00Z", text="b",
             ),
         ]
         report = generate_report(items, [], [(0, 1, 0.954)])
@@ -229,12 +364,42 @@ def test_with_duplicates(self):
         assert "#2" in report
         assert "0.954" in report
         assert "1 duplicate pairs found" in report
+        assert "Suggested Action" in report
+        assert "Link PR to issue" in report
+
+    def test_duplicate_both_issues_close_newer(self):
+        items = [
+            TriageItem(
+                number=1, title="First", html_url="https://example.com/1",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="a",
+            ),
+            TriageItem(
+                number=2, title="Second", html_url="https://example.com/2",
+                is_pr=False, labels=[], created_at="2026-02-01T00:00:00Z", text="b",
+            ),
+        ]
+        report = generate_report(items, [], [(0, 1, 0.95)])
+        assert "Close #2 as duplicate" in report
+
+    def test_duplicate_both_prs_review(self):
+        items = [
+            TriageItem(
+                number=1, title="PR A", html_url="https://example.com/1",
+                is_pr=True, labels=[], created_at="2026-01-01T00:00:00Z", text="a",
+            ),
+            TriageItem(
+                number=2, title="PR B", html_url="https://example.com/2",
+                is_pr=True, labels=[], created_at="2026-01-01T00:00:00Z", text="b",
+            ),
+        ]
+        report = generate_report(items, [], [(0, 1, 0.95)])
+        assert "Review for overlap" in report
 
     def test_pr_type_label(self):
         items = [
             TriageItem(
                 number=5, title="PR Title", html_url="https://example.com/5",
-                is_pr=True, labels=[], created_at="2026-01-01", text="pr",
+                is_pr=True, labels=[], created_at="2026-01-01T00:00:00Z", text="pr",
             ),
         ]
         report = generate_report(items, [(0, 8.5)], [])
@@ -244,7 +409,7 @@ def test_footer_present(self):
         items = [
             TriageItem(
                 number=1, title="T", html_url="u",
-                is_pr=False, labels=[], created_at="d", text="t",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="t",
             ),
         ]
         report = generate_report(items, [], [])
@@ -386,26 +551,39 @@ def test_http_error_is_non_fatal(self, mock_urlopen):
 class TestGenerateReportWithLabels:
     """Tests for label suggestions in the report."""
 
-    def test_report_includes_label_section_with_top1_only(self):
+    def test_report_includes_label_section_high_confidence(self):
+        """High-confidence label (raw_sim >= 0.5) should appear in main table."""
         items = [
             TriageItem(
                 number=1, title="Fix crash", html_url="https://example.com/1",
-                is_pr=False, labels=[], created_at="2026-01-01", text="crash",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="crash",
             ),
         ]
-        suggestions = [[("bug", 0.85), ("enhancement", 0.42)]]
+        suggestions = [[("bug", 0.85)]]
         report = generate_report(items, [], [], label_suggestions=suggestions)
         assert "Suggested Labels" in report
-        # Only the top-1 label should appear in the report
         assert "`bug` (0.85)" in report
-        assert "`enhancement`" not in report
         assert "1 items suggested for labeling" in report
 
+    def test_report_low_confidence_in_details(self):
+        """Low-confidence label (raw_sim < 0.5) should be in <details> section."""
+        items = [
+            TriageItem(
+                number=1, title="Something", html_url="https://example.com/1",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="something",
+            ),
+        ]
+        suggestions = [[("maybe-bug", 0.35)]]
+        report = generate_report(items, [], [], label_suggestions=suggestions)
+        assert "Low-confidence suggestions" in report
+        assert "<details>" in report
+        assert "`maybe-bug` (0.35)" in report
+
     def test_report_skips_already_labeled_items(self):
         items = [
             TriageItem(
                 number=1, title="Already labeled", html_url="https://example.com/1",
-                is_pr=False, labels=["bug"], created_at="2026-01-01", text="bug",
+                is_pr=False, labels=["bug"], created_at="2026-01-01T00:00:00Z", text="bug",
             ),
         ]
         suggestions = [[("bug", 0.95)]]
@@ -417,11 +595,11 @@ def test_report_excludes_outliers_from_suggestions(self):
         items = [
             TriageItem(
                 number=1, title="Spam garbage", html_url="https://example.com/1",
-                is_pr=False, labels=[], created_at="2026-01-01", text="spam",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam",
             ),
             TriageItem(
                 number=2, title="Real bug", html_url="https://example.com/2",
-                is_pr=False, labels=[], created_at="2026-01-01", text="bug",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="bug",
             ),
         ]
         suggestions = [[("bug", 0.85)], [("bug", 0.90)]]
@@ -436,12 +614,33 @@ def test_report_without_label_suggestions(self):
         items = [
             TriageItem(
                 number=1, title="T", html_url="u",
-                is_pr=False, labels=[], created_at="d", text="t",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="t",
             ),
         ]
         report = generate_report(items, [], [], label_suggestions=None)
         assert "Suggested Labels" not in report
 
+    def test_label_concentration_warning(self):
+        """When >50% of suggestions point to the same label, a warning should appear."""
+        items = [
+            TriageItem(
+                number=i, title=f"Item {i}", html_url=f"https://example.com/{i}",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}",
+            )
+            for i in range(4)
+        ]
+        # 3 out of 4 items get "bug" label -> 75% concentration
+        suggestions = [
+            [("bug", 0.85)],
+            [("bug", 0.80)],
+            [("bug", 0.75)],
+            [("enhancement", 0.90)],
+        ]
+        report = generate_report(items, [], [], label_suggestions=suggestions)
+        assert "Warning" in report
+        assert "`bug`" in report
+        assert "3/4" in report
+
 
 class TestMain:
     """Tests for the main orchestration function."""
@@ -485,7 +684,7 @@ def test_full_flow_with_enough_items(
         items = [
             TriageItem(
                 number=i, title=f"Item {i}", html_url=f"https://example.com/{i}",
-                is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}",
             )
             for i in range(n)
         ]
@@ -527,12 +726,12 @@ def test_skips_outlier_detection_for_few_items(
         self, mock_fetch, mock_labels, mock_embed, mock_norm, mock_reduce,
         mock_outliers, mock_dupes, mock_suggest, mock_write, mock_create,
     ):
-        """With < MIN_SAMPLES (150) items, outlier detection should be skipped."""
-        n = MIN_SAMPLES_FOR_OUTLIER_DETECTION - 1  # 149
+        """With < MIN_SAMPLES items, outlier detection should be skipped."""
+        n = MIN_SAMPLES_FOR_OUTLIER_DETECTION - 1
         items = [
             TriageItem(
                 number=i, title=f"Item {i}", html_url=f"https://example.com/{i}",
-                is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}",
             )
             for i in range(n)
         ]
@@ -568,7 +767,7 @@ def test_dry_run_skips_issue_creation_and_labeling(
         items = [
             TriageItem(
                 number=1, title="Item", html_url="https://example.com/1",
-                is_pr=False, labels=[], created_at="2026-01-01", text="text",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="text",
             )
         ]
         mock_fetch.return_value = items
@@ -594,19 +793,19 @@ def test_dry_run_skips_issue_creation_and_labeling(
     @patch("sweep.embed_texts")
     @patch("sweep.fetch_repo_labels")
     @patch("sweep.fetch_all_open_items")
-    def test_applies_labels_to_unlabeled_items(
+    def test_labels_not_auto_applied(
         self, mock_fetch, mock_labels, mock_embed, mock_norm,
         mock_dupes, mock_suggest, mock_apply, mock_write, mock_create,
     ):
-        """When not dry run, top-1 label should be applied to unlabeled items."""
+        """Auto-labeling is disabled; labels should appear in report only."""
         items = [
             TriageItem(
                 number=1, title="Crash bug", html_url="https://example.com/1",
-                is_pr=False, labels=[], created_at="2026-01-01", text="crash",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="crash",
             ),
             TriageItem(
                 number=2, title="Already labeled", html_url="https://example.com/2",
-                is_pr=False, labels=["enhancement"], created_at="2026-01-01", text="feat",
+                is_pr=False, labels=["enhancement"], created_at="2026-01-01T00:00:00Z", text="feat",
             ),
         ]
         mock_fetch.return_value = items
@@ -614,8 +813,8 @@ def test_applies_labels_to_unlabeled_items(
             RepoLabel(name="bug", description="Broken", text="bug: Broken"),
         ]
         mock_suggest.return_value = [
-            [("bug", 0.90)],       # item 1: unlabeled, should get labeled
-            [("bug", 0.45)],       # item 2: already labeled, skip
+            [("bug", 0.90)],
+            [("bug", 0.45)],
         ]
 
         embeddings = np.random.randn(2, 384).astype(np.float32)
@@ -624,8 +823,8 @@ def test_applies_labels_to_unlabeled_items(
 
         main()
 
-        # Only item 1 (unlabeled) should get a label applied
-        mock_apply.assert_called_once_with(1, ["bug"])
+        # Auto-labeling is disabled — apply_labels_to_item should never be called
+        mock_apply.assert_not_called()
 
     @patch("sweep.create_report_issue")
     @patch("sweep.write_report")
@@ -638,16 +837,16 @@ def test_applies_labels_to_unlabeled_items(
     @patch("sweep.embed_texts")
     @patch("sweep.fetch_repo_labels")
     @patch("sweep.fetch_all_open_items")
-    def test_outliers_do_not_get_labeled(
+    def test_outliers_excluded_from_report_suggestions(
         self, mock_fetch, mock_labels, mock_embed, mock_norm, mock_reduce,
         mock_outliers, mock_dupes, mock_suggest, mock_apply, mock_write, mock_create,
     ):
-        """Items flagged as outliers should not receive label suggestions."""
+        """Items flagged as outliers should not appear in report label suggestions."""
         n = MIN_SAMPLES_FOR_OUTLIER_DETECTION
         items = [
             TriageItem(
                 number=i, title=f"Item {i}", html_url=f"https://example.com/{i}",
-                is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}",
+                is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}",
             )
             for i in range(n)
         ]
@@ -655,9 +854,7 @@ def test_outliers_do_not_get_labeled(
         mock_labels.return_value = [
             RepoLabel(name="bug", description="Broken", text="bug: Broken"),
         ]
-        # Outlier detection flags items 0 and 5 (now returns tuples with distances)
         mock_outliers.return_value = [(0, 12.5), (5, 15.3)]
-        # Every item gets a suggestion
         mock_suggest.return_value = [[("bug", 0.85)] for _ in range(n)]
 
         embeddings = np.random.randn(n, 384).astype(np.float32)
@@ -667,9 +864,10 @@ def test_outliers_do_not_get_labeled(
 
         main()
 
-        # Items 0 and 5 are outliers — should NOT be labeled
-        labeled_numbers = [call.args[0] for call in mock_apply.call_args_list]
-        assert 0 not in labeled_numbers
-        assert 5 not in labeled_numbers
-        # Other items should be labeled (n - 2 outliers)
-        assert mock_apply.call_count == n - 2
+        # Auto-labeling is disabled
+        mock_apply.assert_not_called()
+        # Report should still be generated (outliers excluded from suggestions in report)
+        mock_write.assert_called_once()
+        report = mock_write.call_args[0][0]
+        # Outlier items 0 and 5 should not appear in the label suggestions section
+        assert "Item 0" not in report.split("Suggested Labels")[1] if "Suggested Labels" in report else True
diff --git a/.github/workflows/triage-sweep.yml b/.github/workflows/triage-sweep.yml
index b3f4840ffd..ba5514dbf6 100644
--- a/.github/workflows/triage-sweep.yml
+++ b/.github/workflows/triage-sweep.yml
@@ -3,13 +3,19 @@ name: Triage Sweep
 on:
   workflow_dispatch:
     inputs:
-      outlier_percentile:
+      iqr_multiplier:
         description: >-
-          Chi2 percentile for dimension-aware outlier cutoff (0-1).
-          0.997 is the multivariate equivalent of 3-sigma.
-          Lower = more aggressive flagging.
+          IQR multiplier for outlier cutoff.
+          cutoff = Q75 + multiplier * IQR.
+          Higher = fewer outliers flagged.
         type: number
-        default: 0.997
+        default: 3.0
+      max_outlier_pct:
+        description: >-
+          Maximum fraction of items that can be flagged as outliers (0-1).
+          Hard cap to prevent over-flagging.
+        type: number
+        default: 0.05
       contamination:
         description: >-
           Expected fraction of outliers in the data (0-0.5).
@@ -39,6 +45,7 @@ on:
 permissions:
   contents: read
   issues: write
+  pull-requests: write
 
 concurrency:
   group: triage-sweep
@@ -69,7 +76,7 @@ jobs:
       - name: Cache FastEmbed model weights
         uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5
         with:
-          path: ~/.cache/fastembed_cache
+          path: ${{ github.workspace }}/.fastembed_cache
           key: fastembed-bge-small-en-v1.5
 
       - name: Run triage sweep
@@ -77,7 +84,9 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_REPOSITORY: ${{ github.repository }}
-          INPUT_OUTLIER_PERCENTILE: ${{ inputs.outlier_percentile }}
+          FASTEMBED_CACHE_PATH: ${{ github.workspace }}/.fastembed_cache
+          INPUT_IQR_MULTIPLIER: ${{ inputs.iqr_multiplier }}
+          INPUT_MAX_OUTLIER_PCT: ${{ inputs.max_outlier_pct }}
           INPUT_CONTAMINATION: ${{ inputs.contamination }}
           INPUT_COSINE_THRESHOLD: ${{ inputs.cosine_threshold }}
           INPUT_MAX_ITEMS: ${{ inputs.max_items }}