From cc2b13e332d8eebd1757c6d144bdae01331a8407 Mon Sep 17 00:00:00 2001
From: Zander Raycraft <zanderjraycraft@gmail.com>
Date: Sat, 21 Mar 2026 20:06:29 -0500
Subject: [PATCH] updated mahalanobis threshold to be multi-dim aware

---
 .github/scripts/triage/embedding_utils.py     | 28 ++++---
 .github/scripts/triage/requirements.txt       |  1 +
 .github/scripts/triage/sweep.py               | 83 ++++++++++---------
 .../scripts/triage/test_embedding_utils.py    | 62 ++++++++++----
 .github/scripts/triage/test_sweep.py          | 81 +++++++++++++-----
 .github/workflows/triage-sweep.yml            | 17 ++--
 6 files changed, 179 insertions(+), 93 deletions(-)

diff --git a/.github/scripts/triage/embedding_utils.py b/.github/scripts/triage/embedding_utils.py
index 95e872de15..ede06d61cb 100644
--- a/.github/scripts/triage/embedding_utils.py
+++ b/.github/scripts/triage/embedding_utils.py
@@ -8,6 +8,7 @@
 import numpy as np
 from numpy.typing import NDArray
 from fastembed import TextEmbedding
+from scipy.stats import chi2
 from sklearn.decomposition import PCA
 from sklearn.covariance import EllipticEnvelope
 from sklearn.metrics.pairwise import cosine_similarity
@@ -53,15 +54,12 @@ def normalize_rows(matrix: NDArray[np.float32]) -> NDArray[np.float32]:
 
 def reduce_dimensions(
     matrix: NDArray[np.float32],
-    variance_ratio: float,
     max_components: int,
 ) -> NDArray[np.float32]:
     """Reduce dimensionality via PCA.
 
     Computes n_components = min(max_components, n-1, d). If n_components < 1,
     returns the matrix unchanged. Logs explained variance for observability.
-    The variance_ratio parameter documents intent but is not strictly enforced;
-    the actual retained variance depends on the data and component cap.
     """
     n, d = matrix.shape
     if n <= 1:
@@ -80,25 +78,33 @@ def reduce_dimensions(
 
 def detect_outliers(
     matrix: NDArray[np.float32],
-    threshold: float,
-) -> list[int]:
-    """Flag items whose Mahalanobis distance exceeds the threshold.
+    percentile: float = 0.997,
+    contamination: float = 0.1,
+) -> list[tuple[int, float]]:
+    """Flag items whose Mahalanobis distance exceeds a dimension-aware cutoff.
 
     Uses EllipticEnvelope (robust covariance via MCD) to estimate the
     multivariate Gaussian, then computes sqrt(squared Mahalanobis distance)
-    for each sample. Returns indices of outliers sorted ascending.
+    for each sample. The cutoff is derived from the chi2 distribution with
+    k degrees of freedom (k = number of features), so it scales correctly
+    regardless of dimensionality. Returns (index, distance) tuples sorted
+    by index ascending.
     """
-    n = matrix.shape[0]
+    n, k = matrix.shape
     if n < 2:
         return []
 
-    envelope = EllipticEnvelope(contamination=0.1, random_state=42)
+    envelope = EllipticEnvelope(contamination=contamination, random_state=42)
     envelope.fit(matrix)
 
     # .mahalanobis() returns squared Mahalanobis distances
     distances = np.sqrt(envelope.mahalanobis(matrix))
-    outlier_mask = distances > threshold
-    return list(np.where(outlier_mask)[0])
+
+    # Dimension-aware cutoff: sqrt(chi2.ppf(percentile, df=k))
+    cutoff = np.sqrt(chi2.ppf(percentile, df=k))
+    outlier_mask = distances > cutoff
+    indices = np.where(outlier_mask)[0]
+    return [(int(idx), float(distances[idx])) for idx in indices]
 
 
 def find_duplicate_pairs(
diff --git a/.github/scripts/triage/requirements.txt b/.github/scripts/triage/requirements.txt
index 82531ae4bf..40b48ddc85 100644
--- a/.github/scripts/triage/requirements.txt
+++ b/.github/scripts/triage/requirements.txt
@@ -1,3 +1,4 @@
 fastembed>=0.5.0
 numpy>=1.26.0
 scikit-learn>=1.4.0
+scipy>=1.10.0
diff --git a/.github/scripts/triage/sweep.py b/.github/scripts/triage/sweep.py
index f6cebc7f15..8956a72f15 100644
--- a/.github/scripts/triage/sweep.py
+++ b/.github/scripts/triage/sweep.py
@@ -25,9 +25,13 @@
 
 # ── Thresholds (overridable via workflow_dispatch inputs) ──────────────
 
-# Mahalanobis distance beyond which an item is flagged as an outlier.
-# Default 3.0 ~ 99.7% of a Gaussian distribution (3-sigma rule).
-MAHALANOBIS_THRESHOLD: float = float(os.environ.get("INPUT_MAHALANOBIS_THRESHOLD", "3.0"))
+# Chi2 percentile for the dimension-aware outlier cutoff.
+# 0.997 is the multivariate equivalent of the 3-sigma rule.
+OUTLIER_PERCENTILE: float = float(os.environ.get("INPUT_OUTLIER_PERCENTILE", "0.997"))
+
+# EllipticEnvelope contamination: expected fraction of outliers in the data.
+# Governs how aggressively the robust covariance downweights extreme points.
+CONTAMINATION: float = float(os.environ.get("INPUT_CONTAMINATION", "0.1"))
 
 # Cosine similarity above which two items are flagged as duplicates.
 # 0.92 catches near-identical issues while tolerating paraphrasing.
@@ -42,18 +46,10 @@
 # ── Fixed constants (not user-configurable) ───────────────────────────
 
 # Minimum number of samples required for EllipticEnvelope to fit
-# a Gaussian. Below this, outlier detection is skipped because
-# covariance estimation is unreliable.
-MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 10
-
-# PCA: retain components explaining this fraction of variance.
-# 0.95 keeps 95% of information while reducing dimensionality enough
-# for EllipticEnvelope to be numerically stable.
-PCA_VARIANCE_RATIO: float = 0.95
-
-# PCA: maximum number of components regardless of variance ratio.
-# Caps dimensionality for EllipticEnvelope's n_samples > n_features^2 rule.
-PCA_MAX_COMPONENTS: int = 50
+# a Gaussian reliably. Must be >= 3 * PCA_MAX_COMPONENTS (the PCA
+# component cap below) so the covariance is fit on enough data points.
+PCA_MAX_COMPONENTS: int = 33
+MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 100
 
 # GitHub REST API page size (max allowed is 100).
 API_PAGE_SIZE: int = 100
@@ -152,19 +148,27 @@ class RepoLabel(TypedDict):
 
 
 def fetch_repo_labels() -> list[RepoLabel]:
-    """Fetch all labels from the repository.
+    """Fetch all labels from the repository, paginating if needed.
 
     Returns labels with name, description, and a text field suitable
     for embedding ("name: description"). Labels with no description
     use just the name.
     """
-    data = github_api_get("/labels?per_page=100")
     labels: list[RepoLabel] = []
-    for raw in data:
-        name = raw["name"]
-        desc = raw.get("description", "") or ""
-        text = f"{name}: {desc}" if desc else name
-        labels.append(RepoLabel(name=name, description=desc, text=text))
+    page = 1
+
+    while True:
+        data = github_api_get(f"/labels?per_page={API_PAGE_SIZE}&page={page}")
+        for raw in data:
+            name = raw["name"]
+            desc = raw.get("description", "") or ""
+            text = f"{name}: {desc}" if desc else name
+            labels.append(RepoLabel(name=name, description=desc, text=text))
+
+        if len(data) < API_PAGE_SIZE:
+            break
+        page += 1
+
     return labels
 
 
@@ -199,7 +203,7 @@ def apply_labels_to_item(item_number: int, labels: list[str]) -> None:
 
 def generate_report(
     items: list[TriageItem],
-    outlier_indices: list[int],
+    outlier_results: list[tuple[int, float]],
     duplicate_pairs: list[tuple[int, int, float]],
     label_suggestions: list[list[tuple[str, float]]] | None = None,
 ) -> str:
@@ -212,24 +216,24 @@ def generate_report(
         "",
         f"**Run:** {now} UTC",
         f"**Items analyzed:** {len(items)}",
-        f"**Thresholds:** Mahalanobis > {MAHALANOBIS_THRESHOLD}, Cosine > {COSINE_THRESHOLD}",
+        f"**Thresholds:** Outlier percentile {OUTLIER_PERCENTILE}, Cosine > {COSINE_THRESHOLD}",
         "",
-        f"### Potential Outliers / Spam ({len(outlier_indices)})",
+        f"### Potential Outliers / Spam ({len(outlier_results)})",
         "",
         "Items with unusually high Mahalanobis distance from the distribution center.",
         "These may be spam, off-topic, or poorly described.",
         "",
     ]
 
-    if outlier_indices:
+    if outlier_results:
         lines.append("| # | Type | Title | Distance |")
         lines.append("|---|------|-------|----------|")
-        for idx in outlier_indices:
+        for idx, distance in outlier_results:
             item = items[idx]
             kind = "PR" if item["is_pr"] else "Issue"
             lines.append(
                 f"| [#{item['number']}]({item['html_url']}) "
-                f"| {kind} | {item['title']} | flagged |"
+                f"| {kind} | {item['title']} | {distance:.2f} |"
             )
     else:
         lines.append("None found.")
@@ -259,11 +263,12 @@ def generate_report(
         lines.append("None found.")
 
     # ── Label suggestions section ────────────────────────────────────
-    outlier_set = set(outlier_indices)
+    outlier_set = {idx for idx, _ in outlier_results}
     if label_suggestions is not None:
         # Only unlabeled, non-outlier items — spam shouldn't get categorized
+        # Show only the top-1 label to match what actually gets applied
         items_with_suggestions = [
-            (i, sugs) for i, sugs in enumerate(label_suggestions)
+            (i, sugs[:1]) for i, sugs in enumerate(label_suggestions)
             if sugs and not items[i]["labels"] and i not in outlier_set
         ]
         lines.extend([
@@ -276,8 +281,8 @@ def generate_report(
         ])
 
         if items_with_suggestions:
-            lines.append("| # | Type | Title | Suggested Labels |")
-            lines.append("|---|------|-------|-----------------|")
+            lines.append("| # | Type | Title | Suggested Label |")
+            lines.append("|---|------|-------|--------------------|")
             for idx, sugs in items_with_suggestions:
                 item = items[idx]
                 kind = "PR" if item["is_pr"] else "Issue"
@@ -293,7 +298,7 @@ def generate_report(
         "",
         "### Summary",
         "",
-        f"- {len(outlier_indices)} outliers flagged for review",
+        f"- {len(outlier_results)} outliers flagged for review",
         f"- {len(duplicate_pairs)} duplicate pairs found",
         f"- {len(items)} items analyzed in total",
     ])
@@ -385,10 +390,12 @@ def main() -> None:
     embeddings = normalize_rows(embeddings)
 
     # 6. Outlier detection (Mahalanobis via EllipticEnvelope)
-    outlier_indices: list[int] = []
+    outlier_results: list[tuple[int, float]] = []
     if len(items) >= MIN_SAMPLES_FOR_OUTLIER_DETECTION:
-        reduced = reduce_dimensions(embeddings, PCA_VARIANCE_RATIO, PCA_MAX_COMPONENTS)
-        outlier_indices = detect_outliers(reduced, MAHALANOBIS_THRESHOLD)
+        reduced = reduce_dimensions(embeddings, PCA_MAX_COMPONENTS)
+        outlier_results = detect_outliers(
+            reduced, percentile=OUTLIER_PERCENTILE, contamination=CONTAMINATION,
+        )
     else:
         print(
             f"Skipping outlier detection: {len(items)} items < "
@@ -411,7 +418,7 @@ def main() -> None:
 
         # Apply top label to unlabeled items (unless dry run)
         # Skip outliers — flagged items shouldn't get categorized
-        outlier_set = set(outlier_indices)
+        outlier_set = {idx for idx, _ in outlier_results}
         if not DRY_RUN:
             applied_count = 0
             for i, sugs in enumerate(label_suggestions):
@@ -424,7 +431,7 @@ def main() -> None:
         print("No repo labels found — skipping label suggestions")
 
     # 9. Generate report
-    report = generate_report(items, outlier_indices, duplicate_pairs, label_suggestions)
+    report = generate_report(items, outlier_results, duplicate_pairs, label_suggestions)
 
     # 10. Write report to file (for summary step)
     write_report(report)
diff --git a/.github/scripts/triage/test_embedding_utils.py b/.github/scripts/triage/test_embedding_utils.py
index c192f63aba..6fbfe83afa 100644
--- a/.github/scripts/triage/test_embedding_utils.py
+++ b/.github/scripts/triage/test_embedding_utils.py
@@ -107,13 +107,13 @@ class TestReduceDimensions:
 
     def test_single_sample_returns_unchanged(self):
         m = np.random.randn(1, 50).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 10)
+        result = reduce_dimensions(m, 10)
         np.testing.assert_array_equal(result, m)
 
     def test_reduces_dimensions(self):
         rng = np.random.default_rng(42)
         m = rng.standard_normal((100, 50)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 10)
+        result = reduce_dimensions(m, 10)
         assert result.shape == (100, 10)
         assert result.dtype == np.float32
 
@@ -121,20 +121,20 @@ def test_caps_at_n_minus_1(self):
         rng = np.random.default_rng(42)
         # 5 samples, 20 features -> max components = 4 (n-1)
         m = rng.standard_normal((5, 20)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 50)
+        result = reduce_dimensions(m, 50)
         assert result.shape == (5, 4)
 
     def test_caps_at_d(self):
         rng = np.random.default_rng(42)
         # 100 samples, 3 features -> max components = 3
         m = rng.standard_normal((100, 3)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 50)
+        result = reduce_dimensions(m, 50)
         assert result.shape == (100, 3)
 
     def test_max_components_respected(self):
         rng = np.random.default_rng(42)
         m = rng.standard_normal((50, 30)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 5)
+        result = reduce_dimensions(m, 5)
         assert result.shape[1] == 5
 
 
@@ -143,13 +143,13 @@ class TestDetectOutliers:
 
     def test_single_sample_returns_empty(self):
         m = np.random.randn(1, 5).astype(np.float32)
-        result = detect_outliers(m, 3.0)
+        result = detect_outliers(m, percentile=0.997)
         assert result == []
 
     def test_empty_returns_empty(self):
         # n < 2 case
         m = np.empty((0, 5), dtype=np.float32)
-        result = detect_outliers(m, 3.0)
+        result = detect_outliers(m, percentile=0.997)
         assert result == []
 
     def test_finds_outliers_in_synthetic_data(self):
@@ -158,25 +158,51 @@ def test_finds_outliers_in_synthetic_data(self):
         cluster = rng.standard_normal((50, 3)).astype(np.float32) * 0.1
         outlier = np.array([[100.0, 100.0, 100.0]], dtype=np.float32)
         m = np.vstack([cluster, outlier])
-        result = detect_outliers(m, 3.0)
+        result = detect_outliers(m, percentile=0.997)
         # The outlier (index 50) should be detected
-        assert 50 in result
+        outlier_indices = [idx for idx, _ in result]
+        assert 50 in outlier_indices
 
-    def test_returns_list_of_ints(self):
+    def test_returns_list_of_index_distance_tuples(self):
         rng = np.random.default_rng(42)
-        m = rng.standard_normal((20, 3)).astype(np.float32)
-        result = detect_outliers(m, 3.0)
+        # Tight cluster + outlier to guarantee at least one result
+        cluster = rng.standard_normal((20, 3)).astype(np.float32) * 0.1
+        far_point = np.array([[50.0, 50.0, 50.0]], dtype=np.float32)
+        m = np.vstack([cluster, far_point])
+        result = detect_outliers(m, percentile=0.997)
         assert isinstance(result, list)
-        for idx in result:
-            assert isinstance(idx, (int, np.integer))
-
-    def test_low_threshold_flags_more(self):
+        for item in result:
+            assert isinstance(item, tuple)
+            assert len(item) == 2
+            idx, dist = item
+            assert isinstance(idx, int)
+            assert isinstance(dist, float)
+            assert dist > 0
+
+    def test_low_percentile_flags_more(self):
         rng = np.random.default_rng(42)
         m = rng.standard_normal((30, 3)).astype(np.float32)
-        low = detect_outliers(m, 1.0)
-        high = detect_outliers(m, 10.0)
+        low = detect_outliers(m, percentile=0.5)
+        high = detect_outliers(m, percentile=0.999)
         assert len(low) >= len(high)
 
+    def test_dimension_aware_cutoff(self):
+        """High-dimensional data should not flag everything with default percentile."""
+        rng = np.random.default_rng(42)
+        # 500 samples, 10 dims — well-conditioned for robust covariance
+        m = rng.standard_normal((500, 10)).astype(np.float32)
+        result = detect_outliers(m, percentile=0.997)
+        # With a proper dimension-aware cutoff on clean Gaussian data,
+        # only a small fraction should be flagged (well under 50%)
+        assert len(result) < 250
+
+    def test_contamination_parameter(self):
+        rng = np.random.default_rng(42)
+        m = rng.standard_normal((50, 3)).astype(np.float32)
+        # Should not raise with different contamination values
+        result = detect_outliers(m, percentile=0.997, contamination=0.05)
+        assert isinstance(result, list)
+
 
 class TestFindDuplicatePairs:
     """Tests for cosine similarity duplicate detection."""
diff --git a/.github/scripts/triage/test_sweep.py b/.github/scripts/triage/test_sweep.py
index ea5e8cf7c9..8ab2c76ec9 100644
--- a/.github/scripts/triage/test_sweep.py
+++ b/.github/scripts/triage/test_sweep.py
@@ -33,6 +33,7 @@
     REPORT_LABEL,
     API_PAGE_SIZE,
     MIN_SAMPLES_FOR_OUTLIER_DETECTION,
+    PCA_MAX_COMPONENTS,
 )
 
 
@@ -82,6 +83,17 @@ def test_http_error_exits(self, mock_urlopen):
         assert exc_info.value.code == 1
 
 
+class TestConstants:
+    """Tests for module-level constants."""
+
+    def test_min_samples_is_at_least_3x_pca_max(self):
+        """MIN_SAMPLES must be >= 3 * PCA_MAX_COMPONENTS for reliable covariance."""
+        assert MIN_SAMPLES_FOR_OUTLIER_DETECTION >= 3 * PCA_MAX_COMPONENTS
+
+    def test_min_samples_is_100(self):
+        assert MIN_SAMPLES_FOR_OUTLIER_DETECTION == 100
+
+
 class TestFetchAllOpenItems:
     """Tests for fetch_all_open_items."""
 
@@ -164,7 +176,7 @@ def test_no_findings(self):
         assert "0 outliers flagged" in report
         assert "0 duplicate pairs found" in report
 
-    def test_with_outliers(self):
+    def test_with_outliers_shows_distance(self):
         items = [
             TriageItem(
                 number=10, title="Spam Issue", html_url="https://example.com/10",
@@ -175,9 +187,10 @@ def test_with_outliers(self):
                 is_pr=False, labels=[], created_at="2026-01-01", text="good",
             ),
         ]
-        report = generate_report(items, [0], [])
+        report = generate_report(items, [(0, 12.34)], [])
         assert "#10" in report
         assert "Spam Issue" in report
+        assert "12.34" in report
         assert "1 outliers flagged" in report
 
     def test_with_duplicates(self):
@@ -204,7 +217,7 @@ def test_pr_type_label(self):
                 is_pr=True, labels=[], created_at="2026-01-01", text="pr",
             ),
         ]
-        report = generate_report(items, [0], [])
+        report = generate_report(items, [(0, 8.5)], [])
         assert "| PR |" in report
 
     def test_footer_present(self):
@@ -295,6 +308,27 @@ def test_null_description_handled(self, mock_get):
         labels = fetch_repo_labels()
         assert labels[0]["text"] == "wontfix"
 
+    @patch("sweep.API_PAGE_SIZE", 2)
+    @patch("sweep.github_api_get")
+    def test_label_pagination(self, mock_get):
+        """Repos with more labels than one page should fetch all pages."""
+        mock_get.side_effect = [
+            # First page: full (2 items = API_PAGE_SIZE)
+            [
+                {"name": "bug", "description": "Broken"},
+                {"name": "feature", "description": "New"},
+            ],
+            # Second page: partial (1 item < API_PAGE_SIZE) -> stop
+            [
+                {"name": "docs", "description": "Documentation"},
+            ],
+        ]
+        labels = fetch_repo_labels()
+        assert len(labels) == 3
+        assert mock_get.call_count == 2
+        assert labels[0]["name"] == "bug"
+        assert labels[2]["name"] == "docs"
+
 
 class TestApplyLabelsToItem:
     """Tests for apply_labels_to_item."""
@@ -332,7 +366,7 @@ def test_http_error_is_non_fatal(self, mock_urlopen):
 class TestGenerateReportWithLabels:
     """Tests for label suggestions in the report."""
 
-    def test_report_includes_label_section(self):
+    def test_report_includes_label_section_with_top1_only(self):
         items = [
             TriageItem(
                 number=1, title="Fix crash", html_url="https://example.com/1",
@@ -342,7 +376,9 @@ def test_report_includes_label_section(self):
         suggestions = [[("bug", 0.85), ("enhancement", 0.42)]]
         report = generate_report(items, [], [], label_suggestions=suggestions)
         assert "Suggested Labels" in report
+        # Only the top-1 label should appear in the report
         assert "`bug` (0.85)" in report
+        assert "`enhancement`" not in report
         assert "1 items suggested for labeling" in report
 
     def test_report_skips_already_labeled_items(self):
@@ -369,11 +405,11 @@ def test_report_excludes_outliers_from_suggestions(self):
             ),
         ]
         suggestions = [[("bug", 0.85)], [("bug", 0.90)]]
-        # Item 0 is an outlier — should be excluded from label suggestions
-        report = generate_report(items, [0], [], label_suggestions=suggestions)
+        # Item 0 is an outlier (with distance) — should be excluded from label suggestions
+        report = generate_report(items, [(0, 15.2)], [], label_suggestions=suggestions)
         assert "1 unlabeled items" in report  # only item 2
         assert "#2" in report
-        # Item 1 (outlier) should NOT be in the suggestions table
+        # Item 0 (outlier) should NOT be in the suggestions table
         assert "Spam garbage" not in report.split("Suggested Labels")[1]
 
     def test_report_without_label_suggestions(self):
@@ -425,22 +461,23 @@ def test_full_flow_with_enough_items(
         mock_outliers, mock_dupes, mock_suggest, mock_write, mock_create,
     ):
         """Test the full flow with >= MIN_SAMPLES items (outlier detection runs)."""
+        n = MIN_SAMPLES_FOR_OUTLIER_DETECTION
         items = [
             TriageItem(
                 number=i, title=f"Item {i}", html_url=f"https://example.com/{i}",
                 is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}",
             )
-            for i in range(15)
+            for i in range(n)
         ]
         mock_fetch.return_value = items
         mock_labels.return_value = [
             RepoLabel(name="bug", description="Something broken", text="bug: Something broken"),
         ]
 
-        embeddings = np.random.randn(15, 384).astype(np.float32)
+        embeddings = np.random.randn(n, 384).astype(np.float32)
         mock_embed.return_value = embeddings
         mock_norm.return_value = embeddings
-        mock_reduce.return_value = np.random.randn(15, 10).astype(np.float32)
+        mock_reduce.return_value = np.random.randn(n, 10).astype(np.float32)
 
         main()
 
@@ -470,17 +507,18 @@ def test_skips_outlier_detection_for_few_items(
         self, mock_fetch, mock_labels, mock_embed, mock_norm, mock_reduce,
         mock_outliers, mock_dupes, mock_suggest, mock_write, mock_create,
     ):
-        """With < MIN_SAMPLES items, outlier detection should be skipped."""
+        """With < MIN_SAMPLES (100) items, outlier detection should be skipped."""
+        n = MIN_SAMPLES_FOR_OUTLIER_DETECTION - 1  # 99
         items = [
             TriageItem(
                 number=i, title=f"Item {i}", html_url=f"https://example.com/{i}",
                 is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}",
             )
-            for i in range(5)
+            for i in range(n)
         ]
         mock_fetch.return_value = items
 
-        embeddings = np.random.randn(5, 384).astype(np.float32)
+        embeddings = np.random.randn(n, 384).astype(np.float32)
         mock_embed.return_value = embeddings
         mock_norm.return_value = embeddings
 
@@ -585,26 +623,27 @@ def test_outliers_do_not_get_labeled(
         mock_outliers, mock_dupes, mock_suggest, mock_apply, mock_write, mock_create,
     ):
         """Items flagged as outliers should not receive label suggestions."""
+        n = MIN_SAMPLES_FOR_OUTLIER_DETECTION
         items = [
             TriageItem(
                 number=i, title=f"Item {i}", html_url=f"https://example.com/{i}",
                 is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}",
             )
-            for i in range(15)
+            for i in range(n)
         ]
         mock_fetch.return_value = items
         mock_labels.return_value = [
             RepoLabel(name="bug", description="Broken", text="bug: Broken"),
         ]
-        # Outlier detection flags items 0 and 5
-        mock_outliers.return_value = [0, 5]
+        # Outlier detection flags items 0 and 5 (now returns tuples with distances)
+        mock_outliers.return_value = [(0, 12.5), (5, 15.3)]
         # Every item gets a suggestion
-        mock_suggest.return_value = [[("bug", 0.85)] for _ in range(15)]
+        mock_suggest.return_value = [[("bug", 0.85)] for _ in range(n)]
 
-        embeddings = np.random.randn(15, 384).astype(np.float32)
+        embeddings = np.random.randn(n, 384).astype(np.float32)
         mock_embed.return_value = embeddings
         mock_norm.return_value = embeddings
-        mock_reduce.return_value = np.random.randn(15, 10).astype(np.float32)
+        mock_reduce.return_value = np.random.randn(n, 10).astype(np.float32)
 
         main()
 
@@ -612,5 +651,5 @@ def test_outliers_do_not_get_labeled(
         labeled_numbers = [call.args[0] for call in mock_apply.call_args_list]
         assert 0 not in labeled_numbers
         assert 5 not in labeled_numbers
-        # Other items should be labeled (13 items: 15 total - 2 outliers)
-        assert mock_apply.call_count == 13
+        # Other items should be labeled (n - 2 outliers)
+        assert mock_apply.call_count == n - 2
diff --git a/.github/workflows/triage-sweep.yml b/.github/workflows/triage-sweep.yml
index 9e48dd75d6..b3f4840ffd 100644
--- a/.github/workflows/triage-sweep.yml
+++ b/.github/workflows/triage-sweep.yml
@@ -3,13 +3,19 @@ name: Triage Sweep
 on:
   workflow_dispatch:
     inputs:
-      mahalanobis_threshold:
+      outlier_percentile:
         description: >-
-          Mahalanobis distance threshold for outlier detection.
-          Items with distance above this are flagged as potential spam/noise.
+          Chi2 percentile for dimension-aware outlier cutoff (0-1).
+          0.997 is the multivariate equivalent of 3-sigma.
           Lower = more aggressive flagging.
         type: number
-        default: 3.0
+        default: 0.997
+      contamination:
+        description: >-
+          Expected fraction of outliers in the data (0-0.5).
+          Controls how aggressively EllipticEnvelope downweights extremes.
+        type: number
+        default: 0.1
       cosine_threshold:
         description: >-
           Cosine similarity threshold for duplicate detection.
@@ -71,7 +77,8 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_REPOSITORY: ${{ github.repository }}
-          INPUT_MAHALANOBIS_THRESHOLD: ${{ inputs.mahalanobis_threshold }}
+          INPUT_OUTLIER_PERCENTILE: ${{ inputs.outlier_percentile }}
+          INPUT_CONTAMINATION: ${{ inputs.contamination }}
           INPUT_COSINE_THRESHOLD: ${{ inputs.cosine_threshold }}
           INPUT_MAX_ITEMS: ${{ inputs.max_items }}
           INPUT_DRY_RUN: ${{ inputs.dry_run }}