zander-raycraft · zander-raycraft · Mar 22, 2026 · Mar 22, 2026
diff --git a/.github/scripts/triage/embedding_utils.py b/.github/scripts/triage/embedding_utils.py
@@ -8,6 +8,7 @@
 import numpy as np
 from numpy.typing import NDArray
 from fastembed import TextEmbedding
+from scipy.stats import chi2
 from sklearn.decomposition import PCA
 from sklearn.covariance import EllipticEnvelope
 from sklearn.metrics.pairwise import cosine_similarity
@@ -53,15 +54,12 @@ def normalize_rows(matrix: NDArray[np.float32]) -> NDArray[np.float32]:
 
 def reduce_dimensions(
     matrix: NDArray[np.float32],
-    variance_ratio: float,
     max_components: int,
 ) -> NDArray[np.float32]:
     """Reduce dimensionality via PCA.
 
     Computes n_components = min(max_components, n-1, d). If n_components < 1,
     returns the matrix unchanged. Logs explained variance for observability.
-    The variance_ratio parameter documents intent but is not strictly enforced;
-    the actual retained variance depends on the data and component cap.
     """
     n, d = matrix.shape
     if n <= 1:
@@ -80,25 +78,33 @@ def reduce_dimensions(
 
 def detect_outliers(
     matrix: NDArray[np.float32],
-    threshold: float,
-) -> list[int]:
-    """Flag items whose Mahalanobis distance exceeds the threshold.
+    percentile: float = 0.997,
+    contamination: float = 0.1,
+) -> list[tuple[int, float]]:
+    """Flag items whose Mahalanobis distance exceeds a dimension-aware cutoff.
 
     Uses EllipticEnvelope (robust covariance via MCD) to estimate the
     multivariate Gaussian, then computes sqrt(squared Mahalanobis distance)
-    for each sample. Returns indices of outliers sorted ascending.
+    for each sample. The cutoff is derived from the chi2 distribution with
+    k degrees of freedom (k = number of features), so it scales correctly
+    regardless of dimensionality. Returns (index, distance) tuples sorted
+    by index ascending.
     """
-    n = matrix.shape[0]
+    n, k = matrix.shape
     if n < 2:
         return []
 
-    envelope = EllipticEnvelope(contamination=0.1, random_state=42)
+    envelope = EllipticEnvelope(contamination=contamination, random_state=42)
     envelope.fit(matrix)
 
     # .mahalanobis() returns squared Mahalanobis distances
     distances = np.sqrt(envelope.mahalanobis(matrix))
-    outlier_mask = distances > threshold
-    return list(np.where(outlier_mask)[0])
+
+    # Squared distance is ~chi2(k) for k-dim Gaussian data; cut at sqrt of its quantile
+    cutoff = np.sqrt(chi2.ppf(percentile, df=k))
+    outlier_mask = distances > cutoff
+    indices = np.where(outlier_mask)[0]
+    return [(int(idx), float(distances[idx])) for idx in indices]
 
 
 def find_duplicate_pairs(

diff --git a/.github/scripts/triage/requirements.txt b/.github/scripts/triage/requirements.txt
@@ -1,3 +1,4 @@
 fastembed>=0.5.0
 numpy>=1.26.0
 scikit-learn>=1.4.0
+scipy>=1.10.0
diff --git a/.github/scripts/triage/sweep.py b/.github/scripts/triage/sweep.py
@@ -25,9 +25,13 @@
 
 # ── Thresholds (overridable via workflow_dispatch inputs) ──────────────
 
-# Mahalanobis distance beyond which an item is flagged as an outlier.
-# Default 3.0 ~ 99.7% of a Gaussian distribution (3-sigma rule).
-MAHALANOBIS_THRESHOLD: float = float(os.environ.get("INPUT_MAHALANOBIS_THRESHOLD", "3.0"))
+# Chi2 percentile for the dimension-aware outlier cutoff.
+# 0.997 mirrors the ~99.7% coverage of the univariate 3-sigma rule.
+OUTLIER_PERCENTILE: float = float(os.environ.get("INPUT_OUTLIER_PERCENTILE", "0.997"))
+
+# EllipticEnvelope contamination: expected fraction of outliers in the data.
+# NOTE(review): sklearn seems to use it only for predict()'s offset_, not the MCD fit.
+CONTAMINATION: float = float(os.environ.get("INPUT_CONTAMINATION", "0.1"))
 
 # Cosine similarity above which two items are flagged as duplicates.
 # 0.92 catches near-identical issues while tolerating paraphrasing.
@@ -42,18 +46,10 @@
 # ── Fixed constants (not user-configurable) ───────────────────────────
 
 # Minimum number of samples required for EllipticEnvelope to fit
-# a Gaussian. Below this, outlier detection is skipped because
-# covariance estimation is unreliable.
-MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 10
-
-# PCA: retain components explaining this fraction of variance.
-# 0.95 keeps 95% of information while reducing dimensionality enough
-# for EllipticEnvelope to be numerically stable.
-PCA_VARIANCE_RATIO: float = 0.95
-
-# PCA: maximum number of components regardless of variance ratio.
-# Caps dimensionality for EllipticEnvelope's n_samples > n_features^2 rule.
-PCA_MAX_COMPONENTS: int = 50
+# a Gaussian reliably. Must be >= 3 * PCA_MAX_COMPONENTS (the PCA component
+# cap set on the next line) so the covariance fit sees enough data points.
+PCA_MAX_COMPONENTS: int = 33
+MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 100
 
 # GitHub REST API page size (max allowed is 100).
 API_PAGE_SIZE: int = 100
@@ -152,19 +148,27 @@ class RepoLabel(TypedDict):
 
 
 def fetch_repo_labels() -> list[RepoLabel]:
-    """Fetch all labels from the repository.
+    """Fetch all labels from the repository, paginating if needed.
 
     Returns labels with name, description, and a text field suitable
     for embedding ("name: description"). Labels with no description
     use just the name.
     """
-    data = github_api_get("/labels?per_page=100")
     labels: list[RepoLabel] = []
-    for raw in data:
-        name = raw["name"]
-        desc = raw.get("description", "") or ""
-        text = f"{name}: {desc}" if desc else name
-        labels.append(RepoLabel(name=name, description=desc, text=text))
+    page = 1
+
+    while True:
+        data = github_api_get(f"/labels?per_page={API_PAGE_SIZE}&page={page}")
+        for raw in data:
+            name = raw["name"]
+            desc = raw.get("description", "") or ""
+            text = f"{name}: {desc}" if desc else name
+            labels.append(RepoLabel(name=name, description=desc, text=text))
+
+        if len(data) < API_PAGE_SIZE:
+            break
+        page += 1
+
     return labels
 
 
@@ -199,7 +203,7 @@ def apply_labels_to_item(item_number: int, labels: list[str]) -> None:
 
 def generate_report(
     items: list[TriageItem],
-    outlier_indices: list[int],
+    outlier_results: list[tuple[int, float]],
     duplicate_pairs: list[tuple[int, int, float]],
     label_suggestions: list[list[tuple[str, float]]] | None = None,
 ) -> str:
@@ -212,24 +216,24 @@ def generate_report(
         "",
         f"**Run:** {now} UTC",
         f"**Items analyzed:** {len(items)}",
-        f"**Thresholds:** Mahalanobis > {MAHALANOBIS_THRESHOLD}, Cosine > {COSINE_THRESHOLD}",
+        f"**Thresholds:** Outlier percentile {OUTLIER_PERCENTILE}, Cosine > {COSINE_THRESHOLD}",
         "",
-        f"### Potential Outliers / Spam ({len(outlier_indices)})",
+        f"### Potential Outliers / Spam ({len(outlier_results)})",
         "",
         "Items with unusually high Mahalanobis distance from the distribution center.",
         "These may be spam, off-topic, or poorly described.",
         "",
     ]
 
-    if outlier_indices:
+    if outlier_results:
         lines.append("| # | Type | Title | Distance |")
         lines.append("|---|------|-------|----------|")
-        for idx in outlier_indices:
+        for idx, distance in outlier_results:
             item = items[idx]
             kind = "PR" if item["is_pr"] else "Issue"
             lines.append(
                 f"| [#{item['number']}]({item['html_url']}) "
-                f"| {kind} | {item['title']} | flagged |"
+                f"| {kind} | {item['title']} | {distance:.2f} |"
             )
     else:
         lines.append("None found.")
@@ -259,11 +263,12 @@ def generate_report(
         lines.append("None found.")
 
     # ── Label suggestions section ────────────────────────────────────
-    outlier_set = set(outlier_indices)
+    outlier_set = {idx for idx, _ in outlier_results}
     if label_suggestions is not None:
         # Only unlabeled, non-outlier items — spam shouldn't get categorized
+        # Show only the top-1 label to match what actually gets applied
         items_with_suggestions = [
-            (i, sugs) for i, sugs in enumerate(label_suggestions)
+            (i, sugs[:1]) for i, sugs in enumerate(label_suggestions)
             if sugs and not items[i]["labels"] and i not in outlier_set
         ]
         lines.extend([
@@ -276,8 +281,8 @@ def generate_report(
         ])
 
         if items_with_suggestions:
-            lines.append("| # | Type | Title | Suggested Labels |")
-            lines.append("|---|------|-------|-----------------|")
+            lines.append("| # | Type | Title | Suggested Label |")
+            lines.append("|---|------|-------|--------------------|")
             for idx, sugs in items_with_suggestions:
                 item = items[idx]
                 kind = "PR" if item["is_pr"] else "Issue"
@@ -293,7 +298,7 @@ def generate_report(
         "",
         "### Summary",
         "",
-        f"- {len(outlier_indices)} outliers flagged for review",
+        f"- {len(outlier_results)} outliers flagged for review",
         f"- {len(duplicate_pairs)} duplicate pairs found",
         f"- {len(items)} items analyzed in total",
     ])
@@ -385,10 +390,12 @@ def main() -> None:
     embeddings = normalize_rows(embeddings)
 
     # 6. Outlier detection (Mahalanobis via EllipticEnvelope)
-    outlier_indices: list[int] = []
+    outlier_results: list[tuple[int, float]] = []
     if len(items) >= MIN_SAMPLES_FOR_OUTLIER_DETECTION:
-        reduced = reduce_dimensions(embeddings, PCA_VARIANCE_RATIO, PCA_MAX_COMPONENTS)
-        outlier_indices = detect_outliers(reduced, MAHALANOBIS_THRESHOLD)
+        reduced = reduce_dimensions(embeddings, PCA_MAX_COMPONENTS)
+        outlier_results = detect_outliers(
+            reduced, percentile=OUTLIER_PERCENTILE, contamination=CONTAMINATION,
+        )
     else:
         print(
             f"Skipping outlier detection: {len(items)} items < "
@@ -411,7 +418,7 @@ def main() -> None:
 
         # Apply top label to unlabeled items (unless dry run)
         # Skip outliers — flagged items shouldn't get categorized
-        outlier_set = set(outlier_indices)
+        outlier_set = {idx for idx, _ in outlier_results}
         if not DRY_RUN:
             applied_count = 0
             for i, sugs in enumerate(label_suggestions):
@@ -424,7 +431,7 @@ def main() -> None:
         print("No repo labels found — skipping label suggestions")
 
     # 9. Generate report
-    report = generate_report(items, outlier_indices, duplicate_pairs, label_suggestions)
+    report = generate_report(items, outlier_results, duplicate_pairs, label_suggestions)
 
     # 10. Write report to file (for summary step)
     write_report(report)

diff --git a/.github/scripts/triage/test_embedding_utils.py b/.github/scripts/triage/test_embedding_utils.py
@@ -107,34 +107,34 @@ class TestReduceDimensions:
 
     def test_single_sample_returns_unchanged(self):
         m = np.random.randn(1, 50).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 10)
+        result = reduce_dimensions(m, 10)
         np.testing.assert_array_equal(result, m)
 
     def test_reduces_dimensions(self):
         rng = np.random.default_rng(42)
         m = rng.standard_normal((100, 50)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 10)
+        result = reduce_dimensions(m, 10)
         assert result.shape == (100, 10)
         assert result.dtype == np.float32
 
     def test_caps_at_n_minus_1(self):
         rng = np.random.default_rng(42)
         # 5 samples, 20 features -> max components = 4 (n-1)
         m = rng.standard_normal((5, 20)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 50)
+        result = reduce_dimensions(m, 50)
         assert result.shape == (5, 4)
 
     def test_caps_at_d(self):
         rng = np.random.default_rng(42)
         # 100 samples, 3 features -> max components = 3
         m = rng.standard_normal((100, 3)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 50)
+        result = reduce_dimensions(m, 50)
         assert result.shape == (100, 3)
 
     def test_max_components_respected(self):
         rng = np.random.default_rng(42)
         m = rng.standard_normal((50, 30)).astype(np.float32)
-        result = reduce_dimensions(m, 0.95, 5)
+        result = reduce_dimensions(m, 5)
         assert result.shape[1] == 5
 
 
@@ -143,13 +143,13 @@ class TestDetectOutliers:
 
     def test_single_sample_returns_empty(self):
         m = np.random.randn(1, 5).astype(np.float32)
-        result = detect_outliers(m, 3.0)
+        result = detect_outliers(m, percentile=0.997)
         assert result == []
 
     def test_empty_returns_empty(self):
         # n < 2 case
         m = np.empty((0, 5), dtype=np.float32)
-        result = detect_outliers(m, 3.0)
+        result = detect_outliers(m, percentile=0.997)
         assert result == []
 
     def test_finds_outliers_in_synthetic_data(self):
@@ -158,25 +158,51 @@ def test_finds_outliers_in_synthetic_data(self):
         cluster = rng.standard_normal((50, 3)).astype(np.float32) * 0.1
         outlier = np.array([[100.0, 100.0, 100.0]], dtype=np.float32)
         m = np.vstack([cluster, outlier])
-        result = detect_outliers(m, 3.0)
+        result = detect_outliers(m, percentile=0.997)
         # The outlier (index 50) should be detected
-        assert 50 in result
+        outlier_indices = [idx for idx, _ in result]
+        assert 50 in outlier_indices
 
-    def test_returns_list_of_ints(self):
+    def test_returns_list_of_index_distance_tuples(self):
         rng = np.random.default_rng(42)
-        m = rng.standard_normal((20, 3)).astype(np.float32)
-        result = detect_outliers(m, 3.0)
+        # Tight cluster + outlier to guarantee at least one result
+        cluster = rng.standard_normal((20, 3)).astype(np.float32) * 0.1
+        far_point = np.array([[50.0, 50.0, 50.0]], dtype=np.float32)
+        m = np.vstack([cluster, far_point])
+        result = detect_outliers(m, percentile=0.997)
         assert isinstance(result, list)
-        for idx in result:
-            assert isinstance(idx, (int, np.integer))
-
-    def test_low_threshold_flags_more(self):
+        for item in result:
+            assert isinstance(item, tuple)
+            assert len(item) == 2
+            idx, dist = item
+            assert isinstance(idx, int)
+            assert isinstance(dist, float)
+            assert dist > 0
+
+    def test_low_percentile_flags_more(self):
         rng = np.random.default_rng(42)
         m = rng.standard_normal((30, 3)).astype(np.float32)
-        low = detect_outliers(m, 1.0)
-        high = detect_outliers(m, 10.0)
+        low = detect_outliers(m, percentile=0.5)
+        high = detect_outliers(m, percentile=0.999)
         assert len(low) >= len(high)
 
+    def test_dimension_aware_cutoff(self):
+        """High-dimensional data should not flag everything with default percentile."""
+        rng = np.random.default_rng(42)
+        # 500 samples, 10 dims — well-conditioned for robust covariance
+        m = rng.standard_normal((500, 10)).astype(np.float32)
+        result = detect_outliers(m, percentile=0.997)
+        # With a proper dimension-aware cutoff on clean Gaussian data,
+        # only a small fraction should be flagged (well under 50%)
+        assert len(result) < 250
+
+    def test_contamination_parameter(self):
+        rng = np.random.default_rng(42)
+        m = rng.standard_normal((50, 3)).astype(np.float32)
+        # Should not raise with different contamination values
+        result = detect_outliers(m, percentile=0.997, contamination=0.05)
+        assert isinstance(result, list)
+
 
 class TestFindDuplicatePairs:
     """Tests for cosine similarity duplicate detection."""