From cc2b13e332d8eebd1757c6d144bdae01331a8407 Mon Sep 17 00:00:00 2001 From: Zander Raycraft Date: Sat, 21 Mar 2026 20:06:29 -0500 Subject: [PATCH] updated mahalanobis threshold to be multi-dim aware --- .github/scripts/triage/embedding_utils.py | 28 ++++--- .github/scripts/triage/requirements.txt | 1 + .github/scripts/triage/sweep.py | 83 ++++++++++--------- .../scripts/triage/test_embedding_utils.py | 62 ++++++++++---- .github/scripts/triage/test_sweep.py | 81 +++++++++++++----- .github/workflows/triage-sweep.yml | 17 ++-- 6 files changed, 179 insertions(+), 93 deletions(-) diff --git a/.github/scripts/triage/embedding_utils.py b/.github/scripts/triage/embedding_utils.py index 95e872de15..ede06d61cb 100644 --- a/.github/scripts/triage/embedding_utils.py +++ b/.github/scripts/triage/embedding_utils.py @@ -8,6 +8,7 @@ import numpy as np from numpy.typing import NDArray from fastembed import TextEmbedding +from scipy.stats import chi2 from sklearn.decomposition import PCA from sklearn.covariance import EllipticEnvelope from sklearn.metrics.pairwise import cosine_similarity @@ -53,15 +54,12 @@ def normalize_rows(matrix: NDArray[np.float32]) -> NDArray[np.float32]: def reduce_dimensions( matrix: NDArray[np.float32], - variance_ratio: float, max_components: int, ) -> NDArray[np.float32]: """Reduce dimensionality via PCA. Computes n_components = min(max_components, n-1, d). If n_components < 1, returns the matrix unchanged. Logs explained variance for observability. - The variance_ratio parameter documents intent but is not strictly enforced; - the actual retained variance depends on the data and component cap. """ n, d = matrix.shape if n <= 1: @@ -80,25 +78,33 @@ def reduce_dimensions( def detect_outliers( matrix: NDArray[np.float32], - threshold: float, -) -> list[int]: - """Flag items whose Mahalanobis distance exceeds the threshold. 
+ percentile: float = 0.997, + contamination: float = 0.1, +) -> list[tuple[int, float]]: + """Flag items whose Mahalanobis distance exceeds a dimension-aware cutoff. Uses EllipticEnvelope (robust covariance via MCD) to estimate the multivariate Gaussian, then computes sqrt(squared Mahalanobis distance) - for each sample. Returns indices of outliers sorted ascending. + for each sample. The cutoff is derived from the chi2 distribution with + k degrees of freedom (k = number of features), so it scales correctly + regardless of dimensionality. Returns (index, distance) tuples sorted + by index ascending. """ - n = matrix.shape[0] + n, k = matrix.shape if n < 2: return [] - envelope = EllipticEnvelope(contamination=0.1, random_state=42) + envelope = EllipticEnvelope(contamination=contamination, random_state=42) envelope.fit(matrix) # .mahalanobis() returns squared Mahalanobis distances distances = np.sqrt(envelope.mahalanobis(matrix)) - outlier_mask = distances > threshold - return list(np.where(outlier_mask)[0]) + + # Dimension-aware cutoff: sqrt(chi2.ppf(percentile, df=k)) + cutoff = np.sqrt(chi2.ppf(percentile, df=k)) + outlier_mask = distances > cutoff + indices = np.where(outlier_mask)[0] + return [(int(idx), float(distances[idx])) for idx in indices] def find_duplicate_pairs( diff --git a/.github/scripts/triage/requirements.txt b/.github/scripts/triage/requirements.txt index 82531ae4bf..40b48ddc85 100644 --- a/.github/scripts/triage/requirements.txt +++ b/.github/scripts/triage/requirements.txt @@ -1,3 +1,4 @@ fastembed>=0.5.0 numpy>=1.26.0 scikit-learn>=1.4.0 +scipy>=1.10.0 diff --git a/.github/scripts/triage/sweep.py b/.github/scripts/triage/sweep.py index f6cebc7f15..8956a72f15 100644 --- a/.github/scripts/triage/sweep.py +++ b/.github/scripts/triage/sweep.py @@ -25,9 +25,13 @@ # ── Thresholds (overridable via workflow_dispatch inputs) ────────────── -# Mahalanobis distance beyond which an item is flagged as an outlier. 
-# Default 3.0 ~ 99.7% of a Gaussian distribution (3-sigma rule). -MAHALANOBIS_THRESHOLD: float = float(os.environ.get("INPUT_MAHALANOBIS_THRESHOLD", "3.0")) +# Chi2 percentile for the dimension-aware outlier cutoff. +# 0.997 is the multivariate equivalent of the 3-sigma rule. +OUTLIER_PERCENTILE: float = float(os.environ.get("INPUT_OUTLIER_PERCENTILE", "0.997")) + +# EllipticEnvelope contamination: expected fraction of outliers in the data. +# Governs how aggressively the robust covariance downweights extreme points. +CONTAMINATION: float = float(os.environ.get("INPUT_CONTAMINATION", "0.1")) # Cosine similarity above which two items are flagged as duplicates. # 0.92 catches near-identical issues while tolerating paraphrasing. @@ -42,18 +46,10 @@ # ── Fixed constants (not user-configurable) ─────────────────────────── # Minimum number of samples required for EllipticEnvelope to fit -# a Gaussian. Below this, outlier detection is skipped because -# covariance estimation is unreliable. -MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 10 - -# PCA: retain components explaining this fraction of variance. -# 0.95 keeps 95% of information while reducing dimensionality enough -# for EllipticEnvelope to be numerically stable. -PCA_VARIANCE_RATIO: float = 0.95 - -# PCA: maximum number of components regardless of variance ratio. -# Caps dimensionality for EllipticEnvelope's n_samples > n_features^2 rule. -PCA_MAX_COMPONENTS: int = 50 +# a Gaussian reliably. Must be >= 3 * PCA_MAX_COMPONENTS so the +# covariance matrix is estimated from enough data points. +PCA_MAX_COMPONENTS: int = 33 +MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 100 # GitHub REST API page size (max allowed is 100). API_PAGE_SIZE: int = 100 @@ -152,19 +148,27 @@ class RepoLabel(TypedDict): def fetch_repo_labels() -> list[RepoLabel]: - """Fetch all labels from the repository. + """Fetch all labels from the repository, paginating if needed. 
Returns labels with name, description, and a text field suitable for embedding ("name: description"). Labels with no description use just the name. """ - data = github_api_get("/labels?per_page=100") labels: list[RepoLabel] = [] - for raw in data: - name = raw["name"] - desc = raw.get("description", "") or "" - text = f"{name}: {desc}" if desc else name - labels.append(RepoLabel(name=name, description=desc, text=text)) + page = 1 + + while True: + data = github_api_get(f"/labels?per_page={API_PAGE_SIZE}&page={page}") + for raw in data: + name = raw["name"] + desc = raw.get("description", "") or "" + text = f"{name}: {desc}" if desc else name + labels.append(RepoLabel(name=name, description=desc, text=text)) + + if len(data) < API_PAGE_SIZE: + break + page += 1 + return labels @@ -199,7 +203,7 @@ def apply_labels_to_item(item_number: int, labels: list[str]) -> None: def generate_report( items: list[TriageItem], - outlier_indices: list[int], + outlier_results: list[tuple[int, float]], duplicate_pairs: list[tuple[int, int, float]], label_suggestions: list[list[tuple[str, float]]] | None = None, ) -> str: @@ -212,24 +216,24 @@ def generate_report( "", f"**Run:** {now} UTC", f"**Items analyzed:** {len(items)}", - f"**Thresholds:** Mahalanobis > {MAHALANOBIS_THRESHOLD}, Cosine > {COSINE_THRESHOLD}", + f"**Thresholds:** Outlier percentile {OUTLIER_PERCENTILE}, Cosine > {COSINE_THRESHOLD}", "", - f"### Potential Outliers / Spam ({len(outlier_indices)})", + f"### Potential Outliers / Spam ({len(outlier_results)})", "", "Items with unusually high Mahalanobis distance from the distribution center.", "These may be spam, off-topic, or poorly described.", "", ] - if outlier_indices: + if outlier_results: lines.append("| # | Type | Title | Distance |") lines.append("|---|------|-------|----------|") - for idx in outlier_indices: + for idx, distance in outlier_results: item = items[idx] kind = "PR" if item["is_pr"] else "Issue" lines.append( f"| 
[#{item['number']}]({item['html_url']}) " - f"| {kind} | {item['title']} | flagged |" + f"| {kind} | {item['title']} | {distance:.2f} |" ) else: lines.append("None found.") @@ -259,11 +263,12 @@ def generate_report( lines.append("None found.") # ── Label suggestions section ──────────────────────────────────── - outlier_set = set(outlier_indices) + outlier_set = {idx for idx, _ in outlier_results} if label_suggestions is not None: # Only unlabeled, non-outlier items — spam shouldn't get categorized + # Show only the top-1 label to match what actually gets applied items_with_suggestions = [ - (i, sugs) for i, sugs in enumerate(label_suggestions) + (i, sugs[:1]) for i, sugs in enumerate(label_suggestions) if sugs and not items[i]["labels"] and i not in outlier_set ] lines.extend([ @@ -276,8 +281,8 @@ def generate_report( ]) if items_with_suggestions: - lines.append("| # | Type | Title | Suggested Labels |") - lines.append("|---|------|-------|-----------------|") + lines.append("| # | Type | Title | Suggested Label |") + lines.append("|---|------|-------|--------------------|") for idx, sugs in items_with_suggestions: item = items[idx] kind = "PR" if item["is_pr"] else "Issue" @@ -293,7 +298,7 @@ def generate_report( "", "### Summary", "", - f"- {len(outlier_indices)} outliers flagged for review", + f"- {len(outlier_results)} outliers flagged for review", f"- {len(duplicate_pairs)} duplicate pairs found", f"- {len(items)} items analyzed in total", ]) @@ -385,10 +390,12 @@ def main() -> None: embeddings = normalize_rows(embeddings) # 6. 
Outlier detection (Mahalanobis via EllipticEnvelope) - outlier_indices: list[int] = [] + outlier_results: list[tuple[int, float]] = [] if len(items) >= MIN_SAMPLES_FOR_OUTLIER_DETECTION: - reduced = reduce_dimensions(embeddings, PCA_VARIANCE_RATIO, PCA_MAX_COMPONENTS) - outlier_indices = detect_outliers(reduced, MAHALANOBIS_THRESHOLD) + reduced = reduce_dimensions(embeddings, PCA_MAX_COMPONENTS) + outlier_results = detect_outliers( + reduced, percentile=OUTLIER_PERCENTILE, contamination=CONTAMINATION, + ) else: print( f"Skipping outlier detection: {len(items)} items < " @@ -411,7 +418,7 @@ def main() -> None: # Apply top label to unlabeled items (unless dry run) # Skip outliers — flagged items shouldn't get categorized - outlier_set = set(outlier_indices) + outlier_set = {idx for idx, _ in outlier_results} if not DRY_RUN: applied_count = 0 for i, sugs in enumerate(label_suggestions): @@ -424,7 +431,7 @@ def main() -> None: print("No repo labels found — skipping label suggestions") # 9. Generate report - report = generate_report(items, outlier_indices, duplicate_pairs, label_suggestions) + report = generate_report(items, outlier_results, duplicate_pairs, label_suggestions) # 10. 
Write report to file (for summary step) write_report(report) diff --git a/.github/scripts/triage/test_embedding_utils.py b/.github/scripts/triage/test_embedding_utils.py index c192f63aba..6fbfe83afa 100644 --- a/.github/scripts/triage/test_embedding_utils.py +++ b/.github/scripts/triage/test_embedding_utils.py @@ -107,13 +107,13 @@ class TestReduceDimensions: def test_single_sample_returns_unchanged(self): m = np.random.randn(1, 50).astype(np.float32) - result = reduce_dimensions(m, 0.95, 10) + result = reduce_dimensions(m, 10) np.testing.assert_array_equal(result, m) def test_reduces_dimensions(self): rng = np.random.default_rng(42) m = rng.standard_normal((100, 50)).astype(np.float32) - result = reduce_dimensions(m, 0.95, 10) + result = reduce_dimensions(m, 10) assert result.shape == (100, 10) assert result.dtype == np.float32 @@ -121,20 +121,20 @@ def test_caps_at_n_minus_1(self): rng = np.random.default_rng(42) # 5 samples, 20 features -> max components = 4 (n-1) m = rng.standard_normal((5, 20)).astype(np.float32) - result = reduce_dimensions(m, 0.95, 50) + result = reduce_dimensions(m, 50) assert result.shape == (5, 4) def test_caps_at_d(self): rng = np.random.default_rng(42) # 100 samples, 3 features -> max components = 3 m = rng.standard_normal((100, 3)).astype(np.float32) - result = reduce_dimensions(m, 0.95, 50) + result = reduce_dimensions(m, 50) assert result.shape == (100, 3) def test_max_components_respected(self): rng = np.random.default_rng(42) m = rng.standard_normal((50, 30)).astype(np.float32) - result = reduce_dimensions(m, 0.95, 5) + result = reduce_dimensions(m, 5) assert result.shape[1] == 5 @@ -143,13 +143,13 @@ class TestDetectOutliers: def test_single_sample_returns_empty(self): m = np.random.randn(1, 5).astype(np.float32) - result = detect_outliers(m, 3.0) + result = detect_outliers(m, percentile=0.997) assert result == [] def test_empty_returns_empty(self): # n < 2 case m = np.empty((0, 5), dtype=np.float32) - result = detect_outliers(m, 
3.0) + result = detect_outliers(m, percentile=0.997) assert result == [] def test_finds_outliers_in_synthetic_data(self): @@ -158,25 +158,51 @@ def test_finds_outliers_in_synthetic_data(self): cluster = rng.standard_normal((50, 3)).astype(np.float32) * 0.1 outlier = np.array([[100.0, 100.0, 100.0]], dtype=np.float32) m = np.vstack([cluster, outlier]) - result = detect_outliers(m, 3.0) + result = detect_outliers(m, percentile=0.997) # The outlier (index 50) should be detected - assert 50 in result + outlier_indices = [idx for idx, _ in result] + assert 50 in outlier_indices - def test_returns_list_of_ints(self): + def test_returns_list_of_index_distance_tuples(self): rng = np.random.default_rng(42) - m = rng.standard_normal((20, 3)).astype(np.float32) - result = detect_outliers(m, 3.0) + # Tight cluster + outlier to guarantee at least one result + cluster = rng.standard_normal((20, 3)).astype(np.float32) * 0.1 + far_point = np.array([[50.0, 50.0, 50.0]], dtype=np.float32) + m = np.vstack([cluster, far_point]) + result = detect_outliers(m, percentile=0.997) assert isinstance(result, list) - for idx in result: - assert isinstance(idx, (int, np.integer)) - - def test_low_threshold_flags_more(self): + for item in result: + assert isinstance(item, tuple) + assert len(item) == 2 + idx, dist = item + assert isinstance(idx, int) + assert isinstance(dist, float) + assert dist > 0 + + def test_low_percentile_flags_more(self): rng = np.random.default_rng(42) m = rng.standard_normal((30, 3)).astype(np.float32) - low = detect_outliers(m, 1.0) - high = detect_outliers(m, 10.0) + low = detect_outliers(m, percentile=0.5) + high = detect_outliers(m, percentile=0.999) assert len(low) >= len(high) + def test_dimension_aware_cutoff(self): + """High-dimensional data should not flag everything with default percentile.""" + rng = np.random.default_rng(42) + # 500 samples, 10 dims — well-conditioned for robust covariance + m = rng.standard_normal((500, 10)).astype(np.float32) + result = 
detect_outliers(m, percentile=0.997) + # With a proper dimension-aware cutoff on clean Gaussian data, + # only a small fraction should be flagged (well under 50%) + assert len(result) < 250 + + def test_contamination_parameter(self): + rng = np.random.default_rng(42) + m = rng.standard_normal((50, 3)).astype(np.float32) + # Should not raise with different contamination values + result = detect_outliers(m, percentile=0.997, contamination=0.05) + assert isinstance(result, list) + class TestFindDuplicatePairs: """Tests for cosine similarity duplicate detection.""" diff --git a/.github/scripts/triage/test_sweep.py b/.github/scripts/triage/test_sweep.py index ea5e8cf7c9..8ab2c76ec9 100644 --- a/.github/scripts/triage/test_sweep.py +++ b/.github/scripts/triage/test_sweep.py @@ -33,6 +33,7 @@ REPORT_LABEL, API_PAGE_SIZE, MIN_SAMPLES_FOR_OUTLIER_DETECTION, + PCA_MAX_COMPONENTS, ) @@ -82,6 +83,17 @@ def test_http_error_exits(self, mock_urlopen): assert exc_info.value.code == 1 +class TestConstants: + """Tests for module-level constants.""" + + def test_min_samples_is_at_least_3x_pca_max(self): + """MIN_SAMPLES must be >= 3 * PCA_MAX_COMPONENTS for reliable covariance.""" + assert MIN_SAMPLES_FOR_OUTLIER_DETECTION >= 3 * PCA_MAX_COMPONENTS + + def test_min_samples_is_100(self): + assert MIN_SAMPLES_FOR_OUTLIER_DETECTION == 100 + + class TestFetchAllOpenItems: """Tests for fetch_all_open_items.""" @@ -164,7 +176,7 @@ def test_no_findings(self): assert "0 outliers flagged" in report assert "0 duplicate pairs found" in report - def test_with_outliers(self): + def test_with_outliers_shows_distance(self): items = [ TriageItem( number=10, title="Spam Issue", html_url="https://example.com/10", @@ -175,9 +187,10 @@ def test_with_outliers(self): is_pr=False, labels=[], created_at="2026-01-01", text="good", ), ] - report = generate_report(items, [0], []) + report = generate_report(items, [(0, 12.34)], []) assert "#10" in report assert "Spam Issue" in report + assert "12.34" in report 
assert "1 outliers flagged" in report def test_with_duplicates(self): @@ -204,7 +217,7 @@ def test_pr_type_label(self): is_pr=True, labels=[], created_at="2026-01-01", text="pr", ), ] - report = generate_report(items, [0], []) + report = generate_report(items, [(0, 8.5)], []) assert "| PR |" in report def test_footer_present(self): @@ -295,6 +308,27 @@ def test_null_description_handled(self, mock_get): labels = fetch_repo_labels() assert labels[0]["text"] == "wontfix" + @patch("sweep.API_PAGE_SIZE", 2) + @patch("sweep.github_api_get") + def test_label_pagination(self, mock_get): + """Repos with more labels than one page should fetch all pages.""" + mock_get.side_effect = [ + # First page: full (2 items = API_PAGE_SIZE) + [ + {"name": "bug", "description": "Broken"}, + {"name": "feature", "description": "New"}, + ], + # Second page: partial (1 item < API_PAGE_SIZE) -> stop + [ + {"name": "docs", "description": "Documentation"}, + ], + ] + labels = fetch_repo_labels() + assert len(labels) == 3 + assert mock_get.call_count == 2 + assert labels[0]["name"] == "bug" + assert labels[2]["name"] == "docs" + class TestApplyLabelsToItem: """Tests for apply_labels_to_item.""" @@ -332,7 +366,7 @@ def test_http_error_is_non_fatal(self, mock_urlopen): class TestGenerateReportWithLabels: """Tests for label suggestions in the report.""" - def test_report_includes_label_section(self): + def test_report_includes_label_section_with_top1_only(self): items = [ TriageItem( number=1, title="Fix crash", html_url="https://example.com/1", @@ -342,7 +376,9 @@ def test_report_includes_label_section(self): suggestions = [[("bug", 0.85), ("enhancement", 0.42)]] report = generate_report(items, [], [], label_suggestions=suggestions) assert "Suggested Labels" in report + # Only the top-1 label should appear in the report assert "`bug` (0.85)" in report + assert "`enhancement`" not in report assert "1 items suggested for labeling" in report def test_report_skips_already_labeled_items(self): @@ 
-369,11 +405,11 @@ def test_report_excludes_outliers_from_suggestions(self): ), ] suggestions = [[("bug", 0.85)], [("bug", 0.90)]] - # Item 0 is an outlier — should be excluded from label suggestions - report = generate_report(items, [0], [], label_suggestions=suggestions) + # Item 0 is an outlier (with distance) — should be excluded from label suggestions + report = generate_report(items, [(0, 15.2)], [], label_suggestions=suggestions) assert "1 unlabeled items" in report # only item 2 assert "#2" in report - # Item 1 (outlier) should NOT be in the suggestions table + # Item 0 (outlier) should NOT be in the suggestions table assert "Spam garbage" not in report.split("Suggested Labels")[1] def test_report_without_label_suggestions(self): @@ -425,22 +461,23 @@ def test_full_flow_with_enough_items( mock_outliers, mock_dupes, mock_suggest, mock_write, mock_create, ): """Test the full flow with >= MIN_SAMPLES items (outlier detection runs).""" + n = MIN_SAMPLES_FOR_OUTLIER_DETECTION items = [ TriageItem( number=i, title=f"Item {i}", html_url=f"https://example.com/{i}", is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}", ) - for i in range(15) + for i in range(n) ] mock_fetch.return_value = items mock_labels.return_value = [ RepoLabel(name="bug", description="Something broken", text="bug: Something broken"), ] - embeddings = np.random.randn(15, 384).astype(np.float32) + embeddings = np.random.randn(n, 384).astype(np.float32) mock_embed.return_value = embeddings mock_norm.return_value = embeddings - mock_reduce.return_value = np.random.randn(15, 10).astype(np.float32) + mock_reduce.return_value = np.random.randn(n, 10).astype(np.float32) main() @@ -470,17 +507,18 @@ def test_skips_outlier_detection_for_few_items( self, mock_fetch, mock_labels, mock_embed, mock_norm, mock_reduce, mock_outliers, mock_dupes, mock_suggest, mock_write, mock_create, ): - """With < MIN_SAMPLES items, outlier detection should be skipped.""" + """With < MIN_SAMPLES (100) items, 
outlier detection should be skipped.""" + n = MIN_SAMPLES_FOR_OUTLIER_DETECTION - 1 # 99 items = [ TriageItem( number=i, title=f"Item {i}", html_url=f"https://example.com/{i}", is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}", ) - for i in range(5) + for i in range(n) ] mock_fetch.return_value = items - embeddings = np.random.randn(5, 384).astype(np.float32) + embeddings = np.random.randn(n, 384).astype(np.float32) mock_embed.return_value = embeddings mock_norm.return_value = embeddings @@ -585,26 +623,27 @@ def test_outliers_do_not_get_labeled( mock_outliers, mock_dupes, mock_suggest, mock_apply, mock_write, mock_create, ): """Items flagged as outliers should not receive label suggestions.""" + n = MIN_SAMPLES_FOR_OUTLIER_DETECTION items = [ TriageItem( number=i, title=f"Item {i}", html_url=f"https://example.com/{i}", is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}", ) - for i in range(15) + for i in range(n) ] mock_fetch.return_value = items mock_labels.return_value = [ RepoLabel(name="bug", description="Broken", text="bug: Broken"), ] - # Outlier detection flags items 0 and 5 - mock_outliers.return_value = [0, 5] + # Outlier detection flags items 0 and 5 (now returns tuples with distances) + mock_outliers.return_value = [(0, 12.5), (5, 15.3)] # Every item gets a suggestion - mock_suggest.return_value = [[("bug", 0.85)] for _ in range(15)] + mock_suggest.return_value = [[("bug", 0.85)] for _ in range(n)] - embeddings = np.random.randn(15, 384).astype(np.float32) + embeddings = np.random.randn(n, 384).astype(np.float32) mock_embed.return_value = embeddings mock_norm.return_value = embeddings - mock_reduce.return_value = np.random.randn(15, 10).astype(np.float32) + mock_reduce.return_value = np.random.randn(n, 10).astype(np.float32) main() @@ -612,5 +651,5 @@ def test_outliers_do_not_get_labeled( labeled_numbers = [call.args[0] for call in mock_apply.call_args_list] assert 0 not in labeled_numbers assert 5 not in labeled_numbers - 
# Other items should be labeled (13 items: 15 total - 2 outliers) - assert mock_apply.call_count == 13 + # Other items should be labeled (n - 2 outliers) + assert mock_apply.call_count == n - 2 diff --git a/.github/workflows/triage-sweep.yml b/.github/workflows/triage-sweep.yml index 9e48dd75d6..b3f4840ffd 100644 --- a/.github/workflows/triage-sweep.yml +++ b/.github/workflows/triage-sweep.yml @@ -3,13 +3,19 @@ name: Triage Sweep on: workflow_dispatch: inputs: - mahalanobis_threshold: + outlier_percentile: description: >- - Mahalanobis distance threshold for outlier detection. - Items with distance above this are flagged as potential spam/noise. + Chi2 percentile for dimension-aware outlier cutoff (0-1). + 0.997 is the multivariate equivalent of 3-sigma. Lower = more aggressive flagging. type: number - default: 3.0 + default: 0.997 + contamination: + description: >- + Expected fraction of outliers in the data (0-0.5). + Controls how aggressively EllipticEnvelope downweights extremes. + type: number + default: 0.1 cosine_threshold: description: >- Cosine similarity threshold for duplicate detection. @@ -71,7 +77,8 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} - INPUT_MAHALANOBIS_THRESHOLD: ${{ inputs.mahalanobis_threshold }} + INPUT_OUTLIER_PERCENTILE: ${{ inputs.outlier_percentile }} + INPUT_CONTAMINATION: ${{ inputs.contamination }} INPUT_COSINE_THRESHOLD: ${{ inputs.cosine_threshold }} INPUT_MAX_ITEMS: ${{ inputs.max_items }} INPUT_DRY_RUN: ${{ inputs.dry_run }}