Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions .github/scripts/triage/embedding_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
from numpy.typing import NDArray
from fastembed import TextEmbedding
from scipy.stats import chi2
from sklearn.decomposition import PCA
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics.pairwise import cosine_similarity
Expand Down Expand Up @@ -53,15 +54,12 @@ def normalize_rows(matrix: NDArray[np.float32]) -> NDArray[np.float32]:

def reduce_dimensions(
matrix: NDArray[np.float32],
variance_ratio: float,
max_components: int,
) -> NDArray[np.float32]:
"""Reduce dimensionality via PCA.

Computes n_components = min(max_components, n-1, d). If n_components < 1,
returns the matrix unchanged. Logs explained variance for observability.
The variance_ratio parameter documents intent but is not strictly enforced;
the actual retained variance depends on the data and component cap.
"""
n, d = matrix.shape
if n <= 1:
Expand All @@ -80,25 +78,33 @@ def reduce_dimensions(

def detect_outliers(
    matrix: NDArray[np.float32],
    percentile: float = 0.997,
    contamination: float = 0.1,
) -> list[tuple[int, float]]:
    """Flag items whose Mahalanobis distance exceeds a dimension-aware cutoff.

    Uses EllipticEnvelope (robust covariance via MCD) to estimate the
    multivariate Gaussian, then computes sqrt(squared Mahalanobis distance)
    for each sample. The cutoff is sqrt(chi2.ppf(percentile, df=k)) with
    k = number of columns in ``matrix``, so it grows with dimensionality
    instead of being a fixed number.

    Args:
        matrix: 2-D array with one row per item and one column per feature.
        percentile: Chi-square quantile used to derive the cutoff. Must be
            strictly between 0 and 1.
        contamination: Forwarded unchanged to EllipticEnvelope.

    Returns:
        (index, distance) tuples for every flagged row, in ascending index
        order. Empty when ``matrix`` has fewer than two rows.

    Raises:
        ValueError: If ``percentile`` is not strictly between 0 and 1.
    """
    # Outside (0, 1) the chi2 quantile is NaN, 0 or inf, which would make the
    # comparison below flag nothing (or everything) without any error.
    if not 0.0 < percentile < 1.0:
        raise ValueError(
            f"percentile must be strictly between 0 and 1, got {percentile!r}"
        )

    n, k = matrix.shape
    if n < 2:
        return []

    envelope = EllipticEnvelope(contamination=contamination, random_state=42)
    envelope.fit(matrix)

    # .mahalanobis() returns squared Mahalanobis distances
    distances = np.sqrt(envelope.mahalanobis(matrix))

    # Dimension-aware cutoff: sqrt(chi2.ppf(percentile, df=k))
    cutoff = np.sqrt(chi2.ppf(percentile, df=k))

    # np.flatnonzero yields indices in ascending order; cast to builtin
    # types so callers get plain int/float rather than numpy scalars.
    flagged = np.flatnonzero(distances > cutoff)
    return [(int(idx), float(distances[idx])) for idx in flagged]


def find_duplicate_pairs(
Expand Down
1 change: 1 addition & 0 deletions .github/scripts/triage/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
fastembed>=0.5.0
numpy>=1.26.0
scikit-learn>=1.4.0
scipy>=1.10.0
83 changes: 45 additions & 38 deletions .github/scripts/triage/sweep.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,13 @@

# ── Thresholds (overridable via workflow_dispatch inputs) ──────────────

# Chi2 percentile for the dimension-aware outlier cutoff.
# 0.997 is the multivariate equivalent of the 3-sigma rule.
# `or` (rather than a .get() default) also covers a variable that is set
# but empty, which float() would otherwise reject.
OUTLIER_PERCENTILE: float = float(os.environ.get("INPUT_OUTLIER_PERCENTILE") or "0.997")

# EllipticEnvelope contamination: expected fraction of outliers in the data.
# Governs how aggressively the robust covariance downweights extreme points.
# Same empty-value fallback as above.
CONTAMINATION: float = float(os.environ.get("INPUT_CONTAMINATION") or "0.1")

# Cosine similarity above which two items are flagged as duplicates.
# 0.92 catches near-identical issues while tolerating paraphrasing.
Expand All @@ -42,18 +46,10 @@
# ── Fixed constants (not user-configurable) ───────────────────────────

# PCA: maximum number of components kept before outlier detection.
PCA_MAX_COMPONENTS: int = 33

# Minimum number of samples required for EllipticEnvelope to fit
# a Gaussian reliably. Must be >= 3 * PCA_MAX_COMPONENTS so the
# covariance matrix is estimated from enough data points.
MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 100

# GitHub REST API page size (max allowed is 100).
API_PAGE_SIZE: int = 100
Expand Down Expand Up @@ -152,19 +148,27 @@ class RepoLabel(TypedDict):


def fetch_repo_labels() -> list[RepoLabel]:
    """Fetch all labels from the repository, paginating if needed.

    Returns labels with name, description, and a text field suitable
    for embedding ("name: description"). Labels with no description
    use just the name.
    """
    labels: list[RepoLabel] = []
    page = 1

    while True:
        data = github_api_get(f"/labels?per_page={API_PAGE_SIZE}&page={page}")
        for raw in data:
            name = raw["name"]
            # A missing key and an explicit null both collapse to "".
            desc = raw.get("description") or ""
            text = f"{name}: {desc}" if desc else name
            labels.append(RepoLabel(name=name, description=desc, text=text))

        # A page shorter than the requested size is the last one.
        if len(data) < API_PAGE_SIZE:
            break
        page += 1

    return labels


Expand Down Expand Up @@ -199,7 +203,7 @@ def apply_labels_to_item(item_number: int, labels: list[str]) -> None:

def generate_report(
items: list[TriageItem],
outlier_indices: list[int],
outlier_results: list[tuple[int, float]],
duplicate_pairs: list[tuple[int, int, float]],
label_suggestions: list[list[tuple[str, float]]] | None = None,
) -> str:
Expand All @@ -212,24 +216,24 @@ def generate_report(
"",
f"**Run:** {now} UTC",
f"**Items analyzed:** {len(items)}",
f"**Thresholds:** Mahalanobis > {MAHALANOBIS_THRESHOLD}, Cosine > {COSINE_THRESHOLD}",
f"**Thresholds:** Outlier percentile {OUTLIER_PERCENTILE}, Cosine > {COSINE_THRESHOLD}",
"",
f"### Potential Outliers / Spam ({len(outlier_indices)})",
f"### Potential Outliers / Spam ({len(outlier_results)})",
"",
"Items with unusually high Mahalanobis distance from the distribution center.",
"These may be spam, off-topic, or poorly described.",
"",
]

if outlier_indices:
if outlier_results:
lines.append("| # | Type | Title | Distance |")
lines.append("|---|------|-------|----------|")
for idx in outlier_indices:
for idx, distance in outlier_results:
item = items[idx]
kind = "PR" if item["is_pr"] else "Issue"
lines.append(
f"| [#{item['number']}]({item['html_url']}) "
f"| {kind} | {item['title']} | flagged |"
f"| {kind} | {item['title']} | {distance:.2f} |"
)
else:
lines.append("None found.")
Expand Down Expand Up @@ -259,11 +263,12 @@ def generate_report(
lines.append("None found.")

# ── Label suggestions section ────────────────────────────────────
outlier_set = set(outlier_indices)
outlier_set = {idx for idx, _ in outlier_results}
if label_suggestions is not None:
# Only unlabeled, non-outlier items — spam shouldn't get categorized
# Show only the top-1 label to match what actually gets applied
items_with_suggestions = [
(i, sugs) for i, sugs in enumerate(label_suggestions)
(i, sugs[:1]) for i, sugs in enumerate(label_suggestions)
if sugs and not items[i]["labels"] and i not in outlier_set
]
lines.extend([
Expand All @@ -276,8 +281,8 @@ def generate_report(
])

if items_with_suggestions:
lines.append("| # | Type | Title | Suggested Labels |")
lines.append("|---|------|-------|-----------------|")
lines.append("| # | Type | Title | Suggested Label |")
lines.append("|---|------|-------|--------------------|")
for idx, sugs in items_with_suggestions:
item = items[idx]
kind = "PR" if item["is_pr"] else "Issue"
Expand All @@ -293,7 +298,7 @@ def generate_report(
"",
"### Summary",
"",
f"- {len(outlier_indices)} outliers flagged for review",
f"- {len(outlier_results)} outliers flagged for review",
f"- {len(duplicate_pairs)} duplicate pairs found",
f"- {len(items)} items analyzed in total",
])
Expand Down Expand Up @@ -385,10 +390,12 @@ def main() -> None:
embeddings = normalize_rows(embeddings)

# 6. Outlier detection (Mahalanobis via EllipticEnvelope)
outlier_indices: list[int] = []
outlier_results: list[tuple[int, float]] = []
if len(items) >= MIN_SAMPLES_FOR_OUTLIER_DETECTION:
reduced = reduce_dimensions(embeddings, PCA_VARIANCE_RATIO, PCA_MAX_COMPONENTS)
outlier_indices = detect_outliers(reduced, MAHALANOBIS_THRESHOLD)
reduced = reduce_dimensions(embeddings, PCA_MAX_COMPONENTS)
outlier_results = detect_outliers(
reduced, percentile=OUTLIER_PERCENTILE, contamination=CONTAMINATION,
)
else:
print(
f"Skipping outlier detection: {len(items)} items < "
Expand All @@ -411,7 +418,7 @@ def main() -> None:

# Apply top label to unlabeled items (unless dry run)
# Skip outliers — flagged items shouldn't get categorized
outlier_set = set(outlier_indices)
outlier_set = {idx for idx, _ in outlier_results}
if not DRY_RUN:
applied_count = 0
for i, sugs in enumerate(label_suggestions):
Expand All @@ -424,7 +431,7 @@ def main() -> None:
print("No repo labels found — skipping label suggestions")

# 9. Generate report
report = generate_report(items, outlier_indices, duplicate_pairs, label_suggestions)
report = generate_report(items, outlier_results, duplicate_pairs, label_suggestions)

# 10. Write report to file (for summary step)
write_report(report)
Expand Down
62 changes: 44 additions & 18 deletions .github/scripts/triage/test_embedding_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,34 +107,34 @@ class TestReduceDimensions:

def test_single_sample_returns_unchanged(self):
    """A one-row matrix comes back equal to the input."""
    # Seeded generator, matching the other tests in this class.
    rng = np.random.default_rng(42)
    m = rng.standard_normal((1, 50)).astype(np.float32)
    result = reduce_dimensions(m, 10)
    np.testing.assert_array_equal(result, m)

def test_reduces_dimensions(self):
    """100x50 input with a cap of 10 yields a 100x10 float32 result."""
    rng = np.random.default_rng(42)
    m = rng.standard_normal((100, 50)).astype(np.float32)
    result = reduce_dimensions(m, 10)
    assert result.shape == (100, 10)
    assert result.dtype == np.float32

def test_caps_at_n_minus_1(self):
    """With fewer samples than the cap, components are limited to n-1."""
    rng = np.random.default_rng(42)
    # 5 samples, 20 features -> max components = 4 (n-1)
    m = rng.standard_normal((5, 20)).astype(np.float32)
    result = reduce_dimensions(m, 50)
    assert result.shape == (5, 4)

def test_caps_at_d(self):
    """With fewer features than the cap, components are limited to d."""
    rng = np.random.default_rng(42)
    # 100 samples, 3 features -> max components = 3
    m = rng.standard_normal((100, 3)).astype(np.float32)
    result = reduce_dimensions(m, 50)
    assert result.shape == (100, 3)

def test_max_components_respected(self):
    """A cap below both n-1 and d determines the output width."""
    rng = np.random.default_rng(42)
    m = rng.standard_normal((50, 30)).astype(np.float32)
    result = reduce_dimensions(m, 5)
    assert result.shape[1] == 5


Expand All @@ -143,13 +143,13 @@ class TestDetectOutliers:

def test_single_sample_returns_empty(self):
    """A single-row matrix yields no outliers."""
    # Seeded generator, matching the other tests in this class.
    rng = np.random.default_rng(42)
    m = rng.standard_normal((1, 5)).astype(np.float32)
    result = detect_outliers(m, percentile=0.997)
    assert result == []

def test_empty_returns_empty(self):
    """A zero-row matrix yields no outliers."""
    # n < 2 case
    m = np.empty((0, 5), dtype=np.float32)
    result = detect_outliers(m, percentile=0.997)
    assert result == []

def test_finds_outliers_in_synthetic_data(self):
Expand All @@ -158,25 +158,51 @@ def test_finds_outliers_in_synthetic_data(self):
cluster = rng.standard_normal((50, 3)).astype(np.float32) * 0.1
outlier = np.array([[100.0, 100.0, 100.0]], dtype=np.float32)
m = np.vstack([cluster, outlier])
result = detect_outliers(m, 3.0)
result = detect_outliers(m, percentile=0.997)
# The outlier (index 50) should be detected
assert 50 in result
outlier_indices = [idx for idx, _ in result]
assert 50 in outlier_indices

def test_returns_list_of_index_distance_tuples(self):
    """Each result entry is an (int index, positive float distance) tuple."""
    rng = np.random.default_rng(42)
    # Tight cluster + outlier to guarantee at least one result
    cluster = rng.standard_normal((20, 3)).astype(np.float32) * 0.1
    far_point = np.array([[50.0, 50.0, 50.0]], dtype=np.float32)
    m = np.vstack([cluster, far_point])
    result = detect_outliers(m, percentile=0.997)
    assert isinstance(result, list)
    # Without this, an empty result would let the loop below pass vacuously.
    assert result, "expected the far point to be flagged"
    for item in result:
        assert isinstance(item, tuple)
        assert len(item) == 2
        idx, dist = item
        assert isinstance(idx, int)
        assert isinstance(dist, float)
        assert dist > 0

def test_low_percentile_flags_more(self):
    """A lower percentile never flags fewer items than a higher one."""
    rng = np.random.default_rng(42)
    m = rng.standard_normal((30, 3)).astype(np.float32)
    low = detect_outliers(m, percentile=0.5)
    high = detect_outliers(m, percentile=0.999)
    assert len(low) >= len(high)

def test_dimension_aware_cutoff(self):
    """High-dimensional data should not flag everything with default percentile."""
    generator = np.random.default_rng(42)
    # 500 samples, 10 dims — well-conditioned for robust covariance
    samples = generator.standard_normal((500, 10)).astype(np.float32)
    flagged = detect_outliers(samples, percentile=0.997)
    # With a proper dimension-aware cutoff on clean Gaussian data,
    # only a small fraction should be flagged (well under 50%)
    flagged_count = len(flagged)
    assert flagged_count < 250

def test_contamination_parameter(self):
    """A non-default contamination value is accepted and a list comes back."""
    generator = np.random.default_rng(42)
    samples = generator.standard_normal((50, 3)).astype(np.float32)
    # Should not raise with different contamination values
    flagged = detect_outliers(samples, percentile=0.997, contamination=0.05)
    assert isinstance(flagged, list)


class TestFindDuplicatePairs:
"""Tests for cosine similarity duplicate detection."""
Expand Down
Loading
Loading