From 4031562f6b24d1fafada843efa39be20da3e6c84 Mon Sep 17 00:00:00 2001 From: Zander Raycraft Date: Sat, 21 Mar 2026 23:21:23 -0500 Subject: [PATCH] redoing auto labeling using z score instead of clustering lowkey the method --- .github/scripts/triage/embedding_utils.py | 131 ++++++-- .github/scripts/triage/sweep.py | 229 +++++++++++--- .../scripts/triage/test_embedding_utils.py | 233 ++++++++++---- .github/scripts/triage/test_sweep.py | 286 +++++++++++++++--- .github/workflows/triage-sweep.yml | 23 +- 5 files changed, 727 insertions(+), 175 deletions(-) diff --git a/.github/scripts/triage/embedding_utils.py b/.github/scripts/triage/embedding_utils.py index ede06d61cb..a93b063ec5 100644 --- a/.github/scripts/triage/embedding_utils.py +++ b/.github/scripts/triage/embedding_utils.py @@ -8,7 +8,6 @@ import numpy as np from numpy.typing import NDArray from fastembed import TextEmbedding -from scipy.stats import chi2 from sklearn.decomposition import PCA from sklearn.covariance import EllipticEnvelope from sklearn.metrics.pairwise import cosine_similarity @@ -78,19 +77,25 @@ def reduce_dimensions( def detect_outliers( matrix: NDArray[np.float32], - percentile: float = 0.997, contamination: float = 0.1, + iqr_multiplier: float = 3.0, + max_outlier_pct: float = 0.05, ) -> list[tuple[int, float]]: - """Flag items whose Mahalanobis distance exceeds a dimension-aware cutoff. + """Flag items whose Mahalanobis distance exceeds an IQR-based cutoff. Uses EllipticEnvelope (robust covariance via MCD) to estimate the multivariate Gaussian, then computes sqrt(squared Mahalanobis distance) - for each sample. The cutoff is derived from the chi2 distribution with - k degrees of freedom (k = number of features), so it scales correctly - regardless of dimensionality. Returns (index, distance) tuples sorted - by index ascending. + for each sample. The cutoff is Q75 + iqr_multiplier * IQR, which + adapts to the actual distribution of distances. 
+ + A hard cap ensures no more than max_outlier_pct * n items are flagged; + when the cap is hit, only the most extreme items (sorted by distance + descending) are kept. + + Returns (index, distance) tuples sorted by index ascending, along with + the cutoff value stored as an attribute on the returned list. """ - n, k = matrix.shape + n = matrix.shape[0] if n < 2: return [] @@ -100,11 +105,34 @@ def detect_outliers( # .mahalanobis() returns squared Mahalanobis distances distances = np.sqrt(envelope.mahalanobis(matrix)) - # Dimension-aware cutoff: sqrt(chi2.ppf(percentile, df=k)) - cutoff = np.sqrt(chi2.ppf(percentile, df=k)) + # IQR-based cutoff + q25, q75 = np.percentile(distances, [25, 75]) + iqr = q75 - q25 + cutoff = q75 + iqr_multiplier * iqr + outlier_mask = distances > cutoff indices = np.where(outlier_mask)[0] - return [(int(idx), float(distances[idx])) for idx in indices] + + # Hard cap: keep at most max_outlier_pct * n items + max_count = max(1, int(max_outlier_pct * n)) + if len(indices) > max_count: + # Sort by distance descending, take the most extreme + sorted_by_dist = sorted(indices, key=lambda i: distances[i], reverse=True) + indices = np.array(sorted_by_dist[:max_count]) + + # Sort by index ascending for stable output + indices = np.sort(indices) + result = [(int(idx), float(distances[idx])) for idx in indices] + + # Attach cutoff as metadata so the report can use it + result = _OutlierResult(result) # type: ignore[assignment] + result.cutoff = float(cutoff) # type: ignore[attr-defined] + return result # type: ignore[return-value] + + +class _OutlierResult(list): + """A list subclass that carries metadata (cutoff) from outlier detection.""" + cutoff: float = 0.0 def find_duplicate_pairs( @@ -134,11 +162,22 @@ def find_duplicate_pairs( return pairs -# ── Label suggestion via embedding similarity ──────────────────────── +# ── Label suggestion via z-score normalized embedding similarity ────── + +# Z-score threshold: a label must be this many 
standard deviations above +# the column mean to be considered a match. +LABEL_Z_THRESHOLD: float = 1.5 -# Minimum similarity between an item and a label to suggest it. -# 0.4 is intentionally permissive — the report is for human review. -LABEL_SIMILARITY_THRESHOLD: float = 0.4 +# Margin gate: the top-1 label must beat the second-best by this many +# z-score units to be accepted (subsequent labels don't need a margin). +LABEL_Z_MARGIN: float = 0.5 + +# Floor for per-column standard deviation to avoid division by near-zero. +LABEL_Z_STD_FLOOR: float = 0.01 + +# Minimum raw cosine similarity required even if z-score is high. +# Prevents suggesting labels that are "relatively best" but still poor. +MIN_RAW_SIMILARITY: float = 0.3 # Maximum number of labels to suggest per item. MAX_LABELS_PER_ITEM: int = 3 @@ -148,37 +187,71 @@ def suggest_labels( item_embeddings: NDArray[np.float32], label_embeddings: NDArray[np.float32], label_names: list[str], - threshold: float = LABEL_SIMILARITY_THRESHOLD, + z_threshold: float = LABEL_Z_THRESHOLD, + z_margin: float = LABEL_Z_MARGIN, + std_floor: float = LABEL_Z_STD_FLOOR, + min_raw_sim: float = MIN_RAW_SIMILARITY, max_per_item: int = MAX_LABELS_PER_ITEM, ) -> list[list[tuple[str, float]]]: - """Suggest labels for each item based on embedding similarity. + """Suggest labels for each item using z-score normalized similarity. - Computes cosine similarity between item embeddings (n, 384) and - label embeddings (m, 384). For each item, returns the top-k labels - whose similarity exceeds the threshold, sorted by similarity descending. + 1. Compute raw cosine similarity matrix (n items x m labels). + 2. Column-wise z-score: for each label j, normalize across all items. + 3. For each item, rank labels by z-score descending. + 4. Accept a label only if z >= z_threshold AND raw_sim >= min_raw_sim. + 5. Margin gate: the top-1 label must beat #2 by z_margin; subsequent + labels don't need a margin. + 6. Cap at max_per_item. 
Returns a list of length n, where each element is a list of - (label_name, similarity) tuples. Empty list if no label exceeds threshold. + (label_name, raw_similarity) tuples. Empty list if nothing qualifies. """ n = item_embeddings.shape[0] m = label_embeddings.shape[0] if n == 0 or m == 0: return [[] for _ in range(n)] - # (n, m) similarity matrix: each row is one item vs all labels + # (n, m) raw similarity matrix sim_matrix = cosine_similarity(item_embeddings, label_embeddings) + # Column-wise z-score normalization + col_means = sim_matrix.mean(axis=0) # shape (m,) + col_stds = sim_matrix.std(axis=0) # shape (m,) + col_stds = np.maximum(col_stds, std_floor) + z_matrix = (sim_matrix - col_means) / col_stds + suggestions: list[list[tuple[str, float]]] = [] for i in range(n): - row = sim_matrix[i] - # Indices sorted by similarity descending - ranked = np.argsort(row)[::-1] + z_row = z_matrix[i] + raw_row = sim_matrix[i] + + # Rank labels by z-score descending + ranked = np.argsort(z_row)[::-1] + item_labels: list[tuple[str, float]] = [] - for idx in ranked[:max_per_item]: - score = float(row[idx]) - if score < threshold: + + # Margin gate: top-1 z-score must beat #2 by z_margin. + # If not, the assignment is ambiguous — skip this item entirely. 
+ if len(ranked) > 1: + top1_z = float(z_row[ranked[0]]) + top2_z = float(z_row[ranked[1]]) + if top1_z - top2_z < z_margin: + suggestions.append(item_labels) + continue + + for rank_pos, idx in enumerate(ranked): + if len(item_labels) >= max_per_item: break - item_labels.append((label_names[idx], score)) + + z_val = float(z_row[idx]) + raw_val = float(raw_row[idx]) + + # Must pass both z-threshold and raw similarity floor + if z_val < z_threshold or raw_val < min_raw_sim: + continue + + item_labels.append((label_names[idx], raw_val)) + suggestions.append(item_labels) return suggestions diff --git a/.github/scripts/triage/sweep.py b/.github/scripts/triage/sweep.py index 8e0f70d6d7..97892300d1 100644 --- a/.github/scripts/triage/sweep.py +++ b/.github/scripts/triage/sweep.py @@ -21,13 +21,20 @@ detect_outliers, find_duplicate_pairs, suggest_labels, + LABEL_Z_THRESHOLD, + LABEL_Z_MARGIN, + LABEL_Z_STD_FLOOR, + MIN_RAW_SIMILARITY, + MAX_LABELS_PER_ITEM, ) # ── Thresholds (overridable via workflow_dispatch inputs) ────────────── -# Chi2 percentile for the dimension-aware outlier cutoff. -# 0.997 is the multivariate equivalent of the 3-sigma rule. -OUTLIER_PERCENTILE: float = float(os.environ.get("INPUT_OUTLIER_PERCENTILE", "0.997")) +# IQR multiplier for outlier cutoff: cutoff = Q75 + IQR_MULTIPLIER * IQR. +IQR_MULTIPLIER: float = float(os.environ.get("INPUT_IQR_MULTIPLIER", "3.0")) + +# Hard cap: at most this fraction of items can be flagged as outliers. +MAX_OUTLIER_PCT: float = float(os.environ.get("INPUT_MAX_OUTLIER_PCT", "0.05")) # EllipticEnvelope contamination: expected fraction of outliers in the data. # Governs how aggressively the robust covariance downweights extreme points. @@ -48,7 +55,7 @@ # Minimum number of samples required for EllipticEnvelope to fit # a Gaussian reliably. Must be >= 3 * PCA_MAX_COMPONENTS so the # covariance matrix is estimated from enough data points. 
-PCA_MAX_COMPONENTS: int = 33 +PCA_MAX_COMPONENTS: int = 20 MIN_SAMPLES_FOR_OUTLIER_DETECTION: int = 100 # Max character length for embedding input text. bge-small-en-v1.5 has a @@ -211,6 +218,40 @@ def apply_labels_to_item(item_number: int, labels: list[str]) -> None: print(f"::warning::Failed to label #{item_number}: {e.code} {body}") +def _item_age(created_at: str) -> str: + """Compute a human-readable age string from an ISO 8601 created_at timestamp.""" + try: + created = datetime.fromisoformat(created_at.replace("Z", "+00:00")) + delta = datetime.now(timezone.utc) - created + days = delta.days + if days < 1: + return "<1d" + if days < 30: + return f"{days}d" + if days < 365: + return f"{days // 30}mo" + return f"{days // 365}y" + except (ValueError, TypeError): + return "?" + + +def _suggested_action(a: TriageItem, b: TriageItem) -> str: + """Determine a suggested action for a duplicate pair based on types and age.""" + if a["is_pr"] and b["is_pr"]: + return "Review for overlap" + if not a["is_pr"] and not b["is_pr"]: + # Both issues — close the newer one + try: + a_dt = datetime.fromisoformat(a["created_at"].replace("Z", "+00:00")) + b_dt = datetime.fromisoformat(b["created_at"].replace("Z", "+00:00")) + newer = b if b_dt > a_dt else a + except (ValueError, TypeError): + newer = b + return f"Close #{newer['number']} as duplicate" + # One issue, one PR + return "Link PR to issue" + + def generate_report( items: list[TriageItem], outlier_results: list[tuple[int, float]], @@ -221,33 +262,89 @@ def generate_report( now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") repo = os.environ.get("GITHUB_REPOSITORY", "unknown/repo") + # Compute label suggestion counts early for the health table + outlier_set = {idx for idx, _ in outlier_results} + suggested_count = 0 + if label_suggestions is not None: + suggested_count = sum( + 1 for i, s in enumerate(label_suggestions) + if s and not items[i]["labels"] and i not in outlier_set + ) + + # ── Health summary 
table at the top ────────────────────────────── lines: list[str] = [ "## Triage Sweep Report", "", f"**Run:** {now} UTC", f"**Items analyzed:** {len(items)}", - f"**Thresholds:** Outlier percentile {OUTLIER_PERCENTILE}, Cosine > {COSINE_THRESHOLD}", + f"**Thresholds:** IQR multiplier {IQR_MULTIPLIER}, Cosine > {COSINE_THRESHOLD}", + "", + "### Health Summary", "", + "| Metric | Value |", + "|--------|-------|", + f"| Items analyzed | {len(items)} |", + f"| Outliers flagged | {len(outlier_results)} |", + f"| Duplicate pairs | {len(duplicate_pairs)} |", + f"| Label suggestions | {suggested_count} |", + "", + ] + + # ── Outlier section ────────────────────────────────────────────── + # Determine cutoff for high-confidence split + cutoff = getattr(outlier_results, "cutoff", 0.0) + high_conf_cutoff = 2 * cutoff if cutoff > 0 else float("inf") + + high_conf = [(idx, d) for idx, d in outlier_results if d > high_conf_cutoff] + borderline = [(idx, d) for idx, d in outlier_results if d <= high_conf_cutoff] + + lines.extend([ f"### Potential Outliers / Spam ({len(outlier_results)})", "", "Items with unusually high Mahalanobis distance from the distribution center.", "These may be spam, off-topic, or poorly described.", "", - ] + ]) - if outlier_results: - lines.append("| # | Type | Title | Distance |") - lines.append("|---|------|-------|----------|") - for idx, distance in outlier_results: + if high_conf: + lines.append(f"**High Confidence** ({len(high_conf)} items, distance > 2x cutoff)") + lines.append("") + lines.append("| # | Type | Title | Distance | Age |") + lines.append("|---|------|-------|----------|-----|") + for idx, distance in high_conf: item = items[idx] kind = "PR" if item["is_pr"] else "Issue" + age = _item_age(item["created_at"]) + title = item["title"][:80] + ("..." 
if len(item["title"]) > 80 else "") lines.append( f"| [#{item['number']}]({item['html_url']}) " - f"| {kind} | {item['title']} | {distance:.2f} |" + f"| {kind} | {title} | {distance:.2f} | {age} |" ) - else: + lines.append("") + + if borderline: + lines.append("
") + lines.append(f"Borderline ({len(borderline)} items)") + lines.append("") + lines.append("| # | Type | Title | Distance | Age |") + lines.append("|---|------|-------|----------|-----|") + for idx, distance in borderline: + item = items[idx] + kind = "PR" if item["is_pr"] else "Issue" + age = _item_age(item["created_at"]) + title = item["title"][:80] + ("..." if len(item["title"]) > 80 else "") + lines.append( + f"| [#{item['number']}]({item['html_url']}) " + f"| {kind} | {title} | {distance:.2f} | {age} |" + ) + lines.append("") + lines.append("
") + lines.append("") + + if not outlier_results: lines.append("None found.") + # ── Duplicate pairs section ────────────────────────────────────── lines.extend([ "", f"### Potential Duplicates ({len(duplicate_pairs)} pairs)", @@ -257,43 +354,82 @@ def generate_report( ]) if duplicate_pairs: - lines.append("| Item A | Item B | Similarity |") - lines.append("|--------|--------|------------|") + lines.append("| Item A | Item B | Similarity | Suggested Action |") + lines.append("|--------|--------|------------|------------------|") for i, j, sim in duplicate_pairs: a = items[i] b = items[j] kind_a = "PR" if a["is_pr"] else "Issue" kind_b = "PR" if b["is_pr"] else "Issue" + action = _suggested_action(a, b) lines.append( f"| [#{a['number']}]({a['html_url']}) {kind_a}: {a['title']} " f"| [#{b['number']}]({b['html_url']}) {kind_b}: {b['title']} " - f"| {sim:.3f} |" + f"| {sim:.3f} | {action} |" ) else: lines.append("None found.") # ── Label suggestions section ──────────────────────────────────── - outlier_set = {idx for idx, _ in outlier_results} if label_suggestions is not None: - # Only unlabeled, non-outlier items — spam shouldn't get categorized - # Show only the top-1 label to match what actually gets applied - items_with_suggestions = [ - (i, sugs[:1]) for i, sugs in enumerate(label_suggestions) - if sugs and not items[i]["labels"] and i not in outlier_set - ] + # High confidence: top-1 label with raw_sim >= 0.5 + # Low confidence: top-1 label with raw_sim < 0.5 + high_conf_labels: list[tuple[int, list[tuple[str, float]]]] = [] + low_conf_labels: list[tuple[int, list[tuple[str, float]]]] = [] + for i, sugs in enumerate(label_suggestions): + if sugs and not items[i]["labels"] and i not in outlier_set: + top1 = sugs[:1] + if top1[0][1] >= 0.5: + high_conf_labels.append((i, top1)) + else: + low_conf_labels.append((i, top1)) + + total_suggestions = len(high_conf_labels) + len(low_conf_labels) lines.extend([ "", - f"### Suggested Labels ({len(items_with_suggestions)} 
unlabeled items)", + f"### Suggested Labels ({total_suggestions} unlabeled items)", "", - "Labels suggested by embedding similarity against repo label descriptions.", + "Labels suggested by z-score normalized embedding similarity against repo label descriptions.", "Only shown for unlabeled items that were not flagged as outliers.", "", ]) - if items_with_suggestions: + # Label concentration warning + if total_suggestions > 0: + label_counts: dict[str, int] = {} + for _, sugs in high_conf_labels + low_conf_labels: + for name, _ in sugs: + label_counts[name] = label_counts.get(name, 0) + 1 + for name, count in label_counts.items(): + if count > total_suggestions * 0.5: + lines.append( + f"> **Warning:** Label `{name}` accounts for " + f"{count}/{total_suggestions} suggestions " + f"({count * 100 // total_suggestions}%). " + f"Consider reviewing label descriptions for specificity." + ) + lines.append("") + + if high_conf_labels: + lines.append("| # | Type | Title | Suggested Label |") + lines.append("|---|------|-------|--------------------|") + for idx, sugs in high_conf_labels: + item = items[idx] + kind = "PR" if item["is_pr"] else "Issue" + label_strs = [f"`{name}` ({score:.2f})" for name, score in sugs] + lines.append( + f"| [#{item['number']}]({item['html_url']}) " + f"| {kind} | {item['title']} | {', '.join(label_strs)} |" + ) + + if low_conf_labels: + lines.append("") + lines.append("
") + lines.append(f"Low-confidence suggestions ({len(low_conf_labels)} items)") + lines.append("") lines.append("| # | Type | Title | Suggested Label |") lines.append("|---|------|-------|--------------------|") - for idx, sugs in items_with_suggestions: + for idx, sugs in low_conf_labels: item = items[idx] kind = "PR" if item["is_pr"] else "Issue" label_strs = [f"`{name}` ({score:.2f})" for name, score in sugs] @@ -301,7 +437,10 @@ def generate_report( f"| [#{item['number']}]({item['html_url']}) " f"| {kind} | {item['title']} | {', '.join(label_strs)} |" ) - else: + lines.append("") + lines.append("
") + + if not high_conf_labels and not low_conf_labels: lines.append("No unlabeled items need suggestions.") lines.extend([ @@ -314,11 +453,7 @@ def generate_report( ]) if label_suggestions is not None: - applied = sum( - 1 for i, s in enumerate(label_suggestions) - if s and not items[i]["labels"] and i not in outlier_set - ) - lines.append(f"- {applied} items suggested for labeling") + lines.append(f"- {suggested_count} items suggested for labeling") lines.extend([ "", @@ -404,7 +539,10 @@ def main() -> None: if len(items) >= MIN_SAMPLES_FOR_OUTLIER_DETECTION: reduced = reduce_dimensions(embeddings, PCA_MAX_COMPONENTS) outlier_results = detect_outliers( - reduced, percentile=OUTLIER_PERCENTILE, contamination=CONTAMINATION, + reduced, + contamination=CONTAMINATION, + iqr_multiplier=IQR_MULTIPLIER, + max_outlier_pct=MAX_OUTLIER_PCT, ) else: print( @@ -426,17 +564,20 @@ def main() -> None: label_suggestions = suggest_labels(embeddings, label_embeddings, label_names) print(f"Computed label suggestions against {len(repo_labels)} repo labels") - # Apply top label to unlabeled items (unless dry run) - # Skip outliers — flagged items shouldn't get categorized - outlier_set = {idx for idx, _ in outlier_results} - if not DRY_RUN: - applied_count = 0 - for i, sugs in enumerate(label_suggestions): - if sugs and not items[i]["labels"] and i not in outlier_set: - # Apply only the top-1 label (highest confidence) - apply_labels_to_item(items[i]["number"], [sugs[0][0]]) - applied_count += 1 - print(f"Applied labels to {applied_count} unlabeled items") + # NOTE: Auto-labeling is disabled. The report shows suggestions for + # human review. To re-enable, uncomment the block below. 
+ # + # # Apply top label to unlabeled items (unless dry run) + # # Skip outliers — flagged items shouldn't get categorized + # outlier_set = {idx for idx, _ in outlier_results} + # if not DRY_RUN: + # applied_count = 0 + # for i, sugs in enumerate(label_suggestions): + # if sugs and not items[i]["labels"] and i not in outlier_set: + # # Apply only the top-1 label (highest confidence) + # apply_labels_to_item(items[i]["number"], [sugs[0][0]]) + # applied_count += 1 + # print(f"Applied labels to {applied_count} unlabeled items") else: print("No repo labels found — skipping label suggestions") diff --git a/.github/scripts/triage/test_embedding_utils.py b/.github/scripts/triage/test_embedding_utils.py index 6fbfe83afa..f4a723c6f9 100644 --- a/.github/scripts/triage/test_embedding_utils.py +++ b/.github/scripts/triage/test_embedding_utils.py @@ -20,7 +20,10 @@ EMBEDDING_DIM, EMBEDDING_MODEL, EMBEDDING_BATCH_SIZE, - LABEL_SIMILARITY_THRESHOLD, + LABEL_Z_THRESHOLD, + LABEL_Z_MARGIN, + LABEL_Z_STD_FLOOR, + MIN_RAW_SIMILARITY, MAX_LABELS_PER_ITEM, ) @@ -139,17 +142,17 @@ def test_max_components_respected(self): class TestDetectOutliers: - """Tests for Mahalanobis-based outlier detection.""" + """Tests for IQR-based outlier detection.""" def test_single_sample_returns_empty(self): m = np.random.randn(1, 5).astype(np.float32) - result = detect_outliers(m, percentile=0.997) + result = detect_outliers(m) assert result == [] def test_empty_returns_empty(self): # n < 2 case m = np.empty((0, 5), dtype=np.float32) - result = detect_outliers(m, percentile=0.997) + result = detect_outliers(m) assert result == [] def test_finds_outliers_in_synthetic_data(self): @@ -158,7 +161,7 @@ def test_finds_outliers_in_synthetic_data(self): cluster = rng.standard_normal((50, 3)).astype(np.float32) * 0.1 outlier = np.array([[100.0, 100.0, 100.0]], dtype=np.float32) m = np.vstack([cluster, outlier]) - result = detect_outliers(m, percentile=0.997) + result = detect_outliers(m) # The outlier (index 
50) should be detected outlier_indices = [idx for idx, _ in result] assert 50 in outlier_indices @@ -169,7 +172,7 @@ def test_returns_list_of_index_distance_tuples(self): cluster = rng.standard_normal((20, 3)).astype(np.float32) * 0.1 far_point = np.array([[50.0, 50.0, 50.0]], dtype=np.float32) m = np.vstack([cluster, far_point]) - result = detect_outliers(m, percentile=0.997) + result = detect_outliers(m) assert isinstance(result, list) for item in result: assert isinstance(item, tuple) @@ -179,20 +182,21 @@ def test_returns_list_of_index_distance_tuples(self): assert isinstance(dist, float) assert dist > 0 - def test_low_percentile_flags_more(self): + def test_iqr_cutoff_behavior(self): + """Lower IQR multiplier should flag more items than higher multiplier.""" rng = np.random.default_rng(42) - m = rng.standard_normal((30, 3)).astype(np.float32) - low = detect_outliers(m, percentile=0.5) - high = detect_outliers(m, percentile=0.999) + m = rng.standard_normal((100, 3)).astype(np.float32) + low = detect_outliers(m, iqr_multiplier=1.0, max_outlier_pct=0.5) + high = detect_outliers(m, iqr_multiplier=5.0, max_outlier_pct=0.5) assert len(low) >= len(high) - def test_dimension_aware_cutoff(self): - """High-dimensional data should not flag everything with default percentile.""" + def test_dimension_aware_no_mass_flagging(self): + """High-dimensional clean Gaussian data should not flag everything.""" rng = np.random.default_rng(42) # 500 samples, 10 dims — well-conditioned for robust covariance m = rng.standard_normal((500, 10)).astype(np.float32) - result = detect_outliers(m, percentile=0.997) - # With a proper dimension-aware cutoff on clean Gaussian data, + result = detect_outliers(m) + # With IQR-based cutoff on clean Gaussian data, # only a small fraction should be flagged (well under 50%) assert len(result) < 250 @@ -200,9 +204,48 @@ def test_contamination_parameter(self): rng = np.random.default_rng(42) m = rng.standard_normal((50, 3)).astype(np.float32) # Should 
not raise with different contamination values - result = detect_outliers(m, percentile=0.997, contamination=0.05) + result = detect_outliers(m, contamination=0.05) assert isinstance(result, list) + def test_max_outlier_pct_hard_cap(self): + """The hard cap should limit outlier count to max_outlier_pct * n.""" + rng = np.random.default_rng(42) + # Create data with many potential outliers (bimodal) + cluster = rng.standard_normal((80, 3)).astype(np.float32) * 0.1 + outliers = rng.standard_normal((20, 3)).astype(np.float32) * 50.0 + m = np.vstack([cluster, outliers]) + # Very low IQR multiplier to flag a lot, but cap at 5% + result = detect_outliers(m, iqr_multiplier=0.5, max_outlier_pct=0.05) + max_allowed = max(1, int(0.05 * 100)) # 5 + assert len(result) <= max_allowed + + def test_hard_cap_keeps_most_extreme(self): + """When capped, the most extreme items (highest distance) should be kept.""" + rng = np.random.default_rng(42) + cluster = rng.standard_normal((90, 3)).astype(np.float32) * 0.1 + # Create outliers with increasing extremity + outliers = np.array([ + [10.0, 10.0, 10.0], + [20.0, 20.0, 20.0], + [50.0, 50.0, 50.0], + ], dtype=np.float32) + m = np.vstack([cluster, outliers]) + # Cap at ~1 item (0.01 * 93 = 0, but min is 1) + result = detect_outliers(m, iqr_multiplier=0.5, max_outlier_pct=0.02) + if len(result) > 0: + # The most extreme (index 92, distance for [50,50,50]) should be kept + indices = [idx for idx, _ in result] + assert 92 in indices + + def test_cutoff_attribute(self): + """Returned result should carry a cutoff attribute.""" + rng = np.random.default_rng(42) + m = rng.standard_normal((50, 3)).astype(np.float32) + result = detect_outliers(m) + assert hasattr(result, "cutoff") + assert isinstance(result.cutoff, float) + assert result.cutoff > 0 + class TestFindDuplicatePairs: """Tests for cosine similarity duplicate detection.""" @@ -265,7 +308,7 @@ def test_high_threshold_fewer_pairs(self): class TestSuggestLabels: - """Tests for 
embedding-based label suggestion.""" + """Tests for z-score normalized label suggestion.""" def test_empty_items_returns_empty_lists(self): items = np.empty((0, 10), dtype=np.float32) @@ -281,57 +324,145 @@ def test_empty_labels_returns_empty_per_item(self): assert all(s == [] for s in result) def test_identical_embedding_gets_that_label(self): - """If an item embedding equals a label embedding, it should suggest that label.""" - vec = np.array([1.0, 0.0, 0.0], dtype=np.float32) - items = np.array([vec], dtype=np.float32) - labels = np.array([vec, [0, 1, 0], [0, 0, 1]], dtype=np.float32) - result = suggest_labels(items, labels, ["bug", "feature", "docs"], threshold=0.5) - assert len(result) == 1 - assert result[0][0][0] == "bug" - assert result[0][0][1] > 0.99 - - def test_threshold_filters_low_similarity(self): - """With a high threshold, orthogonal vectors should get no suggestions.""" - items = np.eye(3, dtype=np.float32) - labels = np.eye(3, dtype=np.float32) - # threshold=0.99 means only near-exact matches - result = suggest_labels(items, labels, ["a", "b", "c"], threshold=0.99) - # Each item should match exactly one label (itself) + """If an item embedding strongly matches one label, z-score should highlight it.""" + # Create multiple items so z-score normalization is meaningful + rng = np.random.default_rng(42) + # 10 random items + 1 item that matches label "bug" exactly + random_items = rng.standard_normal((10, 3)).astype(np.float32) + bug_vec = np.array([[1.0, 0.0, 0.0]], dtype=np.float32) + items = np.vstack([random_items, bug_vec]) + labels = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], dtype=np.float32) + result = suggest_labels( + items, labels, ["bug", "feature", "docs"], + z_threshold=0.5, z_margin=0.0, min_raw_sim=0.1, + ) + # The last item (matching bug_vec) should get "bug" as top suggestion + last_item_sugs = result[-1] + if last_item_sugs: + assert last_item_sugs[0][0] == "bug" + + def 
test_z_score_suppresses_dominant_label(self): + """When all items are similar to one label, z-scores should be low + (none stands out) and that label should not be blindly suggested.""" + # All items identical — z-score for every item on every label is 0 + items = np.ones((10, 3), dtype=np.float32) + labels = np.array([[1.0, 1.0, 1.0], [0.0, 1.0, 0.0]], dtype=np.float32) + result = suggest_labels( + items, labels, ["catch-all", "specific"], + z_threshold=1.5, min_raw_sim=0.3, + ) + # With identical items, std=0 -> z-scores are all 0 -> nothing passes z_threshold + for sugs in result: + assert sugs == [] + + def test_margin_gate_blocks_top1(self): + """Top-1 label must beat #2 by z_margin to be accepted as position 0.""" + rng = np.random.default_rng(99) + # 20 items, each slightly different, 2 labels + items = rng.standard_normal((20, 5)).astype(np.float32) + # Two labels that are nearly identical -> margin gate should block top-1 + labels = np.array([[1.0, 0.5, 0.0, 0.0, 0.0], + [1.0, 0.5, 0.01, 0.0, 0.0]], dtype=np.float32) + result = suggest_labels( + items, labels, ["label-a", "label-b"], + z_threshold=0.0, z_margin=10.0, min_raw_sim=0.0, max_per_item=1, + ) + # With a huge margin requirement and max_per_item=1, nothing should pass + # because the only candidate (top-1) is blocked by margin gate, + # and max_per_item=1 prevents falling through to position 2 for sugs in result: - assert len(sugs) == 1 + assert sugs == [] + + def test_margin_gate_passes_when_clear_winner(self): + """When top-1 clearly beats #2, it should pass the margin gate.""" + # Create items where one strongly matches label 0 vs label 1 + items = np.array([ + [1.0, 0.0, 0.0, 0.0, 0.0], # strongly matches label-a + [0.0, 0.0, 0.0, 0.0, 1.0], # matches neither well + ] * 5, dtype=np.float32) # 10 items for stable z-scores + labels = np.array([ + [1.0, 0.0, 0.0, 0.0, 0.0], # label-a + [0.0, 1.0, 0.0, 0.0, 0.0], # label-b (orthogonal) + ], dtype=np.float32) + result = suggest_labels( + items, 
labels, ["label-a", "label-b"], + z_threshold=0.5, z_margin=0.3, min_raw_sim=0.1, + ) + # Items matching label-a should get it suggested (clear z-score advantage) + got_label_a = sum(1 for sugs in result if sugs and sugs[0][0] == "label-a") + assert got_label_a > 0 + + def test_min_raw_similarity_filter(self): + """Even with high z-score, low raw similarity should be filtered out.""" + # Items are orthogonal to all labels -> raw similarity near 0 + items = np.array([[1.0, 0.0, 0.0]], dtype=np.float32) + labels = np.array([[0.0, 0.0, 1.0]], dtype=np.float32) + result = suggest_labels( + items, labels, ["irrelevant"], + z_threshold=0.0, z_margin=0.0, min_raw_sim=0.9, + ) + # Raw similarity is ~0, which is below min_raw_sim=0.9 + assert result[0] == [] def test_max_per_item_respected(self): - """Even if all labels are similar, max_per_item caps the results.""" + """Even if many labels qualify, max_per_item caps the results.""" rng = np.random.default_rng(42) - base = rng.standard_normal(10).astype(np.float32) - items = np.array([base]) - # All labels very similar to item + # Create items with some variance so z-scores differentiate + items = rng.standard_normal((20, 10)).astype(np.float32) + base = items[0] + # All labels very similar to item 0 labels = np.array([base + rng.standard_normal(10) * 0.01 for _ in range(10)]) names = [f"label-{i}" for i in range(10)] - result = suggest_labels(items, labels, names, threshold=0.1, max_per_item=2) - assert len(result[0]) <= 2 + result = suggest_labels( + items, labels, names, + z_threshold=0.0, z_margin=0.0, min_raw_sim=0.0, max_per_item=2, + ) + for sugs in result: + assert len(sugs) <= 2 - def test_returns_sorted_by_similarity_descending(self): - """Suggestions should be ordered highest similarity first.""" - items = np.array([[1.0, 0.5, 0.0]], dtype=np.float32) - labels = np.array([ - [1.0, 0.0, 0.0], # decent match - [1.0, 0.5, 0.0], # exact match - [0.0, 0.0, 1.0], # poor match - ], dtype=np.float32) - result = 
suggest_labels(items, labels, ["a", "b", "c"], threshold=0.1) - scores = [s for _, s in result[0]] - assert scores == sorted(scores, reverse=True) + def test_returns_raw_similarity_not_z_score(self): + """Returned scores should be raw cosine similarity, not z-scores.""" + rng = np.random.default_rng(42) + items = rng.standard_normal((15, 5)).astype(np.float32) + labels = rng.standard_normal((3, 5)).astype(np.float32) + names = ["bug", "feature", "docs"] + result = suggest_labels( + items, labels, names, + z_threshold=0.0, z_margin=0.0, min_raw_sim=-1.0, + ) + # Raw cosine similarity should be in [-1, 1] range + for sugs in result: + for name, score in sugs: + assert -1.0 <= score <= 1.0 + 1e-5 + assert isinstance(name, str) + assert isinstance(score, float) def test_returns_correct_format(self): rng = np.random.default_rng(42) items = rng.standard_normal((3, 10)).astype(np.float32) labels = rng.standard_normal((5, 10)).astype(np.float32) names = ["bug", "feature", "docs", "ci", "test"] - result = suggest_labels(items, labels, names, threshold=0.0) + result = suggest_labels( + items, labels, names, + z_threshold=0.0, z_margin=0.0, min_raw_sim=-1.0, + ) assert len(result) == 3 for sugs in result: for name, score in sugs: assert isinstance(name, str) assert isinstance(score, float) assert name in names + + def test_text_truncation_in_labels(self): + """Label names should be returned as-is even when very long.""" + rng = np.random.default_rng(42) + items = rng.standard_normal((10, 5)).astype(np.float32) + long_name = "a" * 200 + labels = rng.standard_normal((1, 5)).astype(np.float32) + result = suggest_labels( + items, labels, [long_name], + z_threshold=0.0, z_margin=0.0, min_raw_sim=-1.0, + ) + for sugs in result: + if sugs: + assert sugs[0][0] == long_name diff --git a/.github/scripts/triage/test_sweep.py b/.github/scripts/triage/test_sweep.py index feded34fa0..39c8bef3af 100644 --- a/.github/scripts/triage/test_sweep.py +++ b/.github/scripts/triage/test_sweep.py @@ 
-35,18 +35,23 @@ MIN_SAMPLES_FOR_OUTLIER_DETECTION, PCA_MAX_COMPONENTS, MAX_EMBED_CHARS, + IQR_MULTIPLIER, + MAX_OUTLIER_PCT, + _item_age, + _suggested_action, ) def _make_api_issue(number: int, title: str = "Test issue", is_pr: bool = False, - body: str = "Issue body", labels: list[str] | None = None) -> dict: + body: str = "Issue body", labels: list[str] | None = None, + created_at: str = "2026-03-21T00:00:00Z") -> dict: """Helper to build a mock GitHub API issue response object.""" result: dict = { "number": number, "title": title, "html_url": f"https://github.com/owner/repo/issues/{number}", "body": body, - "created_at": "2026-03-21T00:00:00Z", + "created_at": created_at, "labels": [{"name": lbl} for lbl in (labels or [])], } if is_pr: @@ -94,6 +99,15 @@ def test_min_samples_is_at_least_3x_pca_max(self): def test_min_samples_is_100(self): assert MIN_SAMPLES_FOR_OUTLIER_DETECTION == 100 + def test_pca_max_components_is_20(self): + assert PCA_MAX_COMPONENTS == 20 + + def test_iqr_multiplier_default(self): + assert IQR_MULTIPLIER == 3.0 + + def test_max_outlier_pct_default(self): + assert MAX_OUTLIER_PCT == 0.05 + class TestFetchAllOpenItems: """Tests for fetch_all_open_items.""" @@ -179,6 +193,71 @@ def test_pagination(self, mock_get): assert mock_get.call_count == 2 +class TestItemAge: + """Tests for _item_age helper.""" + + def test_recent_item(self): + from datetime import datetime, timezone, timedelta + recent = (datetime.now(timezone.utc) - timedelta(hours=12)).isoformat() + assert _item_age(recent) == "<1d" + + def test_days_old(self): + from datetime import datetime, timezone, timedelta + old = (datetime.now(timezone.utc) - timedelta(days=15)).isoformat() + assert _item_age(old) == "15d" + + def test_months_old(self): + from datetime import datetime, timezone, timedelta + old = (datetime.now(timezone.utc) - timedelta(days=90)).isoformat() + assert _item_age(old) == "3mo" + + def test_years_old(self): + from datetime import datetime, timezone, timedelta + 
old = (datetime.now(timezone.utc) - timedelta(days=400)).isoformat() + assert _item_age(old) == "1y" + + def test_invalid_date(self): + assert _item_age("not-a-date") == "?" + + +class TestSuggestedAction: + """Tests for _suggested_action helper.""" + + def test_both_issues_close_newer(self): + a = TriageItem( + number=1, title="A", html_url="u", is_pr=False, labels=[], + created_at="2026-01-01T00:00:00Z", text="t", + ) + b = TriageItem( + number=2, title="B", html_url="u", is_pr=False, labels=[], + created_at="2026-02-01T00:00:00Z", text="t", + ) + result = _suggested_action(a, b) + assert "Close #2 as duplicate" in result + + def test_both_prs_review(self): + a = TriageItem( + number=1, title="A", html_url="u", is_pr=True, labels=[], + created_at="2026-01-01T00:00:00Z", text="t", + ) + b = TriageItem( + number=2, title="B", html_url="u", is_pr=True, labels=[], + created_at="2026-01-01T00:00:00Z", text="t", + ) + assert _suggested_action(a, b) == "Review for overlap" + + def test_issue_pr_link(self): + a = TriageItem( + number=1, title="A", html_url="u", is_pr=False, labels=[], + created_at="2026-01-01T00:00:00Z", text="t", + ) + b = TriageItem( + number=2, title="B", html_url="u", is_pr=True, labels=[], + created_at="2026-01-01T00:00:00Z", text="t", + ) + assert _suggested_action(a, b) == "Link PR to issue" + + class TestGenerateReport: """Tests for the markdown report generator.""" @@ -186,7 +265,7 @@ def test_no_findings(self): items = [ TriageItem( number=1, title="Test", html_url="https://example.com/1", - is_pr=False, labels=[], created_at="2026-01-01", text="Test", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="Test", ), ] report = generate_report(items, [], []) @@ -196,15 +275,39 @@ def test_no_findings(self): assert "0 outliers flagged" in report assert "0 duplicate pairs found" in report - def test_with_outliers_shows_distance(self): + def test_health_summary_table(self): + items = [ + TriageItem( + number=1, title="Test", 
html_url="https://example.com/1", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="Test", + ), + ] + report = generate_report(items, [], []) + assert "### Health Summary" in report + assert "| Metric | Value |" in report + assert "| Items analyzed | 1 |" in report + + def test_iqr_multiplier_in_thresholds(self): + """Report should show IQR multiplier, not percentile.""" + items = [ + TriageItem( + number=1, title="Test", html_url="https://example.com/1", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="Test", + ), + ] + report = generate_report(items, [], []) + assert "IQR multiplier" in report + assert "percentile" not in report.lower().split("thresholds")[0] # not in thresholds line + + def test_with_outliers_shows_distance_and_age(self): items = [ TriageItem( number=10, title="Spam Issue", html_url="https://example.com/10", - is_pr=False, labels=[], created_at="2026-01-01", text="spam", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam", ), TriageItem( number=20, title="Good Issue", html_url="https://example.com/20", - is_pr=False, labels=[], created_at="2026-01-01", text="good", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="good", ), ] report = generate_report(items, [(0, 12.34)], []) @@ -212,16 +315,48 @@ def test_with_outliers_shows_distance(self): assert "Spam Issue" in report assert "12.34" in report assert "1 outliers flagged" in report + # Age column should be present + assert "| Age |" in report + + def test_outlier_borderline_in_details(self): + """Borderline outliers should be in a
<details> section.""" + from embedding_utils import _OutlierResult + items = [ + TriageItem( + number=10, title="Borderline", html_url="https://example.com/10", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam", + ), + ] + # Create outlier results with cutoff=10.0, distance=12.0 (< 2*cutoff=20) + outlier_results = _OutlierResult([(0, 12.0)]) + outlier_results.cutoff = 10.0 + report = generate_report(items, outlier_results, []) + assert "<details>
" in report + assert "Borderline" in report + + def test_outlier_high_confidence(self): + """Items with distance > 2x cutoff should be in high confidence section.""" + from embedding_utils import _OutlierResult + items = [ + TriageItem( + number=10, title="Definite Spam", html_url="https://example.com/10", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam", + ), + ] + outlier_results = _OutlierResult([(0, 25.0)]) + outlier_results.cutoff = 10.0 + report = generate_report(items, outlier_results, []) + assert "High Confidence" in report - def test_with_duplicates(self): + def test_with_duplicates_suggested_action(self): items = [ TriageItem( number=1, title="First", html_url="https://example.com/1", - is_pr=False, labels=[], created_at="2026-01-01", text="a", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="a", ), TriageItem( number=2, title="Second", html_url="https://example.com/2", - is_pr=True, labels=[], created_at="2026-01-01", text="b", + is_pr=True, labels=[], created_at="2026-02-01T00:00:00Z", text="b", ), ] report = generate_report(items, [], [(0, 1, 0.954)]) @@ -229,12 +364,42 @@ def test_with_duplicates(self): assert "#2" in report assert "0.954" in report assert "1 duplicate pairs found" in report + assert "Suggested Action" in report + assert "Link PR to issue" in report + + def test_duplicate_both_issues_close_newer(self): + items = [ + TriageItem( + number=1, title="First", html_url="https://example.com/1", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="a", + ), + TriageItem( + number=2, title="Second", html_url="https://example.com/2", + is_pr=False, labels=[], created_at="2026-02-01T00:00:00Z", text="b", + ), + ] + report = generate_report(items, [], [(0, 1, 0.95)]) + assert "Close #2 as duplicate" in report + + def test_duplicate_both_prs_review(self): + items = [ + TriageItem( + number=1, title="PR A", html_url="https://example.com/1", + is_pr=True, labels=[], 
created_at="2026-01-01T00:00:00Z", text="a", + ), + TriageItem( + number=2, title="PR B", html_url="https://example.com/2", + is_pr=True, labels=[], created_at="2026-01-01T00:00:00Z", text="b", + ), + ] + report = generate_report(items, [], [(0, 1, 0.95)]) + assert "Review for overlap" in report def test_pr_type_label(self): items = [ TriageItem( number=5, title="PR Title", html_url="https://example.com/5", - is_pr=True, labels=[], created_at="2026-01-01", text="pr", + is_pr=True, labels=[], created_at="2026-01-01T00:00:00Z", text="pr", ), ] report = generate_report(items, [(0, 8.5)], []) @@ -244,7 +409,7 @@ def test_footer_present(self): items = [ TriageItem( number=1, title="T", html_url="u", - is_pr=False, labels=[], created_at="d", text="t", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="t", ), ] report = generate_report(items, [], []) @@ -386,26 +551,39 @@ def test_http_error_is_non_fatal(self, mock_urlopen): class TestGenerateReportWithLabels: """Tests for label suggestions in the report.""" - def test_report_includes_label_section_with_top1_only(self): + def test_report_includes_label_section_high_confidence(self): + """High-confidence label (raw_sim >= 0.5) should appear in main table.""" items = [ TriageItem( number=1, title="Fix crash", html_url="https://example.com/1", - is_pr=False, labels=[], created_at="2026-01-01", text="crash", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="crash", ), ] - suggestions = [[("bug", 0.85), ("enhancement", 0.42)]] + suggestions = [[("bug", 0.85)]] report = generate_report(items, [], [], label_suggestions=suggestions) assert "Suggested Labels" in report - # Only the top-1 label should appear in the report assert "`bug` (0.85)" in report - assert "`enhancement`" not in report assert "1 items suggested for labeling" in report + def test_report_low_confidence_in_details(self): + """Low-confidence label (raw_sim < 0.5) should be in
<details> section.""" + items = [ + TriageItem( + number=1, title="Something", html_url="https://example.com/1", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="something", + ), + ] + suggestions = [[("maybe-bug", 0.35)]] + report = generate_report(items, [], [], label_suggestions=suggestions) + assert "Low-confidence suggestions" in report + assert "<details>
" in report + assert "`maybe-bug` (0.35)" in report + def test_report_skips_already_labeled_items(self): items = [ TriageItem( number=1, title="Already labeled", html_url="https://example.com/1", - is_pr=False, labels=["bug"], created_at="2026-01-01", text="bug", + is_pr=False, labels=["bug"], created_at="2026-01-01T00:00:00Z", text="bug", ), ] suggestions = [[("bug", 0.95)]] @@ -417,11 +595,11 @@ def test_report_excludes_outliers_from_suggestions(self): items = [ TriageItem( number=1, title="Spam garbage", html_url="https://example.com/1", - is_pr=False, labels=[], created_at="2026-01-01", text="spam", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="spam", ), TriageItem( number=2, title="Real bug", html_url="https://example.com/2", - is_pr=False, labels=[], created_at="2026-01-01", text="bug", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="bug", ), ] suggestions = [[("bug", 0.85)], [("bug", 0.90)]] @@ -436,12 +614,33 @@ def test_report_without_label_suggestions(self): items = [ TriageItem( number=1, title="T", html_url="u", - is_pr=False, labels=[], created_at="d", text="t", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="t", ), ] report = generate_report(items, [], [], label_suggestions=None) assert "Suggested Labels" not in report + def test_label_concentration_warning(self): + """When >50% of suggestions point to the same label, a warning should appear.""" + items = [ + TriageItem( + number=i, title=f"Item {i}", html_url=f"https://example.com/{i}", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}", + ) + for i in range(4) + ] + # 3 out of 4 items get "bug" label -> 75% concentration + suggestions = [ + [("bug", 0.85)], + [("bug", 0.80)], + [("bug", 0.75)], + [("enhancement", 0.90)], + ] + report = generate_report(items, [], [], label_suggestions=suggestions) + assert "Warning" in report + assert "`bug`" in report + assert "3/4" in report + class TestMain: """Tests for the main 
orchestration function.""" @@ -485,7 +684,7 @@ def test_full_flow_with_enough_items( items = [ TriageItem( number=i, title=f"Item {i}", html_url=f"https://example.com/{i}", - is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}", ) for i in range(n) ] @@ -527,12 +726,12 @@ def test_skips_outlier_detection_for_few_items( self, mock_fetch, mock_labels, mock_embed, mock_norm, mock_reduce, mock_outliers, mock_dupes, mock_suggest, mock_write, mock_create, ): - """With < MIN_SAMPLES (150) items, outlier detection should be skipped.""" - n = MIN_SAMPLES_FOR_OUTLIER_DETECTION - 1 # 149 + """With < MIN_SAMPLES items, outlier detection should be skipped.""" + n = MIN_SAMPLES_FOR_OUTLIER_DETECTION - 1 items = [ TriageItem( number=i, title=f"Item {i}", html_url=f"https://example.com/{i}", - is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}", ) for i in range(n) ] @@ -568,7 +767,7 @@ def test_dry_run_skips_issue_creation_and_labeling( items = [ TriageItem( number=1, title="Item", html_url="https://example.com/1", - is_pr=False, labels=[], created_at="2026-01-01", text="text", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="text", ) ] mock_fetch.return_value = items @@ -594,19 +793,19 @@ def test_dry_run_skips_issue_creation_and_labeling( @patch("sweep.embed_texts") @patch("sweep.fetch_repo_labels") @patch("sweep.fetch_all_open_items") - def test_applies_labels_to_unlabeled_items( + def test_labels_not_auto_applied( self, mock_fetch, mock_labels, mock_embed, mock_norm, mock_dupes, mock_suggest, mock_apply, mock_write, mock_create, ): - """When not dry run, top-1 label should be applied to unlabeled items.""" + """Auto-labeling is disabled; labels should appear in report only.""" items = [ TriageItem( number=1, title="Crash bug", html_url="https://example.com/1", - is_pr=False, 
labels=[], created_at="2026-01-01", text="crash", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text="crash", ), TriageItem( number=2, title="Already labeled", html_url="https://example.com/2", - is_pr=False, labels=["enhancement"], created_at="2026-01-01", text="feat", + is_pr=False, labels=["enhancement"], created_at="2026-01-01T00:00:00Z", text="feat", ), ] mock_fetch.return_value = items @@ -614,8 +813,8 @@ def test_applies_labels_to_unlabeled_items( RepoLabel(name="bug", description="Broken", text="bug: Broken"), ] mock_suggest.return_value = [ - [("bug", 0.90)], # item 1: unlabeled, should get labeled - [("bug", 0.45)], # item 2: already labeled, skip + [("bug", 0.90)], + [("bug", 0.45)], ] embeddings = np.random.randn(2, 384).astype(np.float32) @@ -624,8 +823,8 @@ def test_applies_labels_to_unlabeled_items( main() - # Only item 1 (unlabeled) should get a label applied - mock_apply.assert_called_once_with(1, ["bug"]) + # Auto-labeling is disabled — apply_labels_to_item should never be called + mock_apply.assert_not_called() @patch("sweep.create_report_issue") @patch("sweep.write_report") @@ -638,16 +837,16 @@ def test_applies_labels_to_unlabeled_items( @patch("sweep.embed_texts") @patch("sweep.fetch_repo_labels") @patch("sweep.fetch_all_open_items") - def test_outliers_do_not_get_labeled( + def test_outliers_excluded_from_report_suggestions( self, mock_fetch, mock_labels, mock_embed, mock_norm, mock_reduce, mock_outliers, mock_dupes, mock_suggest, mock_apply, mock_write, mock_create, ): - """Items flagged as outliers should not receive label suggestions.""" + """Items flagged as outliers should not appear in report label suggestions.""" n = MIN_SAMPLES_FOR_OUTLIER_DETECTION items = [ TriageItem( number=i, title=f"Item {i}", html_url=f"https://example.com/{i}", - is_pr=False, labels=[], created_at="2026-01-01", text=f"text {i}", + is_pr=False, labels=[], created_at="2026-01-01T00:00:00Z", text=f"text {i}", ) for i in range(n) ] @@ -655,9 +854,7 
@@ def test_outliers_do_not_get_labeled( mock_labels.return_value = [ RepoLabel(name="bug", description="Broken", text="bug: Broken"), ] - # Outlier detection flags items 0 and 5 (now returns tuples with distances) mock_outliers.return_value = [(0, 12.5), (5, 15.3)] - # Every item gets a suggestion mock_suggest.return_value = [[("bug", 0.85)] for _ in range(n)] embeddings = np.random.randn(n, 384).astype(np.float32) @@ -667,9 +864,10 @@ def test_outliers_do_not_get_labeled( main() - # Items 0 and 5 are outliers — should NOT be labeled - labeled_numbers = [call.args[0] for call in mock_apply.call_args_list] - assert 0 not in labeled_numbers - assert 5 not in labeled_numbers - # Other items should be labeled (n - 2 outliers) - assert mock_apply.call_count == n - 2 + # Auto-labeling is disabled + mock_apply.assert_not_called() + # Report should still be generated (outliers excluded from suggestions in report) + mock_write.assert_called_once() + report = mock_write.call_args[0][0] + # Outlier items 0 and 5 should not appear in the label suggestions section + assert "Item 0" not in report.split("Suggested Labels")[1] if "Suggested Labels" in report else True diff --git a/.github/workflows/triage-sweep.yml b/.github/workflows/triage-sweep.yml index b3f4840ffd..ba5514dbf6 100644 --- a/.github/workflows/triage-sweep.yml +++ b/.github/workflows/triage-sweep.yml @@ -3,13 +3,19 @@ name: Triage Sweep on: workflow_dispatch: inputs: - outlier_percentile: + iqr_multiplier: description: >- - Chi2 percentile for dimension-aware outlier cutoff (0-1). - 0.997 is the multivariate equivalent of 3-sigma. - Lower = more aggressive flagging. + IQR multiplier for outlier cutoff. + cutoff = Q75 + multiplier * IQR. + Higher = fewer outliers flagged. type: number - default: 0.997 + default: 3.0 + max_outlier_pct: + description: >- + Maximum fraction of items that can be flagged as outliers (0-1). + Hard cap to prevent over-flagging. 
+ type: number + default: 0.05 contamination: description: >- Expected fraction of outliers in the data (0-0.5). @@ -39,6 +45,7 @@ on: permissions: contents: read issues: write + pull-requests: write concurrency: group: triage-sweep @@ -69,7 +76,7 @@ jobs: - name: Cache FastEmbed model weights uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5 with: - path: ~/.cache/fastembed_cache + path: ${{ github.workspace }}/.fastembed_cache key: fastembed-bge-small-en-v1.5 - name: Run triage sweep @@ -77,7 +84,9 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} - INPUT_OUTLIER_PERCENTILE: ${{ inputs.outlier_percentile }} + FASTEMBED_CACHE_PATH: ${{ github.workspace }}/.fastembed_cache + INPUT_IQR_MULTIPLIER: ${{ inputs.iqr_multiplier }} + INPUT_MAX_OUTLIER_PCT: ${{ inputs.max_outlier_pct }} INPUT_CONTAMINATION: ${{ inputs.contamination }} INPUT_COSINE_THRESHOLD: ${{ inputs.cosine_threshold }} INPUT_MAX_ITEMS: ${{ inputs.max_items }}