diff --git a/agents/codex-719.md b/agents/codex-719.md new file mode 100644 index 000000000..16a3ac036 --- /dev/null +++ b/agents/codex-719.md @@ -0,0 +1 @@ + diff --git a/scripts/langchain/integration_layer.py b/scripts/langchain/integration_layer.py new file mode 100644 index 000000000..9b0a659fa --- /dev/null +++ b/scripts/langchain/integration_layer.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Integration helpers for applying semantic labels to issues. +""" + +from __future__ import annotations + +import re +from collections.abc import Iterable, Mapping, Sequence +from dataclasses import dataclass, field +from typing import Any + +from scripts.langchain import label_matcher + + +@dataclass +class IssueData: + title: str + body: str | None = None + labels: list[str] = field(default_factory=list) + + def apply_labels(self, new_labels: Iterable[str]) -> None: + self.labels = merge_labels(self.labels, new_labels) + + +def merge_labels(existing: Iterable[str], incoming: Iterable[str]) -> list[str]: + seen: set[str] = set() + merged: list[str] = [] + for label in list(existing) + list(incoming): + normalized = _normalize_label(label) + if not normalized or normalized in seen: + continue + seen.add(normalized) + merged.append(label) + return merged + + +def label_issue( + issue: IssueData, + available_labels: Iterable[Any], + *, + threshold: float | None = None, + k: int | None = None, + max_labels: int | None = None, +) -> list[str]: + issue_text = _build_issue_text(issue) + label_store = _build_label_store(available_labels) + if label_store is None: + return [] + + matches = label_matcher.find_similar_labels(label_store, issue_text, threshold=threshold, k=k) + names = _select_label_names(matches, max_labels=max_labels) + issue.apply_labels(names) + return names + + +def _build_issue_text(issue: IssueData) -> str: + if not isinstance(issue.title, str) or not issue.title.strip(): + raise ValueError("issue title must be a non-empty string.") + parts = [issue.title.strip()] + if issue.body and issue.body.strip(): + parts.append(issue.body.strip()) + return "\n\n".join(parts) + + +def _build_label_store(labels: Iterable[Any]) -> label_matcher.LabelVectorStore | None: + label_records = _collect_label_records(labels) + if not label_records: + return None + + vector_store = label_matcher.build_label_vector_store(label_records) + if vector_store is not None: + return vector_store + + return label_matcher.LabelVectorStore( + store=object(), + provider="keyword", + model="keyword", + labels=label_records, + ) + + +def _collect_label_records(labels: Iterable[Any]) -> list[label_matcher.LabelRecord]: + if labels is None: + raise ValueError("labels must be an iterable of label records, not None.") + if isinstance(labels, (str, bytes)): + raise ValueError("labels must be an iterable of label records, not a string.") + if not isinstance(labels, Iterable): + raise ValueError("labels must be an iterable of label records.") + + records: list[label_matcher.LabelRecord] = [] + for index, item in enumerate(labels): + record = _coerce_label_record(item) + if record is not None: + records.append(record) + else: + if isinstance(item, Mapping): + raise ValueError(f"Label entry at index {index} is missing a name.") + if getattr(item, "name", None) is not None or getattr(item, "label", None) is not None: + raise ValueError(f"Label entry at index {index} has an empty name.") + raise ValueError(f"Unsupported label entry at index {index}: {type(item).__name__}.") + return records + + +def _coerce_label_record(item: Any) -> label_matcher.LabelRecord | None: + if isinstance(item, label_matcher.LabelRecord): + return item + if isinstance(item, (str, bytes)): + name = item.decode("utf-8", errors="replace") if isinstance(item, bytes) else item + name = name.strip() + if not name: + return None + return label_matcher.LabelRecord(name=name) + if isinstance(item, Mapping): + name = str(item.get("name") or item.get("label") or "").strip() + if not name: + return None + description = item.get("description") + return label_matcher.LabelRecord( + name=name, + description=str(description) if description is not None else None, + ) + name = str(getattr(item, "name", "") or "").strip() + if not name: + return None + description = getattr(item, "description", None) + return label_matcher.LabelRecord( + name=name, + description=str(description) if description is not None else None, + ) + + +def _select_label_names( + matches: Sequence[label_matcher.LabelMatch], + *, + max_labels: int | None = None, +) -> list[str]: + if not matches: + return [] + names: list[str] = [] + seen: set[str] = set() + for match in matches: + normalized = _normalize_label(match.label.name) + if not normalized or normalized in seen: + continue + seen.add(normalized) + names.append(match.label.name) + if max_labels is not None and len(names) >= max_labels: + break + return names + + +def _normalize_label(label: str) -> str: + return re.sub(r"[^a-z0-9]+", "", str(label or "").lower()) diff --git a/scripts/langchain/label_matcher.py b/scripts/langchain/label_matcher.py index d54d08d59..ac6e71f52 100755 --- a/scripts/langchain/label_matcher.py +++ b/scripts/langchain/label_matcher.py @@ -119,6 +119,28 @@ def _coerce_label(item: Any) -> LabelRecord | None: ) +def _ensure_label_iterable(labels: Iterable[Any]) -> Iterable[Any]: + if labels is None: + raise ValueError("labels must be an iterable of label records, not None.") + if isinstance(labels, (str, bytes)): + raise ValueError("labels must be an iterable of label records, not a string.") + if not isinstance(labels, Iterable): + raise ValueError("labels must be an iterable of label records.") + return labels + + +def _ensure_label_store(label_store: LabelVectorStore) -> LabelVectorStore: + if not isinstance(label_store, LabelVectorStore): + raise ValueError("label_store must be a LabelVectorStore instance.") + return label_store + + +def _ensure_query_text(query: Any) -> str: + if query is None or not isinstance(query, str): + raise ValueError("query must be a string.") + return query + + def _label_text(label: LabelRecord) -> str: description = (label.description or "").strip() if description: @@ -133,10 +155,16 @@ def build_label_vector_store( model: str | None = None, ) -> LabelVectorStore | None: label_records: list[LabelRecord] = [] - for item in labels: + for index, item in enumerate(_ensure_label_iterable(labels)): record = _coerce_label(item) if record is not None: label_records.append(record) + else: + if isinstance(item, Mapping): + raise ValueError(f"Label entry at index {index} is missing a name.") + if getattr(item, "name", None) is not None or getattr(item, "label", None) is not None: + raise ValueError(f"Label entry at index {index} has an empty name.") + raise ValueError(f"Unsupported label entry at index {index}: {type(item).__name__}.") if not label_records: return None @@ -277,7 +305,9 @@ def find_similar_labels( threshold: float | None = None, k: int | None = None, ) -> list[LabelMatch]: - if not query or not query.strip(): + label_store = _ensure_label_store(label_store) + query = _ensure_query_text(query) + if not query.strip(): return [] store = label_store.store @@ -335,6 +365,8 @@ def resolve_label_match( threshold: float | None = None, k: int | None = None, ) -> LabelMatch | None: + label_store = _ensure_label_store(label_store) + query = _ensure_query_text(query) exact = _exact_short_label_match(label_store, query) if exact is not None: return exact diff --git a/tests/scripts/integration_test.py b/tests/scripts/integration_test.py new file mode 100644 index 000000000..e6b5cada2 --- /dev/null +++ b/tests/scripts/integration_test.py @@ -0,0 +1,35 @@ +from scripts.langchain import integration_layer + + +def test_labeling_integration_applies_expected_labels(): + available_labels = [ + {"name": "type:bug", "description": "Bug reports"}, + {"name": "type:feature", "description": "Feature requests"}, + {"name": "documentation", "description": "Docs updates"}, + ] + + bug_issue = integration_layer.IssueData( + title="App crashes on login", + body="The app crashes after the sign-in screen.", + ) + bug_labels = integration_layer.label_issue(bug_issue, available_labels, threshold=0.8) + assert "type:bug" in bug_labels + assert "type:bug" in bug_issue.labels + + feature_issue = integration_layer.IssueData( + title="Add dark mode support", + body="It would be great to enable a dark theme.", + ) + feature_labels = integration_layer.label_issue(feature_issue, available_labels, threshold=0.8) + assert "type:feature" in feature_labels + assert "type:feature" in feature_issue.labels + + multi_issue = integration_layer.IssueData( + title="Bug in dark mode feature", + body="The new theme crashes on settings.", + ) + multi_labels = integration_layer.label_issue(multi_issue, available_labels, threshold=0.8) + assert "type:bug" in multi_labels + assert "type:feature" in multi_labels + assert "type:bug" in multi_issue.labels + assert "type:feature" in multi_issue.labels diff --git a/tests/scripts/test_label_matcher.py b/tests/scripts/test_label_matcher.py index 0292c836c..9bf1bcea9 100644 --- a/tests/scripts/test_label_matcher.py +++ b/tests/scripts/test_label_matcher.py @@ -2,6 +2,8 @@ import types from dataclasses import dataclass +import pytest + from scripts.langchain import label_matcher, semantic_matcher @@ -56,6 +58,30 @@ def test_build_label_vector_store_returns_none_without_client(monkeypatch): assert result is None +def test_build_label_vector_store_rejects_invalid_label_iterables(): + with pytest.raises(ValueError, match="labels must be an iterable of label records, not None."): + label_matcher.build_label_vector_store(None) + with pytest.raises( + ValueError, + match="labels must be an iterable of label records, not a string.", + ): + label_matcher.build_label_vector_store("bug") + with pytest.raises(ValueError, match="labels must be an iterable of label records."): + label_matcher.build_label_vector_store(123) + + +def test_build_label_vector_store_rejects_invalid_label_entries(): + with pytest.raises(ValueError, match="Label entry at index 0 is missing a name."): + label_matcher.build_label_vector_store([{"description": "missing name"}]) + with pytest.raises(ValueError, match="Label entry at index 0 has an empty name."): + label_matcher.build_label_vector_store([types.SimpleNamespace(name=" ")]) + with pytest.raises( + ValueError, + match="Unsupported label entry at index 0: int.", + ): + label_matcher.build_label_vector_store([123]) + + def test_find_similar_labels_filters_by_relevance_score(): store = types.SimpleNamespace( similarity_search_with_relevance_scores=lambda query, k=5: [ @@ -94,6 +120,33 @@ def test_find_similar_labels_converts_distance_scores(): assert matches[0].score_type == "distance" +def test_find_similar_labels_rejects_invalid_inputs(): + vector_store = label_matcher.LabelVectorStore( + store=object(), provider="unit-test", model="unit-test-model", labels=[] + ) + + with pytest.raises(ValueError, match="label_store must be a LabelVectorStore instance."): + label_matcher.find_similar_labels(object(), "bug") + with pytest.raises(ValueError, match="query must be a string."): + label_matcher.find_similar_labels(vector_store, None) + + +def test_find_similar_labels_handles_missing_metadata_name(): + store = types.SimpleNamespace( + similarity_search_with_relevance_scores=lambda query, k=5: [ + (DummyDoc("type:bug", None), 0.92), + ] + ) + vector_store = label_matcher.LabelVectorStore( + store=store, provider="unit-test", model="unit-test-model", labels=[] + ) + + matches = label_matcher.find_similar_labels(vector_store, "bug", threshold=0.8) + + assert len(matches) == 1 + assert matches[0].label.name == "type:bug" + + def test_resolve_label_match_prefers_exact_for_short_labels(): label = label_matcher.LabelRecord(name="CI", description="Pipeline failures") vector_store = label_matcher.LabelVectorStore(