Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions agents/codex-719.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!-- bootstrap for codex on issue #719 -->
154 changes: 154 additions & 0 deletions scripts/langchain/integration_layer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Integration helpers for applying semantic labels to issues.
"""

from __future__ import annotations

import re
from collections.abc import Iterable, Mapping, Sequence
from dataclasses import dataclass, field
from typing import Any

from scripts.langchain import label_matcher


@dataclass
class IssueData:
    """In-memory view of an issue: its text plus the labels attached to it."""

    # Issue title; the only field without a default.
    title: str
    # Optional description text.
    body: str | None = None
    # Labels currently attached; every instance gets its own fresh list.
    labels: list[str] = field(default_factory=list)

    def apply_labels(self, new_labels: Iterable[str]) -> None:
        """Replace ``labels`` with the merge of the current labels and ``new_labels``."""
        combined = merge_labels(self.labels, new_labels)
        self.labels = combined


def merge_labels(existing: Iterable[str], incoming: Iterable[str]) -> list[str]:
    """Combine two label collections, keeping the first spelling of each label.

    Labels are compared by their ``_normalize_label`` key. A label whose key is
    empty, or whose key was already produced by an earlier label, is skipped.
    Labels from ``existing`` come before labels from ``incoming``.
    """
    candidates = [*existing, *incoming]
    result: list[str] = []
    used_keys: set[str] = set()
    for candidate in candidates:
        key = _normalize_label(candidate)
        if key and key not in used_keys:
            used_keys.add(key)
            # Keep the original spelling, not the normalized key.
            result.append(candidate)
    return result


def label_issue(
    issue: IssueData,
    available_labels: Iterable[Any],
    *,
    threshold: float | None = None,
    k: int | None = None,
    max_labels: int | None = None,
) -> list[str]:
    """Match an issue against the available labels and attach the matches to it.

    Args:
        issue: Issue to label; its ``labels`` are updated in place.
        available_labels: Label entries to match against.
        threshold: Forwarded unchanged to ``label_matcher.find_similar_labels``.
        k: Forwarded unchanged to ``label_matcher.find_similar_labels``.
        max_labels: Optional cap on the number of label names selected.

    Returns:
        The selected label names, or an empty list when no label store could
        be built (in which case the issue is left untouched).
    """
    text = _build_issue_text(issue)
    store = _build_label_store(available_labels)
    if store is None:
        return []

    candidates = label_matcher.find_similar_labels(store, text, threshold=threshold, k=k)
    selected = _select_label_names(candidates, max_labels=max_labels)
    issue.apply_labels(selected)
    return selected


def _build_issue_text(issue: IssueData) -> str:
if not isinstance(issue.title, str) or not issue.title.strip():
raise ValueError("issue title must be a non-empty string.")
parts = [issue.title.strip()]
if issue.body and issue.body.strip():
parts.append(issue.body.strip())
return "\n\n".join(parts)


def _build_label_store(labels: Iterable[Any]) -> label_matcher.LabelVectorStore | None:
    """Build the label store that ``label_issue`` searches.

    Returns ``None`` when ``labels`` yields no records. Otherwise returns the
    store from ``label_matcher.build_label_vector_store``, or, when that
    returns ``None``, a ``LabelVectorStore`` tagged ``"keyword"`` that wraps
    the same records.
    """
    records = _collect_label_records(labels)
    if records:
        built = label_matcher.build_label_vector_store(records)
        if built is None:
            # No vector store available: wrap the records with a placeholder
            # backend object. NOTE(review): presumably label_matcher falls back
            # to keyword matching for such a store -- confirm in label_matcher.
            built = label_matcher.LabelVectorStore(
                store=object(),
                provider="keyword",
                model="keyword",
                labels=records,
            )
        return built
    return None


def _collect_label_records(labels: Iterable[Any]) -> list[label_matcher.LabelRecord]:
if labels is None:
raise ValueError("labels must be an iterable of label records, not None.")
if isinstance(labels, (str, bytes)):
raise ValueError("labels must be an iterable of label records, not a string.")
if not isinstance(labels, Iterable):
raise ValueError("labels must be an iterable of label records.")

records: list[label_matcher.LabelRecord] = []
for index, item in enumerate(labels):
record = _coerce_label_record(item)
if record is not None:
records.append(record)
else:
if isinstance(item, Mapping):
raise ValueError(f"Label entry at index {index} is missing a name.")
if getattr(item, "name", None) is not None or getattr(item, "label", None) is not None:
raise ValueError(f"Label entry at index {index} has an empty name.")
raise ValueError(f"Unsupported label entry at index {index}: {type(item).__name__}.")
return records


def _coerce_label_record(item: Any) -> label_matcher.LabelRecord | None:
    """Convert one label entry into a ``LabelRecord``.

    Accepted shapes:
        * a ``LabelRecord`` (returned unchanged);
        * ``str`` or ``bytes`` holding the label name (bytes are decoded as
          UTF-8 with undecodable bytes replaced);
        * a mapping with a ``"name"`` or ``"label"`` key and an optional
          ``"description"`` key;
        * any other object with a ``name`` or ``label`` attribute and an
          optional ``description`` attribute.

    Names are stripped of surrounding whitespace; a non-``None`` description
    is converted with ``str``.

    Returns:
        The record, or ``None`` when the entry has no non-blank name.
    """
    if isinstance(item, label_matcher.LabelRecord):
        return item

    if isinstance(item, (str, bytes)):
        text = item.decode("utf-8", errors="replace") if isinstance(item, bytes) else item
        text = text.strip()
        return label_matcher.LabelRecord(name=text) if text else None

    # Mappings and plain objects share one code path: look fields up by key
    # or by attribute. Objects therefore accept ``label`` as a fallback for
    # ``name`` just like mappings do, instead of being rejected as nameless.
    if isinstance(item, Mapping):
        lookup = item.get
    else:
        def lookup(key: str) -> Any:
            return getattr(item, key, None)

    name = str(lookup("name") or lookup("label") or "").strip()
    if not name:
        return None
    description = lookup("description")
    return label_matcher.LabelRecord(
        name=name,
        description=str(description) if description is not None else None,
    )


def _select_label_names(
matches: Sequence[label_matcher.LabelMatch],
*,
max_labels: int | None = None,
) -> list[str]:
if not matches:
return []
names: list[str] = []
seen: set[str] = set()
for match in matches:
normalized = _normalize_label(match.label.name)
if not normalized or normalized in seen:
continue
seen.add(normalized)
names.append(match.label.name)
if max_labels is not None and len(names) >= max_labels:
break
return names


def _normalize_label(label: str) -> str:
return re.sub(r"[^a-z0-9]+", "", str(label or "").lower())
36 changes: 34 additions & 2 deletions scripts/langchain/label_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,28 @@ def _coerce_label(item: Any) -> LabelRecord | None:
)


def _ensure_label_iterable(labels: Iterable[Any]) -> Iterable[Any]:
if labels is None:
raise ValueError("labels must be an iterable of label records, not None.")
if isinstance(labels, (str, bytes)):
raise ValueError("labels must be an iterable of label records, not a string.")
if not isinstance(labels, Iterable):
raise ValueError("labels must be an iterable of label records.")
return labels


def _ensure_label_store(label_store: LabelVectorStore) -> LabelVectorStore:
    """Return ``label_store`` unchanged, or raise if it is not a ``LabelVectorStore``.

    Raises:
        ValueError: If ``label_store`` is any other type.
    """
    if isinstance(label_store, LabelVectorStore):
        return label_store
    raise ValueError("label_store must be a LabelVectorStore instance.")


def _ensure_query_text(query: Any) -> str:
if query is None or not isinstance(query, str):
raise ValueError("query must be a string.")
return query


def _label_text(label: LabelRecord) -> str:
description = (label.description or "").strip()
if description:
Expand All @@ -133,10 +155,16 @@ def build_label_vector_store(
model: str | None = None,
) -> LabelVectorStore | None:
label_records: list[LabelRecord] = []
for item in labels:
for index, item in enumerate(_ensure_label_iterable(labels)):
record = _coerce_label(item)
if record is not None:
label_records.append(record)
else:
if isinstance(item, Mapping):
raise ValueError(f"Label entry at index {index} is missing a name.")
if getattr(item, "name", None) is not None or getattr(item, "label", None) is not None:
raise ValueError(f"Label entry at index {index} has an empty name.")
raise ValueError(f"Unsupported label entry at index {index}: {type(item).__name__}.")

if not label_records:
return None
Expand Down Expand Up @@ -277,7 +305,9 @@ def find_similar_labels(
threshold: float | None = None,
k: int | None = None,
) -> list[LabelMatch]:
if not query or not query.strip():
label_store = _ensure_label_store(label_store)
query = _ensure_query_text(query)
if not query.strip():
return []

store = label_store.store
Expand Down Expand Up @@ -335,6 +365,8 @@ def resolve_label_match(
threshold: float | None = None,
k: int | None = None,
) -> LabelMatch | None:
label_store = _ensure_label_store(label_store)
query = _ensure_query_text(query)
exact = _exact_short_label_match(label_store, query)
if exact is not None:
return exact
Expand Down
35 changes: 35 additions & 0 deletions tests/scripts/integration_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from scripts.langchain import integration_layer


def test_labeling_integration_applies_expected_labels():
    """Each sample issue gets its expected labels, both returned and attached."""
    available_labels = [
        {"name": "type:bug", "description": "Bug reports"},
        {"name": "type:feature", "description": "Feature requests"},
        {"name": "documentation", "description": "Docs updates"},
    ]

    # (title, body, labels that must be returned and applied to the issue)
    scenarios = [
        (
            "App crashes on login",
            "The app crashes after the sign-in screen.",
            ("type:bug",),
        ),
        (
            "Add dark mode support",
            "It would be great to enable a dark theme.",
            ("type:feature",),
        ),
        (
            "Bug in dark mode feature",
            "The new theme crashes on settings.",
            ("type:bug", "type:feature"),
        ),
    ]

    for title, body, expected in scenarios:
        issue = integration_layer.IssueData(title=title, body=body)
        returned = integration_layer.label_issue(issue, available_labels, threshold=0.8)
        for label in expected:
            assert label in returned
            assert label in issue.labels
53 changes: 53 additions & 0 deletions tests/scripts/test_label_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import types
from dataclasses import dataclass

import pytest

from scripts.langchain import label_matcher, semantic_matcher


Expand Down Expand Up @@ -56,6 +58,30 @@ def test_build_label_vector_store_returns_none_without_client(monkeypatch):
assert result is None


def test_build_label_vector_store_rejects_invalid_label_iterables():
    """``None``, a bare string and a non-iterable each raise their own message."""
    # ``match`` is a regex applied with ``re.search``. The patterns are
    # anchored and the trailing dot escaped so each case accepts only its own
    # message; unanchored, the generic pattern would also match the
    # ", not None." and ", not a string." variants.
    with pytest.raises(
        ValueError,
        match=r"^labels must be an iterable of label records, not None\.$",
    ):
        label_matcher.build_label_vector_store(None)
    with pytest.raises(
        ValueError,
        match=r"^labels must be an iterable of label records, not a string\.$",
    ):
        label_matcher.build_label_vector_store("bug")
    with pytest.raises(
        ValueError,
        match=r"^labels must be an iterable of label records\.$",
    ):
        label_matcher.build_label_vector_store(123)


def test_build_label_vector_store_rejects_invalid_label_entries():
    """A nameless mapping, a blank-named object and an int are each rejected."""
    # (single label entry, message pattern expected in the ValueError)
    cases = [
        ({"description": "missing name"}, "Label entry at index 0 is missing a name."),
        (types.SimpleNamespace(name=" "), "Label entry at index 0 has an empty name."),
        (123, "Unsupported label entry at index 0: int."),
    ]
    for entry, message in cases:
        with pytest.raises(ValueError, match=message):
            label_matcher.build_label_vector_store([entry])


def test_find_similar_labels_filters_by_relevance_score():
store = types.SimpleNamespace(
similarity_search_with_relevance_scores=lambda query, k=5: [
Expand Down Expand Up @@ -94,6 +120,33 @@ def test_find_similar_labels_converts_distance_scores():
assert matches[0].score_type == "distance"


def test_find_similar_labels_rejects_invalid_inputs():
    """A non-store first argument and a ``None`` query both raise ``ValueError``."""
    valid_store = label_matcher.LabelVectorStore(
        store=object(),
        provider="unit-test",
        model="unit-test-model",
        labels=[],
    )
    not_a_store = object()

    with pytest.raises(ValueError, match="label_store must be a LabelVectorStore instance."):
        label_matcher.find_similar_labels(not_a_store, "bug")

    with pytest.raises(ValueError, match="query must be a string."):
        label_matcher.find_similar_labels(valid_store, None)


def test_find_similar_labels_handles_missing_metadata_name():
    """A hit built as ``DummyDoc("type:bug", None)`` still yields the ``type:bug`` label."""

    def fake_search(query, k=5):
        # Single hit scored above the 0.8 threshold used below.
        return [(DummyDoc("type:bug", None), 0.92)]

    backend = types.SimpleNamespace(similarity_search_with_relevance_scores=fake_search)
    vector_store = label_matcher.LabelVectorStore(
        store=backend,
        provider="unit-test",
        model="unit-test-model",
        labels=[],
    )

    found = label_matcher.find_similar_labels(vector_store, "bug", threshold=0.8)

    assert len(found) == 1
    assert found[0].label.name == "type:bug"


def test_resolve_label_match_prefers_exact_for_short_labels():
label = label_matcher.LabelRecord(name="CI", description="Pipeline failures")
vector_store = label_matcher.LabelVectorStore(
Expand Down
Loading