Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- **Unsupported-language alerts** — when lingua detects a language not in the regconfig mapping, write tools fire an info-level structural alert (`unsupported-language-{iso}`). One alert per unsupported language (upsert, not duplicate). Signals demand for Phase 3 non-Western language support. New `detect_language_iso` function in `language.py` returns raw ISO code even for unmapped languages. Refs [#264](https://github.com/cmeans/mcp-awareness/issues/264), [#238](https://github.com/cmeans/mcp-awareness/issues/238).
- **Language backfill migration** — Alembic data migration detects language on existing entries via lingua-py and updates the `language` column. Processes in batches, idempotent, gracefully skips if lingua is not installed. Refs [#263](https://github.com/cmeans/mcp-awareness/issues/263), [#238](https://github.com/cmeans/mcp-awareness/issues/238).
- **`get_knowledge` language filter** — optional `language` parameter (ISO 639-1) filters entries by their stored regconfig. Refs [#262](https://github.com/cmeans/mcp-awareness/issues/262), [#238](https://github.com/cmeans/mcp-awareness/issues/238).
- **`search` tool** — renamed from `semantic_search` to reflect the hybrid vector + FTS nature. `semantic_search` remains as a deprecated alias (delegates to `search`) and will be removed in a future release. Refs [#261](https://github.com/cmeans/mcp-awareness/issues/261), [#238](https://github.com/cmeans/mcp-awareness/issues/238).
Expand Down
19 changes: 19 additions & 0 deletions src/mcp_awareness/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,25 @@ def detect_language(text: str) -> str | None:
return ISO_639_1_TO_REGCONFIG.get(iso)


def detect_language_iso(text: str) -> str | None:
    """Return the lowercase ISO 639-1 code of the language detected in *text*.

    Differs from :func:`detect_language` in that the code is handed back
    as-is instead of being looked up in :data:`ISO_639_1_TO_REGCONFIG`, so a
    language with no regconfig mapping is still reported. The
    unsupported-language alert path uses this to name the language it saw.

    Returns ``None`` when *text* is empty, when its stripped length is below
    ``_MIN_DETECTION_LENGTH``, when no detector is available, or when the
    detector does not settle on a language.
    """
    # Guard first: empty or too-short input is never handed to the detector.
    if not text:
        return None
    if len(text.strip()) < _MIN_DETECTION_LENGTH:
        return None

    lingua = _get_detector()
    if lingua is None:
        return None

    # The detector receives the original (unstripped) text.
    language = lingua.detect_language_of(text)
    return None if language is None else language.iso_code_639_1.name.lower()


def resolve_language(
explicit: str | None = None,
user_preference: str | None = None,
Expand Down
8 changes: 5 additions & 3 deletions src/mcp_awareness/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
from enum import Enum
from typing import Any

from .language import SIMPLE


class EntryType(str, Enum):
STATUS = "status"
Expand Down Expand Up @@ -108,7 +110,7 @@ class Entry:
expires: datetime | None = None
data: dict[str, Any] = field(default_factory=dict)
logical_key: str | None = None
language: str = "simple"
language: str = SIMPLE

def to_dict(self) -> dict[str, Any]:
d: dict[str, Any] = {
Expand All @@ -123,7 +125,7 @@ def to_dict(self) -> dict[str, Any]:
}
if self.logical_key is not None:
d["logical_key"] = self.logical_key
if self.language != "simple":
if self.language != SIMPLE:
d["language"] = self.language
return d

Expand Down Expand Up @@ -167,7 +169,7 @@ def from_dict(cls, d: dict[str, Any]) -> Entry:
expires=ensure_dt_optional(d.get("expires")),
data=d.get("data", {}),
logical_key=d.get("logical_key"),
language=d.get("language", "simple"),
language=d.get("language", SIMPLE),
)

def is_expired(self) -> bool:
Expand Down
3 changes: 2 additions & 1 deletion src/mcp_awareness/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from datetime import datetime
from typing import Any, Protocol, runtime_checkable

from .language import SIMPLE
from .schema import Entry, EntryType

# How long soft-deleted entries remain recoverable before auto-purge
Expand Down Expand Up @@ -325,7 +326,7 @@ def semantic_search(
embedding: list[float],
model: str,
query_text: str = "",
query_language: str = "simple",
query_language: str = SIMPLE,
entry_type: EntryType | None = None,
source: str | None = None,
tags: list[str] | None = None,
Expand Down
50 changes: 47 additions & 3 deletions src/mcp_awareness/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,52 @@
_validate_pagination,
_validate_timestamp,
)
from .language import resolve_language
from .language import ISO_639_1_TO_REGCONFIG, SIMPLE, detect_language_iso, resolve_language
from .schema import Entry, EntryType, make_id, now_utc, to_iso

logger = logging.getLogger(__name__)


def _check_unsupported_language(text: str, resolved: str) -> None:
    """Record an info-level alert for a detected language that lacks a regconfig.

    The alert is written only when both conditions hold:

    * ``resolved`` equals ``SIMPLE``, i.e. the caller ended up on the fallback;
    * :func:`detect_language_iso` names a language for ``text`` that is not a
      key of ``ISO_639_1_TO_REGCONFIG``.

    The alert id is derived from the ISO code alone and the store call is an
    upsert, so repeated sightings of one language share a single alert id.

    Detection is run here a second time instead of having the raw ISO code
    threaded through ``resolve_language``; callers invoke ``resolve_language``
    immediately beforehand, which presumably already ran detection. The
    original author judged the repeat cost negligible (not verifiable from
    this block) and preferred it to widening that function's API.

    Failures while writing the alert are logged at debug level and swallowed
    so that the surrounding write tool is never broken by alerting.
    """
    if resolved != SIMPLE:
        return

    iso = detect_language_iso(text)
    if iso is None:
        return
    if iso in ISO_639_1_TO_REGCONFIG:
        return

    alert_key = f"unsupported-language-{iso}"
    message = (
        f"Detected language '{iso}' has no Postgres regconfig — "
        f"entry stored with 'simple' fallback. If this language appears "
        f"frequently, consider adding support in a future release."
    )
    payload = {
        "alert_id": alert_key,
        "level": "info",
        "alert_type": "structural",
        "message": message,
        "resolved": False,
        "detected_iso": iso,
    }

    # Best effort: owner lookup and the store write both sit inside the try.
    try:
        _srv.store.upsert_alert(
            _srv._owner_id(),
            source="mcp-awareness",
            tags=["language", "unsupported"],
            alert_id=alert_key,
            data=payload,
        )
    except Exception:
        logger.debug("Failed to fire unsupported-language alert for %s", iso, exc_info=True)


# ---------------------------------------------------------------------------
# Read tools (mirrors of resources, for MCP clients that only support tools)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -248,7 +288,7 @@ async def get_knowledge(
embedding=hint_vec[0],
model=provider.model_name,
query_text=hint,
query_language="simple",
query_language=SIMPLE,
source=source,
tags=tags,
entry_type=hint_et,
Expand Down Expand Up @@ -388,6 +428,7 @@ async def learn_pattern(
now = now_utc()
text_for_detect = f"{description} {effect or ''}"
resolved_lang = resolve_language(explicit=language, text_for_detection=text_for_detect)
_check_unsupported_language(text_for_detect, resolved_lang)
entry = Entry(
id=make_id(),
type=EntryType.PATTERN,
Expand Down Expand Up @@ -456,6 +497,7 @@ async def remember(
data["content_type"] = content_type
text_for_detect = f"{description} {content or ''}"
resolved_lang = resolve_language(explicit=language, text_for_detection=text_for_detect)
_check_unsupported_language(text_for_detect, resolved_lang)
entry = Entry(
id=make_id(),
type=EntryType.NOTE,
Expand Down Expand Up @@ -634,6 +676,7 @@ async def add_context(
now = now_utc()
expires = now + timedelta(days=expires_days)
resolved_lang = resolve_language(explicit=language, text_for_detection=description)
_check_unsupported_language(description, resolved_lang)
entry = Entry(
id=make_id(),
type=EntryType.CONTEXT,
Expand Down Expand Up @@ -1002,6 +1045,7 @@ async def remind(
now = now_utc()
deliver_at_dt = _validate_timestamp(deliver_at, "deliver_at")
resolved_lang = resolve_language(explicit=language, text_for_detection=goal)
_check_unsupported_language(goal, resolved_lang)
entry = Entry(
id=make_id(),
type=EntryType.INTENTION,
Expand Down Expand Up @@ -1113,7 +1157,7 @@ async def search(
since_dt = _validate_timestamp(since, "since")
until_dt = _validate_timestamp(until, "until")
et = _parse_entry_type(entry_type)
query_language = iso_to_regconfig(language) if language else "simple"
query_language = iso_to_regconfig(language) if language else SIMPLE
provider = _srv._get_embedding_provider()
if not provider.is_available():
_error_response(
Expand Down
47 changes: 47 additions & 0 deletions tests/test_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,3 +300,50 @@ def test_detect_language_returns_none_when_detector_unavailable(self) -> None:
with patch.object(lang_mod, "_get_detector", return_value=None):
text = "The quick brown fox jumps over the lazy dog and runs."
assert lang_mod.detect_language(text) is None


class TestDetectLanguageIso:
    """Exercise ``detect_language_iso``, which reports the raw ISO code."""

    @staticmethod
    def _fake_language(code_name: str) -> object:
        """Build a stand-in detection result whose ISO code enum is named *code_name*."""
        code = type("Code", (), {"name": code_name})()
        return type("MockLang", (), {"iso_code_639_1": code})()

    @staticmethod
    def _fake_detector(result: object) -> object:
        """Build a stand-in detector whose ``detect_language_of`` always yields *result*."""
        return type("MockDetector", (), {"detect_language_of": lambda self, t: result})()

    def test_returns_none_for_short_text(self) -> None:
        detected = lang_mod.detect_language_iso("hi")
        assert detected is None

    def test_returns_none_for_empty_text(self) -> None:
        detected = lang_mod.detect_language_iso("")
        assert detected is None

    def test_returns_none_when_detector_unavailable(self) -> None:
        with patch.object(lang_mod, "_get_detector", return_value=None):
            detected = lang_mod.detect_language_iso("A long enough text for detection")
        assert detected is None

    def test_returns_iso_for_mapped_language(self) -> None:
        """A language that is in the mapping comes back as its lowercase ISO code."""
        detector = self._fake_detector(self._fake_language("EN"))
        with patch.object(lang_mod, "_get_detector", return_value=detector):
            detected = lang_mod.detect_language_iso(
                "A sufficiently long English text for detection"
            )
        assert detected == "en"

    def test_returns_iso_for_unmapped_language(self) -> None:
        """A language outside the mapping (e.g. Japanese) still yields its ISO code."""
        detector = self._fake_detector(self._fake_language("JA"))
        with patch.object(lang_mod, "_get_detector", return_value=detector):
            detected = lang_mod.detect_language_iso(
                "日本語のサンプルテキストです。これは十分に長いテストテキストです。"
            )
        assert detected == "ja"

    def test_returns_none_when_detection_uncertain(self) -> None:
        """If the detector returns ``None``, so does ``detect_language_iso``."""
        detector = self._fake_detector(None)
        with patch.object(lang_mod, "_get_detector", return_value=detector):
            detected = lang_mod.detect_language_iso(
                "Some ambiguous text that confuses the detector"
            )
        assert detected is None
76 changes: 74 additions & 2 deletions tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

from mcp_awareness import server as server_mod
from mcp_awareness.embeddings import OllamaEmbedding
from mcp_awareness.language import SIMPLE
from mcp_awareness.postgres_store import PostgresStore
from mcp_awareness.schema import Entry, EntryType, make_id, now_utc
from mcp_awareness.store import Store
Expand Down Expand Up @@ -1017,7 +1018,7 @@ async def test_get_knowledge_language_simple_filter(self) -> None:
source="test", tags=["lang"], description="English note", language="en"
)
await server_mod.remember(source="test", tags=["lang"], description="Simple note")
result = await server_mod.get_knowledge(language="simple")
result = await server_mod.get_knowledge(language=SIMPLE)
entries = json.loads(result)["entries"]
assert len(entries) == 1
assert entries[0]["data"]["description"] == "Simple note"
Expand Down Expand Up @@ -1421,7 +1422,7 @@ async def test_update_language(self) -> None:
assert data["status"] == "ok"
entries = json.loads(await server_mod.get_knowledge(include_history="true"))["entries"]
assert entries[0].get("language") == "french"
assert entries[0]["data"]["changelog"][0]["changed"]["language"] == "simple"
assert entries[0]["data"]["changelog"][0]["changed"]["language"] == SIMPLE

@pytest.mark.anyio
async def test_update_noop_same_value(self) -> None:
Expand Down Expand Up @@ -1494,6 +1495,77 @@ async def test_multiple_updates_accumulatechangelog(self) -> None:
assert changelog[1]["changed"]["description"] == "v2"


class TestUnsupportedLanguageAlert:
    """Behaviour of the ``unsupported-language-*`` alert raised by write tools."""

    @staticmethod
    def _language_alerts() -> list:
        """Return the active alerts whose ``alert_id`` marks an unsupported language."""
        return [
            alert
            for alert in _store().get_active_alerts(TEST_OWNER)
            if "unsupported-language" in alert.data.get("alert_id", "")
        ]

    @pytest.mark.anyio
    async def test_fires_alert_for_unmapped_language(self, monkeypatch) -> None:
        """When lingua detects a language not in the mapping, an alert is fired."""
        import mcp_awareness.tools as tools_mod

        # Force the fallback path: resolution yields SIMPLE ...
        monkeypatch.setattr(tools_mod, "resolve_language", lambda **kwargs: SIMPLE)
        # ... while raw detection reports Japanese, which has no regconfig.
        monkeypatch.setattr(tools_mod, "detect_language_iso", lambda text: "ja")
        await server_mod.remember(
            source="test",
            tags=["lang"],
            description="This is a long enough test sentence for language detection to trigger",
        )
        lang_alerts = self._language_alerts()
        assert len(lang_alerts) == 1
        assert lang_alerts[0].data["detected_iso"] == "ja"
        assert lang_alerts[0].data["level"] == "info"

    @pytest.mark.anyio
    async def test_no_alert_when_language_is_mapped(self, monkeypatch) -> None:
        """When lingua detects a mapped language, no alert is fired."""
        import mcp_awareness.tools as tools_mod

        monkeypatch.setattr(tools_mod, "detect_language_iso", lambda text: "en")
        await server_mod.remember(
            source="test",
            tags=["lang"],
            description="This is a long enough English test sentence for detection",
        )
        assert len(self._language_alerts()) == 0

    @pytest.mark.anyio
    async def test_alert_failure_does_not_break_write(self, monkeypatch) -> None:
        """If alert firing fails, the write still succeeds."""
        import mcp_awareness.tools as tools_mod

        def _boom(*args, **kwargs):
            raise RuntimeError("boom")

        monkeypatch.setattr(tools_mod, "resolve_language", lambda **kwargs: SIMPLE)
        monkeypatch.setattr(tools_mod, "detect_language_iso", lambda text: "ja")
        # monkeypatch undoes this at teardown, so no manual restore is needed.
        monkeypatch.setattr(_store(), "upsert_alert", _boom)
        result = await server_mod.remember(
            source="test",
            tags=["lang"],
            description="Long enough text for detection to trigger the alert path",
        )
        data = json.loads(result)
        assert data["status"] == "ok"

    @pytest.mark.anyio
    async def test_no_alert_when_explicit_language_set(self) -> None:
        """When caller sets language explicitly, no alert fires even if text is foreign."""
        await server_mod.remember(
            source="test",
            tags=["lang"],
            description="日本語のテキスト。これは十分に長いテストです。",
            language="en",
        )
        assert len(self._language_alerts()) == 0


class TestGetKnowledgeHistory:
@pytest.mark.anyio
async def test_history_stripped_by_default(self) -> None:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@

import pytest

from mcp_awareness.language import SIMPLE
from mcp_awareness.schema import Entry, EntryType, make_id, now_utc

TEST_OWNER = "test-owner"
SIMPLE = "simple"

# store fixture comes from conftest.py (testcontainers Postgres)

Expand Down