From 5e64104e06f2c94d6880355bc0bbec44cb16e7b4 Mon Sep 17 00:00:00 2001 From: "cmeans-claude-dev[bot]" <3223881+cmeans-claude-dev[bot]@users.noreply.github.com> Date: Sun, 12 Apr 2026 10:43:03 -0500 Subject: [PATCH 1/3] feat: get_knowledge language filter (#262) Add optional `language` parameter (ISO 639-1) to get_knowledge tool, store protocol, and PostgresStore. Maps to WHERE e.language = %s::regconfig filter. Enables queries like "show me all French entries". Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 1 + src/mcp_awareness/postgres_store.py | 4 ++++ src/mcp_awareness/store.py | 1 + src/mcp_awareness/tools.py | 8 ++++++++ tests/test_server.py | 14 ++++++++++++++ 5 files changed, 28 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d106674..d848e80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **`get_knowledge` language filter** — optional `language` parameter (ISO 639-1) filters entries by their stored regconfig. Refs [#262](https://github.com/cmeans/mcp-awareness/issues/262), [#238](https://github.com/cmeans/mcp-awareness/issues/238). - **`search` tool** — renamed from `semantic_search` to reflect the hybrid vector + FTS nature. `semantic_search` remains as a deprecated alias (delegates to `search`) and will be removed in a future release. Refs [#261](https://github.com/cmeans/mcp-awareness/issues/261), [#238](https://github.com/cmeans/mcp-awareness/issues/238). - **Regconfig validation cache** — `PostgresStore` caches valid Postgres regconfig names from `pg_ts_config` at startup. Write-time validation falls back to `'simple'` for invalid regconfigs (with one cache-refresh retry in case an extension was installed after startup). Prevents INSERT failures from invalid `language` values reaching the generated `tsv` column. Refs [#260](https://github.com/cmeans/mcp-awareness/issues/260), [#238](https://github.com/cmeans/mcp-awareness/issues/238). - **Layer 1 hybrid retrieval wiring** — Alembic migration adds `language` (regconfig) and `tsv` (generated tsvector with weighted fields) columns to the `entries` table with GIN index. `semantic_search` SQL rewritten to a hybrid CTE fusing vector (HNSW) and lexical (FTS/GIN) branches via Reciprocal Rank Fusion (k=60). Write tools (`remember`, `add_context`, `learn_pattern`, `remind`) gain optional `language` parameter (ISO 639-1) for explicit language override; auto-detection via lingua-py falls back to `simple`. `update_entry` supports changing an entry's language. `Entry` model carries `language` field (default `"simple"`). Graceful degradation: empty FTS branch when query text is short/stop-words-only; empty vector branch when no embeddings exist. Refs [#238](https://github.com/cmeans/mcp-awareness/issues/238). diff --git a/src/mcp_awareness/postgres_store.py b/src/mcp_awareness/postgres_store.py index 7431b5b..76f1739 100644 --- a/src/mcp_awareness/postgres_store.py +++ b/src/mcp_awareness/postgres_store.py @@ -550,6 +550,7 @@ def get_knowledge( learned_from: str | None = None, created_after: datetime | None = None, created_before: datetime | None = None, + language: str | None = None, limit: int | None = None, offset: int | None = None, ) -> list[Entry]: @@ -589,6 +590,9 @@ def get_knowledge( if created_before is not None: clauses.append(psql.SQL("created <= %s")) params.append(created_before) + if language is not None: + clauses.append(psql.SQL("language = %s::regconfig")) + params.append(language) where = psql.SQL(" AND ").join(clauses) # Push LIMIT/OFFSET to SQL unless include_history="only" (post-filter changes count) sql_limit = limit if include_history != "only" else None diff --git a/src/mcp_awareness/store.py b/src/mcp_awareness/store.py index 9df210a..db7aaa5 100644 --- a/src/mcp_awareness/store.py +++ b/src/mcp_awareness/store.py @@ -132,6 +132,7 @@ def get_knowledge( learned_from: str | None = None, created_after: datetime | None = None, created_before: datetime | None = None, + language: str | None = None, limit: int | None = None, offset: int | None = None, ) -> list[Entry]: diff --git a/src/mcp_awareness/tools.py b/src/mcp_awareness/tools.py index b35d346..eb9ba7d 100644 --- a/src/mcp_awareness/tools.py +++ b/src/mcp_awareness/tools.py @@ -148,6 +148,7 @@ async def get_knowledge( mode: str | None = None, limit: int | None = None, offset: int | None = None, + language: str | None = None, ) -> str: """Get knowledge entries: learned patterns, historical context, preferences, notes. Knowledge belongs to the system, not any specific agent. Call when you need @@ -175,6 +176,9 @@ async def get_knowledge( description, tags, created, updated — no content or changelog). Use 'list' to orient before pulling full entries. Use limit/offset for pagination (e.g., limit=10, offset=0 for first page). + language: optional ISO 639-1 code to filter entries by their stored language + (e.g., language='fr' returns only French entries). Useful for "show me all + entries in language X" queries. Results are sorted by most recently updated first (or by relevance if hint is set). This tool always returns JSON with a status field or an entry list. If you receive an unstructured error, the failure is in the transport @@ -195,6 +199,9 @@ async def get_knowledge( created_after_dt = _validate_timestamp(created_after, "created_after") created_before_dt = _validate_timestamp(created_before, "created_before") et = _parse_entry_type(entry_type) + from .language import iso_to_regconfig + + lang_regconfig = iso_to_regconfig(language) if language else None entries = _srv.store.get_knowledge( _srv._owner_id(), tags=tags, @@ -206,6 +213,7 @@ async def get_knowledge( learned_from=learned_from, created_after=created_after_dt, created_before=created_before_dt, + language=lang_regconfig, limit=limit + 1, offset=offset, ) diff --git a/tests/test_server.py b/tests/test_server.py index 9325e31..b1c77c5 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -997,6 +997,20 @@ async def test_get_knowledge_filtered_by_tags(self) -> None: assert len(entries) == 1 assert entries[0]["data"]["description"] == "personal pattern" + @pytest.mark.anyio + async def test_get_knowledge_filtered_by_language(self) -> None: + await server_mod.remember( + source="test", tags=["lang"], description="English note", language="en" + ) + await server_mod.remember( + source="test", tags=["lang"], description="French note", language="fr" + ) + result = await server_mod.get_knowledge(language="fr") + entries = json.loads(result)["entries"] + assert len(entries) == 1 + assert entries[0]["data"]["description"] == "French note" + assert entries[0]["language"] == "french" + @pytest.mark.anyio async def test_get_knowledge_filtered_by_entry_type(self) -> None: await server_mod.learn_pattern(source="nas", tags=["infra"], description="a pattern") From 599b78df815a1de808e56c03041522e0c7c1006c Mon Sep 17 00:00:00 2001 From: "cmeans-claude-dev[bot]" <3223881+cmeans-claude-dev[bot]@users.noreply.github.com> Date: Sun, 12 Apr 2026 11:32:54 -0500 Subject: [PATCH 2/3] fix: reject unknown language codes in get_knowledge filter (QA round 1) Unknown ISO codes (e.g., 'xx') now return an error instead of silently matching all 'simple' entries. Supports language='simple' explicitly for filtering entries with no detected language. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/mcp_awareness/tools.py | 22 +++++++++++++++++++--- tests/test_server.py | 8 ++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/mcp_awareness/tools.py b/src/mcp_awareness/tools.py index eb9ba7d..8af3aad 100644 --- a/src/mcp_awareness/tools.py +++ b/src/mcp_awareness/tools.py @@ -199,9 +199,25 @@ async def get_knowledge( created_after_dt = _validate_timestamp(created_after, "created_after") created_before_dt = _validate_timestamp(created_before, "created_before") et = _parse_entry_type(entry_type) - from .language import iso_to_regconfig - - lang_regconfig = iso_to_regconfig(language) if language else None + from .language import ISO_639_1_TO_REGCONFIG, iso_to_regconfig + + lang_regconfig: str | None = None + if language: + normalized = language.strip().lower() + if normalized == "simple": + lang_regconfig = "simple" + elif normalized not in ISO_639_1_TO_REGCONFIG: + _error_response( + "invalid_parameter", + f"Unknown language code: '{language}'. Use ISO 639-1 codes " + f"(e.g., 'en', 'fr', 'de'). Use language='simple' to filter " + f"entries with no detected language.", + retryable=False, + param="language", + value=language, + ) + else: + lang_regconfig = iso_to_regconfig(language) entries = _srv.store.get_knowledge( _srv._owner_id(), tags=tags, diff --git a/tests/test_server.py b/tests/test_server.py index b1c77c5..ba5285b 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -1011,6 +1011,14 @@ async def test_get_knowledge_filtered_by_language(self) -> None: assert entries[0]["data"]["description"] == "French note" assert entries[0]["language"] == "french" + @pytest.mark.anyio + async def test_get_knowledge_language_unknown_code_errors(self) -> None: + with pytest.raises(ToolError) as exc_info: + await server_mod.get_knowledge(language="xx") + body = _parse_tool_error(exc_info) + assert body["error"]["code"] == "invalid_parameter" + assert "Unknown language code" in body["error"]["message"] + @pytest.mark.anyio async def test_get_knowledge_filtered_by_entry_type(self) -> None: await server_mod.learn_pattern(source="nas", tags=["infra"], description="a pattern") From d3aa98ac10131bd973237f2cff96732dad6178a2 Mon Sep 17 00:00:00 2001 From: "cmeans-claude-dev[bot]" <3223881+cmeans-claude-dev[bot]@users.noreply.github.com> Date: Sun, 12 Apr 2026 13:29:27 -0500 Subject: [PATCH 3/3] fix: use SIMPLE constant, add simple filter test, fix codecov gap (QA round 1) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/mcp_awareness/tools.py | 6 +++--- tests/test_server.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/mcp_awareness/tools.py b/src/mcp_awareness/tools.py index 8af3aad..4e1224a 100644 --- a/src/mcp_awareness/tools.py +++ b/src/mcp_awareness/tools.py @@ -199,13 +199,13 @@ async def get_knowledge( created_after_dt = _validate_timestamp(created_after, "created_after") created_before_dt = _validate_timestamp(created_before, "created_before") et = _parse_entry_type(entry_type) - from .language import ISO_639_1_TO_REGCONFIG, iso_to_regconfig + from .language import ISO_639_1_TO_REGCONFIG, SIMPLE, iso_to_regconfig lang_regconfig: str | None = None if language: normalized = language.strip().lower() - if normalized == "simple": - lang_regconfig = "simple" + if normalized == SIMPLE: + lang_regconfig = SIMPLE elif normalized not in ISO_639_1_TO_REGCONFIG: _error_response( "invalid_parameter", diff --git a/tests/test_server.py b/tests/test_server.py index ba5285b..324470f 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -1011,6 +1011,17 @@ async def test_get_knowledge_filtered_by_language(self) -> None: assert entries[0]["data"]["description"] == "French note" assert entries[0]["language"] == "french" + @pytest.mark.anyio + async def test_get_knowledge_language_simple_filter(self) -> None: + await server_mod.remember( + source="test", tags=["lang"], description="English note", language="en" + ) + await server_mod.remember(source="test", tags=["lang"], description="Simple note") + result = await server_mod.get_knowledge(language="simple") + entries = json.loads(result)["entries"] + assert len(entries) == 1 + assert entries[0]["data"]["description"] == "Simple note" + @pytest.mark.anyio async def test_get_knowledge_language_unknown_code_errors(self) -> None: with pytest.raises(ToolError) as exc_info: