Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- **`get_knowledge` language filter** — optional `language` parameter (an ISO 639-1 code, or `simple` for entries with no detected language) filters entries by their stored regconfig. Unknown codes are rejected with an `invalid_parameter` error. Refs [#262](https://github.com/cmeans/mcp-awareness/issues/262), [#238](https://github.com/cmeans/mcp-awareness/issues/238).
- **`search` tool** — renamed from `semantic_search` to reflect the hybrid vector + FTS nature. `semantic_search` remains as a deprecated alias (delegates to `search`) and will be removed in a future release. Refs [#261](https://github.com/cmeans/mcp-awareness/issues/261), [#238](https://github.com/cmeans/mcp-awareness/issues/238).
- **Regconfig validation cache** — `PostgresStore` caches valid Postgres regconfig names from `pg_ts_config` at startup. Write-time validation falls back to `'simple'` for invalid regconfigs (with one cache-refresh retry in case an extension was installed after startup). Prevents INSERT failures from invalid `language` values reaching the generated `tsv` column. Refs [#260](https://github.com/cmeans/mcp-awareness/issues/260), [#238](https://github.com/cmeans/mcp-awareness/issues/238).
- **Layer 1 hybrid retrieval wiring** — Alembic migration adds `language` (regconfig) and `tsv` (generated tsvector with weighted fields) columns to the `entries` table with GIN index. `semantic_search` SQL rewritten to a hybrid CTE fusing vector (HNSW) and lexical (FTS/GIN) branches via Reciprocal Rank Fusion (k=60). Write tools (`remember`, `add_context`, `learn_pattern`, `remind`) gain optional `language` parameter (ISO 639-1) for explicit language override; auto-detection via lingua-py falls back to `simple`. `update_entry` supports changing an entry's language. `Entry` model carries `language` field (default `"simple"`). Graceful degradation: empty FTS branch when query text is short/stop-words-only; empty vector branch when no embeddings exist. Refs [#238](https://github.com/cmeans/mcp-awareness/issues/238).
Expand Down
4 changes: 4 additions & 0 deletions src/mcp_awareness/postgres_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,7 @@ def get_knowledge(
learned_from: str | None = None,
created_after: datetime | None = None,
created_before: datetime | None = None,
language: str | None = None,
limit: int | None = None,
offset: int | None = None,
) -> list[Entry]:
Expand Down Expand Up @@ -589,6 +590,9 @@ def get_knowledge(
if created_before is not None:
clauses.append(psql.SQL("created <= %s"))
params.append(created_before)
if language is not None:
clauses.append(psql.SQL("language = %s::regconfig"))
params.append(language)
where = psql.SQL(" AND ").join(clauses)
# Push LIMIT/OFFSET to SQL unless include_history="only" (post-filter changes count)
sql_limit = limit if include_history != "only" else None
Expand Down
1 change: 1 addition & 0 deletions src/mcp_awareness/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def get_knowledge(
learned_from: str | None = None,
created_after: datetime | None = None,
created_before: datetime | None = None,
language: str | None = None,
limit: int | None = None,
offset: int | None = None,
) -> list[Entry]:
Expand Down
24 changes: 24 additions & 0 deletions src/mcp_awareness/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ async def get_knowledge(
mode: str | None = None,
limit: int | None = None,
offset: int | None = None,
language: str | None = None,
) -> str:
"""Get knowledge entries: learned patterns, historical context, preferences, notes.
Knowledge belongs to the system, not any specific agent. Call when you need
Expand Down Expand Up @@ -175,6 +176,9 @@ async def get_knowledge(
description, tags, created, updated — no content or changelog). Use 'list'
to orient before pulling full entries.
Use limit/offset for pagination (e.g., limit=10, offset=0 for first page).
language: optional ISO 639-1 code to filter entries by their stored language
(e.g., language='fr' returns only French entries). Useful for "show me all
entries in language X" queries.
Results are sorted by most recently updated first (or by relevance if hint is set).
This tool always returns JSON with a status field or an entry list.
If you receive an unstructured error, the failure is in the transport
Expand All @@ -195,6 +199,25 @@ async def get_knowledge(
created_after_dt = _validate_timestamp(created_after, "created_after")
created_before_dt = _validate_timestamp(created_before, "created_before")
et = _parse_entry_type(entry_type)
from .language import ISO_639_1_TO_REGCONFIG, SIMPLE, iso_to_regconfig

# Translate the caller-supplied language filter into the regconfig name that is
# passed to the store. None means "do not filter by language".
lang_regconfig: str | None = None
if language:
    # Validate and convert the SAME normalized value, so that input such as
    # " FR " cannot pass the membership check and then be converted from the
    # raw, un-normalized string.
    normalized = language.strip().lower()
    if normalized == SIMPLE:
        # 'simple' is accepted as-is rather than looked up as an ISO code.
        lang_regconfig = SIMPLE
    elif normalized not in ISO_639_1_TO_REGCONFIG:
        # The error message echoes the caller's original input, not the
        # normalized form, so the caller sees exactly what they sent.
        _error_response(
            "invalid_parameter",
            f"Unknown language code: '{language}'. Use ISO 639-1 codes "
            f"(e.g., 'en', 'fr', 'de'). Use language='simple' to filter "
            f"entries with no detected language.",
            retryable=False,
            param="language",
            value=language,
        )
    else:
        lang_regconfig = iso_to_regconfig(normalized)
entries = _srv.store.get_knowledge(
_srv._owner_id(),
tags=tags,
Expand All @@ -206,6 +229,7 @@ async def get_knowledge(
learned_from=learned_from,
created_after=created_after_dt,
created_before=created_before_dt,
language=lang_regconfig,
limit=limit + 1,
offset=offset,
)
Expand Down
33 changes: 33 additions & 0 deletions tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,39 @@ async def test_get_knowledge_filtered_by_tags(self) -> None:
assert len(entries) == 1
assert entries[0]["data"]["description"] == "personal pattern"

@pytest.mark.anyio
async def test_get_knowledge_filtered_by_language(self) -> None:
    """A language='fr' filter returns only the entry stored with language='fr'."""
    # Seed one English and one French entry (in that order) under the same tag.
    for text, iso_code in (("English note", "en"), ("French note", "fr")):
        await server_mod.remember(
            source="test", tags=["lang"], description=text, language=iso_code
        )

    payload = json.loads(await server_mod.get_knowledge(language="fr"))
    matches = payload["entries"]

    assert len(matches) == 1
    only_match = matches[0]
    assert only_match["data"]["description"] == "French note"
    # The entry reports the regconfig name, not the ISO code it was written with.
    assert only_match["language"] == "french"

@pytest.mark.anyio
async def test_get_knowledge_language_simple_filter(self) -> None:
    """A language='simple' filter excludes the entry written with language='en'."""
    # First entry carries an explicit language; the second passes none.
    await server_mod.remember(
        source="test",
        tags=["lang"],
        description="English note",
        language="en",
    )
    await server_mod.remember(
        source="test",
        tags=["lang"],
        description="Simple note",
    )

    response = await server_mod.get_knowledge(language="simple")
    matches = json.loads(response)["entries"]

    assert len(matches) == 1
    assert matches[0]["data"]["description"] == "Simple note"

@pytest.mark.anyio
async def test_get_knowledge_language_unknown_code_errors(self) -> None:
    """An unrecognized language code raises ToolError with code 'invalid_parameter'."""
    with pytest.raises(ToolError) as raised:
        await server_mod.get_knowledge(language="xx")

    error = _parse_tool_error(raised)["error"]
    assert error["code"] == "invalid_parameter"
    assert "Unknown language code" in error["message"]

@pytest.mark.anyio
async def test_get_knowledge_filtered_by_entry_type(self) -> None:
await server_mod.learn_pattern(source="nas", tags=["infra"], description="a pattern")
Expand Down