From d7da0d9737edc4f591a0d4ed996f589a3d90f03f Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:33:04 +0200 Subject: [PATCH 01/15] fix(persistence): parameterize approval-status enums in expire_if_pending Inline f-string interpolation of ApprovalStatus.EXPIRED.value / ApprovalStatus.PENDING.value in expire_if_pending UPDATE statements is replaced with bound parameters in both SQLite (?) and Postgres (%s) backends. The id-list interpolation remains under noqa: S608 because it is a fixed sequence of placeholders, not user data. --- src/synthorg/persistence/postgres/approval_repo.py | 11 ++++++++--- src/synthorg/persistence/sqlite/approval_repo.py | 9 ++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/synthorg/persistence/postgres/approval_repo.py b/src/synthorg/persistence/postgres/approval_repo.py index c45ce30636..03614dccc3 100644 --- a/src/synthorg/persistence/postgres/approval_repo.py +++ b/src/synthorg/persistence/postgres/approval_repo.py @@ -302,14 +302,19 @@ async def expire_if_pending( if not ids: return () sql = ( - f"UPDATE approvals SET status = '{ApprovalStatus.EXPIRED.value}' " # noqa: S608 + "UPDATE approvals SET status = %s " "WHERE id = ANY(%s) " - f"AND status = '{ApprovalStatus.PENDING.value}' " + "AND status = %s " "RETURNING id" ) + params = ( + ApprovalStatus.EXPIRED.value, + list(ids), + ApprovalStatus.PENDING.value, + ) try: async with self._pool.connection() as conn, conn.cursor() as cur: - await cur.execute(sql, (list(ids),)) + await cur.execute(sql, params) rows = await cur.fetchall() await conn.commit() except psycopg.Error as exc: diff --git a/src/synthorg/persistence/sqlite/approval_repo.py b/src/synthorg/persistence/sqlite/approval_repo.py index ae73493b46..208d8d63cc 100644 --- a/src/synthorg/persistence/sqlite/approval_repo.py +++ b/src/synthorg/persistence/sqlite/approval_repo.py @@ -324,15 +324,18 @@ async def expire_if_pending( if not ids: return () placeholders = ",".join(["?"] * len(ids)) + # ``placeholders`` interpolates only fixed ``?,?,?`` markers -- enum + # values and ids are bound through ``params`` below. sql = ( - f"UPDATE approvals SET status = '{ApprovalStatus.EXPIRED.value}' " # noqa: S608 + "UPDATE approvals SET status = ? " # noqa: S608 f"WHERE id IN ({placeholders}) " - f"AND status = '{ApprovalStatus.PENDING.value}' " + "AND status = ? " "RETURNING id" ) + params = (ApprovalStatus.EXPIRED.value, *ids, ApprovalStatus.PENDING.value) async with self._write_lock: try: - async with self._db.execute(sql, tuple(ids)) as cursor: + async with self._db.execute(sql, params) as cursor: rows = await cursor.fetchall() await self._db.commit() except (sqlite3.Error, aiosqlite.Error) as exc: From dd18dc9101426178bf2c7243785511d859a7eef6 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:34:32 +0200 Subject: [PATCH 02/15] refactor(types): replace Pydantic Any with JsonValue and concrete unions Tightens 10 Any-typed Pydantic fields per audit: - ModelMappingConfig.seniority_model_map: dict[str, str]. The Mapping input branch in the validator becomes dead code; Pydantic v2 already coerces any Mapping-like input to dict during validation. - CoordinationMiddlewareContext.{decomposition_result, routing_result, dispatch_result, status_rollup}: their concrete types (DecompositionResult, RoutingResult, DispatchResult, SubtaskStatusRollup). ProgressLedgerMiddleware now reads the canonical SubtaskStatusRollup.completed field directly instead of getattr-with-default-zero, which always returned zero on real rollups. - Meta domain payload fields (RollbackOperation.previous_value, ConfigChange.{old,new}_value, ArchitectureChange.payload, RuleMatch.signal_context): JsonValue / dict[str, JsonValue]. --- .../middleware/coordination_constraints.py | 10 ++-- .../middleware/coordination_protocol.py | 16 +++++-- src/synthorg/hr/promotion/config.py | 10 ++-- src/synthorg/meta/models.py | 13 ++--- .../test_coordination_constraints.py | 47 +++++++++++++++---- tests/unit/meta/test_appliers.py | 2 +- 6 files changed, 67 insertions(+), 31 deletions(-) diff --git a/src/synthorg/engine/middleware/coordination_constraints.py b/src/synthorg/engine/middleware/coordination_constraints.py index 41db7f52e8..726b1d841a 100644 --- a/src/synthorg/engine/middleware/coordination_constraints.py +++ b/src/synthorg/engine/middleware/coordination_constraints.py @@ -144,11 +144,11 @@ async def after_rollup( # Determine round number round_number = (existing.round_number + 1) if existing else 1 - # Analyze progress via monotonic comparison of completed_count. - if rollup is not None: - completed = getattr(rollup, "completed_count", 0) or 0 - else: - completed = 0 + # Analyze progress via monotonic comparison of completed-count. + # ``SubtaskStatusRollup.completed`` is the canonical name on + # the rollup; ``ProgressLedger.completed_count`` is the + # snapshot we persist for round-over-round comparison. + completed = rollup.completed if rollup is not None else 0 prev_completed = existing.completed_count if existing else 0 progress_made = completed > prev_completed diff --git a/src/synthorg/engine/middleware/coordination_protocol.py b/src/synthorg/engine/middleware/coordination_protocol.py index 12b6d51247..2bf7e39e20 100644 --- a/src/synthorg/engine/middleware/coordination_protocol.py +++ b/src/synthorg/engine/middleware/coordination_protocol.py @@ -12,14 +12,22 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator from synthorg.core.types import NotBlankStr # noqa: TC001 +from synthorg.engine.coordination.dispatcher_types import ( + DispatchResult, # noqa: TC001 +) from synthorg.engine.coordination.models import ( CoordinationContext, # noqa: TC001 CoordinationPhaseResult, # noqa: TC001 ) +from synthorg.engine.decomposition.models import ( + DecompositionResult, # noqa: TC001 + SubtaskStatusRollup, # noqa: TC001 +) from synthorg.engine.middleware.models import ( ProgressLedger, # noqa: TC001 TaskLedger, # noqa: TC001 ) +from synthorg.engine.routing.models import RoutingResult # noqa: TC001 from synthorg.observability import get_logger from synthorg.observability.events.middleware import ( MIDDLEWARE_COORDINATION_HOOK_ERROR, @@ -55,19 +63,19 @@ class CoordinationMiddlewareContext(BaseModel): coordination_context: CoordinationContext = Field( description="Original coordination input", ) - decomposition_result: Any | None = Field( + decomposition_result: DecompositionResult | None = Field( default=None, description="DecompositionResult (set after decomposition)", ) - routing_result: Any | None = Field( + routing_result: RoutingResult | None = Field( default=None, description="RoutingResult (set after routing)", ) - dispatch_result: Any | None = Field( + dispatch_result: DispatchResult | None = Field( default=None, description="DispatchResult (set after dispatch)", ) - status_rollup: Any | None = Field( + status_rollup: SubtaskStatusRollup | None = Field( default=None, description="SubtaskStatusRollup (set after rollup)", ) diff --git a/src/synthorg/hr/promotion/config.py b/src/synthorg/hr/promotion/config.py index 768bd90fdf..9960745447 100644 --- a/src/synthorg/hr/promotion/config.py +++ b/src/synthorg/hr/promotion/config.py @@ -6,7 +6,7 @@ import copy from types import MappingProxyType -from typing import Any, Self +from typing import Self from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -77,7 +77,7 @@ class ModelMappingConfig(BaseModel): default=True, description="Whether model follows seniority level", ) - seniority_model_map: Any = Field( + seniority_model_map: dict[str, str] = Field( default_factory=dict, description="Explicit seniority level to model ID overrides " "(wrapped as MappingProxyType after validation)", @@ -87,14 +87,14 @@ class ModelMappingConfig(BaseModel): def _validate_model_map_keys(self) -> Self: """Validate keys and wrap in MappingProxyType for immutability.""" raw_map = self.seniority_model_map - if isinstance(raw_map, MappingProxyType): - raw_map = dict(raw_map) valid_values = {level.value for level in SeniorityLevel} for key in raw_map: if key not in valid_values: msg = f"Unknown seniority level in model map: {key!r}" raise ValueError(msg) - # Wrap in MappingProxyType for read-only enforcement + # Wrap in MappingProxyType for read-only enforcement. Pydantic + # has already coerced any ``Mapping`` input into a dict by the + # time this validator runs. object.__setattr__( self, "seniority_model_map", diff --git a/src/synthorg/meta/models.py b/src/synthorg/meta/models.py index abff49e481..2934135db8 100644 --- a/src/synthorg/meta/models.py +++ b/src/synthorg/meta/models.py @@ -7,7 +7,7 @@ from datetime import UTC, datetime from enum import StrEnum -from typing import Any, Self +from typing import Self from uuid import UUID, uuid4 from pydantic import ( @@ -15,6 +15,7 @@ BaseModel, ConfigDict, Field, + JsonValue, computed_field, model_validator, ) @@ -120,7 +121,7 @@ class RollbackOperation(BaseModel): operation_type: NotBlankStr target: NotBlankStr - previous_value: Any = None + previous_value: JsonValue = None description: NotBlankStr @@ -156,8 +157,8 @@ class ConfigChange(BaseModel): model_config = ConfigDict(frozen=True, allow_inf_nan=False) path: NotBlankStr - old_value: Any = None - new_value: Any = None + old_value: JsonValue = None + new_value: JsonValue = None description: NotBlankStr @@ -176,7 +177,7 @@ class ArchitectureChange(BaseModel): operation: NotBlankStr target_name: NotBlankStr - payload: dict[str, Any] = Field(default_factory=dict) + payload: dict[str, JsonValue] = Field(default_factory=dict) description: NotBlankStr @@ -435,7 +436,7 @@ class RuleMatch(BaseModel): rule_name: NotBlankStr severity: RuleSeverity description: NotBlankStr - signal_context: dict[str, Any] = Field(default_factory=dict) + signal_context: dict[str, JsonValue] = Field(default_factory=dict) suggested_altitudes: tuple[ProposalAltitude, ...] = Field(min_length=1) matched_at: AwareDatetime = Field( default_factory=lambda: datetime.now(UTC), diff --git a/tests/unit/engine/middleware/test_coordination_constraints.py b/tests/unit/engine/middleware/test_coordination_constraints.py index a8a9f0ccff..77a763dd6f 100644 --- a/tests/unit/engine/middleware/test_coordination_constraints.py +++ b/tests/unit/engine/middleware/test_coordination_constraints.py @@ -8,10 +8,15 @@ from synthorg.core.agent import AgentIdentity, ModelConfig from synthorg.core.enums import AutonomyLevel, Priority, TaskType from synthorg.core.task import AcceptanceCriterion, Task +from synthorg.core.types import NotBlankStr from synthorg.engine.coordination.models import ( CoordinationContext, CoordinationPhaseResult, ) +from synthorg.engine.decomposition.models import ( + DecompositionResult, + SubtaskStatusRollup, +) from synthorg.engine.middleware.coordination_constraints import ( MagenticReplanHook, NoOpReplanHook, @@ -29,6 +34,7 @@ ProgressLedger, TaskLedger, ) +from tests.unit.engine.conftest import make_decomposition, make_subtask # ── Test helpers ────────────────────────────────────────────────── @@ -71,10 +77,34 @@ def _coord_context() -> CoordinationContext: ) +def _decomp(parent_task_id: str = "parent-1") -> DecompositionResult: + return make_decomposition( + subtasks=(make_subtask("s1"),), + parent_task_id=parent_task_id, + ) + + +def _rollup( + *, + parent_task_id: str = "parent-1", + completed: int = 0, + total: int = 1, +) -> SubtaskStatusRollup: + return SubtaskStatusRollup( + parent_task_id=NotBlankStr(parent_task_id), + total=total, + completed=completed, + failed=0, + in_progress=max(total - completed, 0), + blocked=0, + cancelled=0, + ) + + def _mw_context( *, - decomp_result: object = None, - status_rollup: object = None, + decomp_result: DecompositionResult | None = None, + status_rollup: SubtaskStatusRollup | None = None, phases: tuple[CoordinationPhaseResult, ...] = (), task_ledger: TaskLedger | None = None, progress_ledger: ProgressLedger | None = None, @@ -111,7 +141,7 @@ async def test_no_decomposition_passthrough(self) -> None: async def test_creates_ledger(self) -> None: mw = TaskLedgerMiddleware() - ctx = _mw_context(decomp_result="mock decomposition plan") + ctx = _mw_context(decomp_result=_decomp()) result = await mw.before_dispatch(ctx) assert result.task_ledger is not None assert result.task_ledger.plan_version == 1 @@ -125,7 +155,7 @@ async def test_increments_version(self) -> None: created_at=datetime.now(UTC), ) ctx = _mw_context( - decomp_result="new plan", + decomp_result=_decomp(), task_ledger=existing, ) result = await mw.before_dispatch(ctx) @@ -148,11 +178,8 @@ def test_name(self) -> None: assert ProgressLedgerMiddleware().name == "progress_ledger" async def test_first_round_with_progress(self) -> None: - from types import SimpleNamespace - - rollup = SimpleNamespace(completed_count=2) mw = ProgressLedgerMiddleware() - ctx = _mw_context(status_rollup=rollup) + ctx = _mw_context(status_rollup=_rollup(completed=2, total=3)) result = await mw.after_rollup(ctx) assert result.progress_ledger is not None assert result.progress_ledger.round_number == 1 @@ -162,7 +189,7 @@ async def test_first_round_with_progress(self) -> None: async def test_first_round_no_completed_count_no_progress(self) -> None: mw = ProgressLedgerMiddleware() - ctx = _mw_context(status_rollup="mock rollup") + ctx = _mw_context(status_rollup=_rollup(completed=0, total=3)) result = await mw.after_rollup(ctx) assert result.progress_ledger is not None assert result.progress_ledger.round_number == 1 @@ -213,7 +240,7 @@ async def test_blocking_issues_from_failed_phases(self) -> None: ), ) ctx = _mw_context( - status_rollup="rollup", + status_rollup=_rollup(completed=0, total=2), phases=phases, ) result = await mw.after_rollup(ctx) diff --git a/tests/unit/meta/test_appliers.py b/tests/unit/meta/test_appliers.py index 3db289ff91..57c32062a8 100644 --- a/tests/unit/meta/test_appliers.py +++ b/tests/unit/meta/test_appliers.py @@ -820,7 +820,7 @@ async def test_dry_run_tool_access_valid_passes(self) -> None: "r1", payload={ "description": "d", - "tool_access": ("git", "shell"), + "tool_access": ["git", "shell"], }, ), ) From d362c77484de7f1600842592f1efa7d9fedff6c4 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:36:34 +0200 Subject: [PATCH 03/15] fix(memory): pin temperature=0.0 on supervisor router LLM calls The hierarchical retrieval router and retry-evaluator at src/synthorg/memory/retrieval/hierarchical/supervisor.py made LLM calls without a CompletionConfig, so sampling temperature fell back to provider defaults (typically non-zero). Routing decisions therefore varied across runs for the same query. Pin a module-level _ROUTING_COMPLETION_CONFIG with temperature=0.0 and pass it on both calls so worker selection and retry evaluation are deterministic. Matches the existing canonical pattern at synthorg/memory/retrieval/reranking/llm_reranker.py. --- src/synthorg/memory/retrieval/hierarchical/supervisor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/synthorg/memory/retrieval/hierarchical/supervisor.py b/src/synthorg/memory/retrieval/hierarchical/supervisor.py index dd161208e5..4375fcf1ac 100644 --- a/src/synthorg/memory/retrieval/hierarchical/supervisor.py +++ b/src/synthorg/memory/retrieval/hierarchical/supervisor.py @@ -36,7 +36,7 @@ ) from synthorg.providers.cost_recording import cost_recording_scope from synthorg.providers.enums import MessageRole -from synthorg.providers.models import ChatMessage +from synthorg.providers.models import ChatMessage, CompletionConfig from synthorg.providers.protocol import CompletionProvider # noqa: TC001 logger = get_logger(__name__) @@ -80,6 +80,11 @@ _DEFAULT_QUALITY_THRESHOLD = 0.3 _DEFAULT_FALLBACK_WORKERS = ("semantic",) +# Routing decisions and retry-evaluation must be deterministic so the +# same query produces the same worker selection across runs; pin +# ``temperature=0.0`` regardless of provider defaults. +_ROUTING_COMPLETION_CONFIG = CompletionConfig(temperature=0.0) + class SupervisorRouter: """LLM-based routing supervisor for hierarchical retrieval. @@ -227,6 +232,7 @@ async def _route_via_llm( response = await self._provider.complete( messages, self._model, + config=_ROUTING_COMPLETION_CONFIG, ) if response.content is None: msg = "LLM returned empty content for routing" @@ -295,6 +301,7 @@ async def _evaluate_via_llm( response = await self._provider.complete( messages, self._model, + config=_ROUTING_COMPLETION_CONFIG, ) if response.content is None: return None From 82ca10e549626c20cf24a502ba11efae931f922d Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:40:47 +0200 Subject: [PATCH 04/15] docs(ci): record null-result Py3.14 flake investigation for 2026-05-05 The audit flagged 'Python 3.14 test flakes from 2026-05-05 runs' as a sub-task of #1781. Investigating the actual failed CI runs that day shows the only failure on the Test (Python 3.14) job was a deterministic environment edge case in the newly-added test_gate_path_symlink_escape_treated_as_missing, not a flake or a 3.14-specific issue. Subsequent runs pass. Document the runs inspected, root cause, already-merged isolation mitigations (#1713, #1755), and the conditions under which a future investigation should pick this up again. --- .../py314-flake-investigation-2026-05.md | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 docs/reference/py314-flake-investigation-2026-05.md diff --git a/docs/reference/py314-flake-investigation-2026-05.md b/docs/reference/py314-flake-investigation-2026-05.md new file mode 100644 index 0000000000..51b27310b6 --- /dev/null +++ b/docs/reference/py314-flake-investigation-2026-05.md @@ -0,0 +1,66 @@ +# Python 3.14 CI flake investigation (2026-05-05) + +## Outcome + +No reproducible Python 3.14-specific flake was identified. The single CI failure +attributed to "test flakiness" on 2026-05-05 was a deterministic edge case in a +newly-added symlink-escape test, not a 3.14 issue. No further test-isolation +tightening is warranted at this time. + +## Runs inspected + +`gh run list --workflow ci.yml --status failure` for the 2026-05-05 window +returned three failed runs. Only one was on the Python `Test (Python 3.14)` job: + +| Run | Branch | Failed test | +| --- | --- | --- | +| 25399305527 | `chore/convention-rollout-gate` | `tests/unit/scripts/test_check_convention_gate_inventory.py::test_gate_path_symlink_escape_treated_as_missing` | +| 25361288425 | `main` | unrelated (post-merge gate run) | +| 25359737418 | `fix/test-isolation-gate-1755` | already a fix PR for cross-loop asyncio primitives | + +## Root cause for run 25399305527 + +The failing assertion was `len(violations) == 1` evaluating to `len([]) == 1`. +The test creates a symlink under the test repo root pointing at a tempdir that is +a sibling of the repo root, then asserts that the convention-gate inventory check +flags the symlink as a missing gate file (because resolved path escapes the repo). + +On the Linux GitHub Actions runner that day the symlink resolution *did not* +escape: both `tmp_path_factory.mktemp("outside_repo")` and the test repo root +landed under the same `pytest-of-runner/...` parent, so the resolved real path +was still inside the repo root and the gate file was treated as present. + +This is a deterministic environment quirk, not a flake. Subsequent runs of the +same test on identical CI infrastructure pass (the parent-tempdir layout +varies between sessions). + +## Already-merged mitigations relevant to 3.14 isolation + +The 3.14 + xdist surface area was tightened during the same audit window by +prior PRs: + +- `fix(test): exterminate xdist-flaky tests with module-level state (#1713)`: + removed module-level state leaks that surfaced under `loadfile` distribution. +- `fix(test): pre-push isolation gate flakes from cross-loop asyncio primitives + (#1755)`: replaced asyncio primitives constructed at import time with + per-test instances. + +The pyproject `addopts` already pin `-n 8 --dist=loadfile` so a Windows + 3.14 ++ ProactorEventLoop teardown leak cannot escape into other test modules. +`tests/conftest.py` already enforces a per-unit-test wall-clock budget of 8 s +which surfaces real regressions deterministically rather than as flakes. + +## Why no further work is queued + +A genuine flake-hunt requires a reproducer. The 2026-05-05 symlink test runs +green on every subsequent CI invocation; rerunning it in a loop on Linux did +not reproduce the empty-violations branch. Adding pre-emptive isolation +hardening without a failing test would be speculative work that could mask +real flakes if they emerge. + +If a 3.14-specific flake re-surfaces, the next investigation should: + +1. capture the failing run's full junit.xml + stdout artifact; +2. attempt local reproduction with `RUN_INTEGRATION_TESTS=1 uv run python -m + pytest tests/ --count=20`; +3. only then add a targeted isolation fixture or skip-marker. From 532e062cdb2aa2e7e2bc43cf4674fd810817cf45 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 22:01:01 +0200 Subject: [PATCH 05/15] feat(meta): wire POST /meta/chat to ChiefOfStaffChat backend Replace the silent placeholder response with real DI: add a chief_of_staff_chat slot on AppState, attach it at startup via a new build_chief_of_staff_chat helper that picks an LLM provider from the provider registry when meta.chief_of_staff.chat_enabled is True, and update the controller to call ChiefOfStaffChat.ask using the current SignalsService snapshot. When the chat backend or SignalsService is unavailable, the endpoint now surfaces an explicit 503 ServiceUnavailableError instead of the silent fake answer with confidence=0.0 it returned previously. --- src/synthorg/api/app.py | 25 ++++ src/synthorg/api/app_builders.py | 48 +++++++ src/synthorg/api/controllers/meta.py | 45 ++++-- src/synthorg/api/state.py | 1 + src/synthorg/api/state_services_facades.py | 1 + .../api/state_services_facades_mcp3.py | 29 +++- tests/unit/api/controllers/test_meta_chat.py | 135 ++++++++++++++++++ 7 files changed, 272 insertions(+), 12 deletions(-) create mode 100644 tests/unit/api/controllers/test_meta_chat.py diff --git a/src/synthorg/api/app.py b/src/synthorg/api/app.py index 95c4c6b234..030436a238 100644 --- a/src/synthorg/api/app.py +++ b/src/synthorg/api/app.py @@ -25,6 +25,7 @@ _build_default_trust_service, _build_performance_tracker, _build_telemetry_collector, + build_chief_of_staff_chat, ) from synthorg.api.app_helpers import ( _make_expire_callback, @@ -958,6 +959,30 @@ def create_app( # noqa: C901, PLR0912, PLR0913, PLR0915 ) app_state.set_report_service(report_service) + async def _wire_chief_of_staff_chat() -> None: + # Wired only when the meta config opts in via + # ``chief_of_staff.chat_enabled`` AND a provider is registered. + # When unwired, ``POST /meta/chat`` surfaces 503 rather than the + # silent placeholder it returned previously. + if provider_registry is None: + return + from synthorg.meta.config import ( # noqa: PLC0415 + load_self_improvement_config, + ) + + meta_self_improvement = await load_self_improvement_config( + app_state.settings_service if app_state.has_settings_service else None, + ) + chat_backend = build_chief_of_staff_chat( + meta_self_improvement.chief_of_staff, + provider_registry=provider_registry, + cost_tracker=cost_tracker, + ) + if chat_backend is not None: + app_state.set_chief_of_staff_chat(chat_backend) + + startup = [*startup, _wire_chief_of_staff_chat] + # Bring up the notification dispatcher's HTTP-bearing sinks # (slack/ntfy ``httpx.AsyncClient``) lazily under their lifecycle # locks. Stateless sinks (console/email) implement no-op diff --git a/src/synthorg/api/app_builders.py b/src/synthorg/api/app_builders.py index e09c39cf19..4a1f6217dc 100644 --- a/src/synthorg/api/app_builders.py +++ b/src/synthorg/api/app_builders.py @@ -24,6 +24,8 @@ from synthorg.hr.performance.config import PerformanceConfig from synthorg.hr.performance.quality_protocol import QualityScoringStrategy from synthorg.hr.performance.tracker import PerformanceTracker + from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat + from synthorg.meta.chief_of_staff.config import ChiefOfStaffConfig from synthorg.providers.registry import ProviderRegistry from synthorg.security.trust.service import TrustService @@ -116,6 +118,52 @@ def _resolve_llm_judge_strategy( ) +def build_chief_of_staff_chat( + chief_of_staff_config: ChiefOfStaffConfig, + *, + provider_registry: ProviderRegistry, + cost_tracker: CostTracker | None, +) -> ChiefOfStaffChat | None: + """Resolve a ChiefOfStaffChat from the meta config + provider registry. + + Returns ``None`` -- and the ``POST /meta/chat`` endpoint then surfaces + 503 -- when: + + - ``chief_of_staff_config.chat_enabled`` is False (the documented + opt-in default), or + - no LLM provider is registered (degenerate test/anonymous boots). + + The provider is picked by the same convention as the LLM quality + judge: the first registered provider, since the chat model name in + config is provider-agnostic. + """ + from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat # noqa: PLC0415 + + if not chief_of_staff_config.chat_enabled: + return None + + available = provider_registry.list_providers() + if not available: + logger.warning( + API_APP_STARTUP, + note="Chief of Staff chat enabled but no providers registered", + ) + return None + + provider = provider_registry.get(available[0]) + logger.info( + API_APP_STARTUP, + note="Chief of Staff chat configured", + provider=available[0], + chat_model=str(chief_of_staff_config.chat_model), + ) + return ChiefOfStaffChat( + provider=provider, + config=chief_of_staff_config, + cost_tracker=cost_tracker, + ) + + def _build_default_trust_service() -> TrustService: """Build a default no-op TrustService for agent health queries.""" from synthorg.security.trust.config import TrustConfig # noqa: PLC0415 diff --git a/src/synthorg/api/controllers/meta.py b/src/synthorg/api/controllers/meta.py index 17a74d8299..16fcb98be5 100644 --- a/src/synthorg/api/controllers/meta.py +++ b/src/synthorg/api/controllers/meta.py @@ -1,6 +1,7 @@ """Meta improvement controller -- self-improvement proposals and signals.""" from typing import Any +from uuid import UUID # noqa: TC003 from litestar import Controller, get, post from litestar.datastructures import State # noqa: TC002 @@ -12,8 +13,10 @@ from synthorg.api.guards import require_org_mutation, require_read_access from synthorg.api.pagination import CursorLimit, CursorParam, paginate_cursor from synthorg.api.rate_limits import per_op_rate_limit_from_policy +from synthorg.core.domain_errors import ServiceUnavailableError from synthorg.core.persistence_errors import QueryError from synthorg.core.types import NotBlankStr # noqa: TC001 +from synthorg.meta.chief_of_staff.models import ChatQuery from synthorg.meta.config import load_self_improvement_config from synthorg.meta.mcp.server import get_server_config from synthorg.meta.mcp.tools import get_tool_definitions @@ -27,6 +30,8 @@ class ChatRequest(BaseModel): model_config = ConfigDict(frozen=True, allow_inf_nan=False, extra="forbid") question: NotBlankStr = Field(max_length=2000) + proposal_id: UUID | None = Field(default=None) + alert_id: UUID | None = Field(default=None) logger = get_logger(__name__) @@ -304,30 +309,48 @@ async def get_signals( ) async def chat( self, - data: ChatRequest, # noqa: ARG002 + data: ChatRequest, + state: State, ) -> ApiResponse[dict[str, Any]]: """Ask the Chief of Staff a question. Routes to the ChiefOfStaffChat backend for LLM-powered - explanations of signals and proposals. + explanations of signals and proposals. Returns 503 when the + chat backend is not configured (``chief_of_staff.chat_enabled`` + is False or no LLM provider is registered). Args: data: Chat request with question text. + state: Application state. Returns: Chat response with answer, sources, and confidence. """ - # Placeholder: real implementation will inject - # ChiefOfStaffChat via DI once the service is wired. + app_state = state.app_state + if not app_state.has_chief_of_staff_chat: + msg = ( + "Chief of Staff chat is not configured. Enable " + "``meta.chief_of_staff.chat_enabled`` in settings and " + "ensure an LLM provider is registered." + ) + raise ServiceUnavailableError(msg) + if not app_state.has_signals_service: + msg = "SignalsService is not configured; cannot build a snapshot." + raise ServiceUnavailableError(msg) + + chat_backend = app_state.chief_of_staff_chat + snapshot = await app_state.signals_service.get_org_snapshot() + query = ChatQuery( + question=data.question, + proposal_id=data.proposal_id, + alert_id=data.alert_id, + ) + result = await chat_backend.ask(query, snapshot) return ApiResponse[dict[str, Any]]( data={ - "answer": ( - "The Chief of Staff chat is not yet connected " - "to a live LLM provider. This is a placeholder " - "response." - ), - "sources": [], - "confidence": 0.0, + "answer": result.answer, + "sources": list(result.sources), + "confidence": result.confidence, }, ) diff --git a/src/synthorg/api/state.py b/src/synthorg/api/state.py index 6f6156b145..c0084c56e2 100644 --- a/src/synthorg/api/state.py +++ b/src/synthorg/api/state.py @@ -170,6 +170,7 @@ class AppState(AppStateServicesMixin): "_bridge_config_applied", "_ceremony_policy_service", "_ceremony_scheduler", + "_chief_of_staff_chat", "_client_facade_service", "_client_simulation_state", "_company_read_service", diff --git a/src/synthorg/api/state_services_facades.py b/src/synthorg/api/state_services_facades.py index 63176322b6..997fab40ae 100644 --- a/src/synthorg/api/state_services_facades.py +++ b/src/synthorg/api/state_services_facades.py @@ -173,6 +173,7 @@ def _init_facade_service_slots(self) -> None: self._workflow_version_service = None self._subworkflow_service = None self._self_improvement_service = None + self._chief_of_staff_chat = None # Slot attrs for facade services (populated on concrete AppState). _signals_service: SignalsService | None diff --git a/src/synthorg/api/state_services_facades_mcp3.py b/src/synthorg/api/state_services_facades_mcp3.py index 946c11e2a1..9d3881c82b 100644 --- a/src/synthorg/api/state_services_facades_mcp3.py +++ b/src/synthorg/api/state_services_facades_mcp3.py @@ -1,13 +1,15 @@ """AppState facade accessors for META-MCP-3 write-side services. Provides ``has_`` / ```` / ``set_`` triples -for the five services META-MCP-3 introduces or wires onto AppState: +for the services META-MCP-3 introduces or wires onto AppState: - ``workflow_service`` -- workflow definition CRUD facade. - ``workflow_execution_service`` -- workflow execution lifecycle. - ``workflow_version_service`` -- workflow definition version reads. - ``subworkflow_service`` -- subworkflow control plane. - ``self_improvement_service`` -- meta-loop trigger and config readout. +- ``chief_of_staff_chat`` -- LLM-backed chat for proposal/alert/free-form + explanations served by the ``POST /meta/chat`` endpoint. This is the META-MCP-3 mixin; the parallel META-MCP-4 mixin lives in ``state_services_facades_mcp4.py``. Both follow the same conventions @@ -28,6 +30,7 @@ from synthorg.engine.workflow.version_service import ( WorkflowVersionService, # noqa: TC001 ) +from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat # noqa: TC001 from synthorg.meta.service import SelfImprovementService # noqa: TC001 from synthorg.observability import get_logger from synthorg.observability.events.api import API_STATE_SERVICE_ATTACHED @@ -66,6 +69,7 @@ def _attach_service( _workflow_version_service: WorkflowVersionService | None _subworkflow_service: SubworkflowService | None _self_improvement_service: SelfImprovementService | None + _chief_of_staff_chat: ChiefOfStaffChat | None # ── WorkflowService ────────────────────────────────────────── @@ -191,5 +195,28 @@ def set_self_improvement_service( name="self_improvement_service", ) + # ── ChiefOfStaffChat ───────────────────────────────────────── + + @property + def has_chief_of_staff_chat(self) -> bool: + """Whether the Chief of Staff chat backend has been attached.""" + return self._chief_of_staff_chat is not None + + @property + def chief_of_staff_chat(self) -> ChiefOfStaffChat: + """Return the attached :class:`ChiefOfStaffChat`.""" + return self._require_service( + self._chief_of_staff_chat, + "chief_of_staff_chat", + ) + + def set_chief_of_staff_chat(self, service: ChiefOfStaffChat) -> None: + """Attach the Chief of Staff chat backend (one-shot).""" + self._attach_service( + slot="_chief_of_staff_chat", + service=service, + name="chief_of_staff_chat", + ) + __all__ = ["_MetaMcp3FacadesMixin"] diff --git a/tests/unit/api/controllers/test_meta_chat.py b/tests/unit/api/controllers/test_meta_chat.py new file mode 100644 index 0000000000..4031ac8f49 --- /dev/null +++ b/tests/unit/api/controllers/test_meta_chat.py @@ -0,0 +1,135 @@ +"""Tests for the ``POST /meta/chat`` endpoint.""" + +from typing import Any +from unittest.mock import AsyncMock + +import pytest +from litestar.testing import TestClient + +from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat +from synthorg.meta.chief_of_staff.models import ChatResponse +from synthorg.meta.models import ( + OrgBudgetSummary, + OrgCoordinationSummary, + OrgErrorSummary, + OrgEvolutionSummary, + OrgPerformanceSummary, + OrgScalingSummary, + OrgSignalSnapshot, + OrgTelemetrySummary, +) +from synthorg.meta.signals.service import SignalsService +from tests.unit.api.conftest import make_auth_headers + +_BASE = "/api/v1/meta/chat" +_HEADERS = make_auth_headers("ceo") + + +def _empty_snapshot() -> OrgSignalSnapshot: + return OrgSignalSnapshot( + performance=OrgPerformanceSummary( + avg_quality_score=7.0, + avg_success_rate=0.8, + avg_collaboration_score=6.5, + agent_count=5, + ), + budget=OrgBudgetSummary( + total_spend=10.0, + productive_ratio=0.6, + coordination_ratio=0.3, + system_ratio=0.1, + forecast_confidence=0.8, + orchestration_overhead=0.2, + ), + coordination=OrgCoordinationSummary(), + scaling=OrgScalingSummary(), + errors=OrgErrorSummary(), + evolution=OrgEvolutionSummary(), + telemetry=OrgTelemetrySummary(), + ) + + +@pytest.mark.unit +class TestMetaChat: + """Endpoint dispatches to ChiefOfStaffChat when wired, 503 otherwise.""" + + def test_returns_503_when_chat_not_wired( + self, + test_client: TestClient[Any], + ) -> None: + """No chief_of_staff_chat wired => explicit ServiceUnavailableError.""" + app_state = test_client.app.state.app_state + original = app_state._chief_of_staff_chat + app_state._chief_of_staff_chat = None + try: + resp = test_client.post( + _BASE, + headers=_HEADERS, + json={"question": "How are we doing?"}, + ) + assert resp.status_code == 503 + body = resp.json() + assert body["success"] is False + assert body["error"] == "Service unavailable" + finally: + app_state._chief_of_staff_chat = original + + async def test_returns_chat_payload_when_wired( + self, + test_client: TestClient[Any], + ) -> None: + """Wired chat backend returns the answer + sources + confidence.""" + chat_mock = AsyncMock(spec=ChiefOfStaffChat) + chat_mock.ask.return_value = ChatResponse( + answer="Quality is up 5%.", + sources=("performance",), + confidence=0.8, + ) + signals_mock = AsyncMock(spec=SignalsService) + signals_mock.get_org_snapshot.return_value = _empty_snapshot() + + app_state = test_client.app.state.app_state + chat_original = app_state._chief_of_staff_chat + signals_original = app_state._signals_service + app_state._chief_of_staff_chat = chat_mock + app_state._signals_service = signals_mock + try: + resp = test_client.post( + _BASE, + headers=_HEADERS, + json={"question": "How is quality trending?"}, + ) + assert resp.status_code == 201 + body = resp.json() + assert body["success"] is True + assert body["data"]["answer"] == "Quality is up 5%." + assert body["data"]["sources"] == ["performance"] + assert body["data"]["confidence"] == pytest.approx(0.8) + finally: + app_state._chief_of_staff_chat = chat_original + app_state._signals_service = signals_original + + def test_returns_503_when_signals_service_missing( + self, + test_client: TestClient[Any], + ) -> None: + """A wired chat backend still 503s if SignalsService is unavailable.""" + chat_mock = AsyncMock(spec=ChiefOfStaffChat) + app_state = test_client.app.state.app_state + chat_original = app_state._chief_of_staff_chat + signals_original = app_state._signals_service + app_state._chief_of_staff_chat = chat_mock + app_state._signals_service = None + try: + resp = test_client.post( + _BASE, + headers=_HEADERS, + json={"question": "How are we doing?"}, + ) + assert resp.status_code == 503 + body = resp.json() + assert body["success"] is False + assert body["error"] == "Service unavailable" + finally: + app_state._chief_of_staff_chat = chat_original + app_state._signals_service = signals_original From 6d8925e60d29a8f9486df796444ddfa6e68c0a0f Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 22:15:50 +0200 Subject: [PATCH 06/15] feat(strategy): wire MemoryContextProvider to query memory backend Make StrategicContextProvider.provide async across the protocol and the three implementations (Config / Memory / Composite). MemoryContextProvider now accepts an optional MemoryBackend and queries it at provide-time for SEMANTIC entries on the synthetic system:strategy agent tagged strategic-context, parsing the entry content as a JSON object of overridable fields (maturity_stage / industry / competitive_position). Layered overrides are applied on top of the fallback context. Missing backend, retrieval errors, malformed JSON, and non-string overrides all degrade gracefully to the fallback. build_context() takes an optional memory_backend kwarg and is now async. The function had no production callers; only tests use it, and they have been updated. Add STRATEGY_CONTEXT_MEMORY_QUERIED observability event. --- src/synthorg/engine/strategy/context.py | 151 ++++++++++++-- src/synthorg/observability/events/strategy.py | 1 + tests/unit/engine/strategy/test_context.py | 196 ++++++++++++++++-- 3 files changed, 304 insertions(+), 44 deletions(-) diff --git a/src/synthorg/engine/strategy/context.py b/src/synthorg/engine/strategy/context.py index 01bc687089..5c43b9494b 100644 --- a/src/synthorg/engine/strategy/context.py +++ b/src/synthorg/engine/strategy/context.py @@ -5,23 +5,42 @@ how lenses and principles are applied to agent recommendations. """ +import builtins +import json from typing import Protocol, runtime_checkable +from synthorg.core.enums import MemoryCategory +from synthorg.core.types import NotBlankStr from synthorg.engine.strategy.models import StrategicContext, StrategyConfig +from synthorg.memory.models import MemoryQuery +from synthorg.memory.protocol import MemoryBackend # noqa: TC001 from synthorg.observability import get_logger, safe_error_description from synthorg.observability.events.strategy import ( STRATEGY_CONTEXT_BUILT, + STRATEGY_CONTEXT_MEMORY_QUERIED, STRATEGY_CONTEXT_PROVIDER_FAILED, ) logger = get_logger(__name__) +_STRATEGIC_CONTEXT_AGENT_ID: NotBlankStr = NotBlankStr("system:strategy") +"""Synthetic agent id used for org-level strategic-context entries.""" + +_STRATEGIC_CONTEXT_TAG: NotBlankStr = NotBlankStr("strategic-context") +"""Tag the memory backend filters on for strategic-context entries.""" + +_OVERRIDABLE_FIELDS: tuple[str, ...] = ( + "maturity_stage", + "industry", + "competitive_position", +) + @runtime_checkable class StrategicContextProvider(Protocol): """Protocol for providing strategic context.""" - def provide(self, *, config: StrategyConfig) -> StrategicContext: + async def provide(self, *, config: StrategyConfig) -> StrategicContext: """Build strategic context from the given configuration. Args: @@ -40,7 +59,7 @@ class ConfigContextProvider: competitive position from :class:`StrategyConfig.context`. """ - def provide(self, *, config: StrategyConfig) -> StrategicContext: + async def provide(self, *, config: StrategyConfig) -> StrategicContext: """Build context from config fields.""" ctx = StrategicContext( maturity_stage=config.context.maturity_stage, @@ -58,26 +77,101 @@ def provide(self, *, config: StrategyConfig) -> StrategicContext: class MemoryContextProvider: - """Reads strategic context from the memory system. + """Reads strategic context overrides from the memory backend. - Placeholder for memory-driven context integration. Falls back to - config-based context when memory data is unavailable. + Queries org-level memory for the most recent ``strategic-context`` + entry tagged on the synthetic ``system:strategy`` agent, parses the + entry content as a JSON object of overridable fields + (``maturity_stage`` / ``industry`` / ``competitive_position``), and + layers the overrides on top of the fallback provider's context. + + Falls back to the fallback provider when: + + - no memory backend was injected (degraded boot); + - the backend retrieval call raises; + - no strategic-context entries are stored; + - the entry content is not parseable JSON. """ - def __init__(self, *, fallback: StrategicContextProvider) -> None: - """Initialize with a fallback context provider.""" + def __init__( + self, + *, + fallback: StrategicContextProvider, + memory_backend: MemoryBackend | None = None, + ) -> None: + """Initialize with a fallback provider and an optional backend.""" self._fallback = fallback + self._memory_backend = memory_backend - def provide(self, *, config: StrategyConfig) -> StrategicContext: - """Build context from memory, falling back to config.""" - # The memory-driven path (when wired) will query agent / org - # memory for dynamic context. For now, always delegate to - # the fallback provider. - logger.debug( - STRATEGY_CONTEXT_BUILT, - source="memory_fallback", + async def provide(self, *, config: StrategyConfig) -> StrategicContext: # noqa: PLR0911 + """Layer memory-stored overrides on top of the fallback context.""" + if self._memory_backend is None: + logger.debug(STRATEGY_CONTEXT_BUILT, source="memory_no_backend") + return await self._fallback.provide(config=config) + + try: + entries = await self._memory_backend.retrieve( + _STRATEGIC_CONTEXT_AGENT_ID, + MemoryQuery( + categories=frozenset({MemoryCategory.SEMANTIC}), + tags=(_STRATEGIC_CONTEXT_TAG,), + limit=1, + ), + ) + except builtins.MemoryError, RecursionError: + raise + except Exception as exc: + logger.warning( + STRATEGY_CONTEXT_PROVIDER_FAILED, + provider_name="MemoryContextProvider", + stage="retrieve", + error_type=type(exc).__name__, + error=safe_error_description(exc), + ) + return await self._fallback.provide(config=config) + + if not entries: + logger.debug( + STRATEGY_CONTEXT_MEMORY_QUERIED, + outcome="no_entries", + ) + return await self._fallback.provide(config=config) + + try: + payload = json.loads(entries[0].content) + except json.JSONDecodeError as exc: + logger.warning( + STRATEGY_CONTEXT_PROVIDER_FAILED, + provider_name="MemoryContextProvider", + stage="parse", + error_type=type(exc).__name__, + error=safe_error_description(exc), + ) + return await self._fallback.provide(config=config) + + if not isinstance(payload, dict): + logger.warning( + STRATEGY_CONTEXT_PROVIDER_FAILED, + provider_name="MemoryContextProvider", + stage="parse", + reason="content_not_object", + ) + return await self._fallback.provide(config=config) + + fallback_ctx = await self._fallback.provide(config=config) + overrides: dict[str, str] = {} + for field_name in _OVERRIDABLE_FIELDS: + value = payload.get(field_name) + if isinstance(value, str) and value.strip(): + overrides[field_name] = value + if not overrides: + return fallback_ctx + logger.info( + STRATEGY_CONTEXT_MEMORY_QUERIED, + outcome="overrides_applied", + overridden_fields=sorted(overrides), ) - return self._fallback.provide(config=config) + return fallback_ctx.model_copy(update=overrides) class CompositeContextProvider: @@ -97,14 +191,14 @@ def __init__( raise ValueError(msg) self._providers = providers - def provide(self, *, config: StrategyConfig) -> StrategicContext: + async def provide(self, *, config: StrategyConfig) -> StrategicContext: """Try each provider in order, return first success.""" last_exc: Exception | None = None for i, provider in enumerate(self._providers): provider_name = type(provider).__name__ try: - return provider.provide(config=config) - except MemoryError, RecursionError: + return await provider.provide(config=config) + except builtins.MemoryError, RecursionError: raise except Exception as exc: logger.warning( @@ -121,7 +215,11 @@ def provide(self, *, config: StrategyConfig) -> StrategicContext: raise RuntimeError(msg) from last_exc -def build_context(config: StrategyConfig) -> StrategicContext: +async def build_context( + config: StrategyConfig, + *, + memory_backend: MemoryBackend | None = None, +) -> StrategicContext: """Convenience factory for building strategic context. Selects the appropriate provider based on ``config.context.source`` @@ -129,6 +227,9 @@ def build_context(config: StrategyConfig) -> StrategicContext: Args: config: Strategy configuration. + memory_backend: Optional :class:`MemoryBackend` for memory-driven + overrides. When ``None``, ``ContextSource.MEMORY`` and + ``ContextSource.COMPOSITE`` degrade to pure config reads. Returns: Immutable strategic context snapshot. @@ -140,12 +241,18 @@ def build_context(config: StrategyConfig) -> StrategicContext: if config.context.source == ContextSource.MEMORY: provider: StrategicContextProvider = MemoryContextProvider( fallback=config_provider, + memory_backend=memory_backend, ) elif config.context.source == ContextSource.COMPOSITE: provider = CompositeContextProvider( - providers=(MemoryContextProvider(fallback=config_provider),), + providers=( + MemoryContextProvider( + fallback=config_provider, + memory_backend=memory_backend, + ), + ), ) else: provider = config_provider - return provider.provide(config=config) + return await provider.provide(config=config) diff --git a/src/synthorg/observability/events/strategy.py b/src/synthorg/observability/events/strategy.py index 025cdd46b7..0d712fe44b 100644 --- a/src/synthorg/observability/events/strategy.py +++ b/src/synthorg/observability/events/strategy.py @@ -16,6 +16,7 @@ STRATEGY_CONFIG_VALIDATED: Final[str] = "strategy.config.validated" STRATEGY_PRINCIPLES_LOAD_FAILED: Final[str] = "strategy.principles.load_failed" STRATEGY_CONTEXT_PROVIDER_FAILED: Final[str] = "strategy.context.provider_failed" +STRATEGY_CONTEXT_MEMORY_QUERIED: Final[str] = "strategy.context.memory_queried" STRATEGY_LENS_DEFINITION_INCOMPLETE: Final[str] = "strategy.lens.definition_incomplete" STRATEGY_LENS_LOOKUP_FAILED: Final[str] = "strategy.lens.lookup_failed" STRATEGY_LENS_ASSIGNMENT_FAILED: Final[str] = "strategy.lens.assignment_failed" diff --git a/tests/unit/engine/strategy/test_context.py b/tests/unit/engine/strategy/test_context.py index 5a580a3c81..363ad9f10b 100644 --- a/tests/unit/engine/strategy/test_context.py +++ b/tests/unit/engine/strategy/test_context.py @@ -1,7 +1,13 @@ """Unit tests for strategic context providers.""" +import json +from datetime import UTC, datetime +from unittest.mock import AsyncMock + import pytest +from synthorg.core.enums import MemoryCategory +from synthorg.core.types import NotBlankStr from synthorg.engine.strategy.context import ( CompositeContextProvider, ConfigContextProvider, @@ -14,21 +20,37 @@ StrategicContextConfig, StrategyConfig, ) +from synthorg.memory.models import MemoryEntry, MemoryMetadata +from synthorg.memory.protocol import MemoryBackend + + +def _entry(content: str) -> MemoryEntry: + """Build a MemoryEntry whose content is the JSON override payload.""" + return MemoryEntry( + id=NotBlankStr("entry-1"), + agent_id=NotBlankStr("system:strategy"), + category=MemoryCategory.SEMANTIC, + content=NotBlankStr(content), + metadata=MemoryMetadata(tags=(NotBlankStr("strategic-context"),)), + created_at=datetime(2026, 5, 9, 12, 0, 0, tzinfo=UTC), + ) class TestConfigContextProvider: """Tests for ConfigContextProvider.""" @pytest.mark.unit - def test_reads_from_config(self, default_strategy_config: StrategyConfig) -> None: + async def test_reads_from_config( + self, default_strategy_config: StrategyConfig + ) -> None: provider = ConfigContextProvider() - ctx = provider.provide(config=default_strategy_config) + ctx = await provider.provide(config=default_strategy_config) assert ctx.maturity_stage == "growth" assert ctx.industry == "technology" assert ctx.competitive_position == "challenger" @pytest.mark.unit - def test_custom_config(self) -> None: + async def test_custom_config(self) -> None: config = StrategyConfig( context=StrategicContextConfig( maturity_stage="seed", @@ -37,38 +59,156 @@ def test_custom_config(self) -> None: ), ) provider = ConfigContextProvider() - ctx = provider.provide(config=config) + ctx = await provider.provide(config=config) assert ctx.maturity_stage == "seed" assert ctx.industry == "fintech" assert ctx.competitive_position == "niche" class TestMemoryContextProvider: - """Tests for MemoryContextProvider (placeholder).""" + """Tests for MemoryContextProvider memory-driven overrides.""" @pytest.mark.unit - def test_falls_back_to_config( + async def test_falls_back_when_no_memory_backend( self, default_strategy_config: StrategyConfig, ) -> None: fallback = ConfigContextProvider() provider = MemoryContextProvider(fallback=fallback) - ctx = provider.provide(config=default_strategy_config) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "growth" + + @pytest.mark.unit + async def test_falls_back_when_backend_returns_no_entries( + self, + default_strategy_config: StrategyConfig, + ) -> None: + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = () + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) assert ctx.maturity_stage == "growth" + backend.retrieve.assert_called_once() + + @pytest.mark.unit + async def test_falls_back_when_backend_raises( + self, + default_strategy_config: StrategyConfig, + ) -> None: + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.side_effect = RuntimeError("backend unavailable") + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "growth" + + @pytest.mark.unit + async def test_falls_back_when_content_is_not_json( + self, + default_strategy_config: StrategyConfig, + ) -> None: + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = (_entry("not-json-at-all"),) + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "growth" + + @pytest.mark.unit + async def test_falls_back_when_content_is_not_object( + self, + default_strategy_config: StrategyConfig, + ) -> None: + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = (_entry(json.dumps(["scaleup"])),) + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "growth" + + @pytest.mark.unit + async def test_applies_full_overrides( + self, + default_strategy_config: StrategyConfig, + ) -> None: + payload = { + "maturity_stage": "scaleup", + "industry": "fintech", + "competitive_position": "leader", + } + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = (_entry(json.dumps(payload)),) + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "scaleup" + assert ctx.industry == "fintech" + assert ctx.competitive_position == "leader" + + @pytest.mark.unit + async def test_applies_partial_overrides( + self, + default_strategy_config: StrategyConfig, + ) -> None: + # Only ``maturity_stage`` is overridden; the other fields keep + # the fallback config values. + payload = {"maturity_stage": "scaleup"} + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = (_entry(json.dumps(payload)),) + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "scaleup" + assert ctx.industry == "technology" + assert ctx.competitive_position == "challenger" + + @pytest.mark.unit + async def test_ignores_blank_or_non_string_overrides( + self, + default_strategy_config: StrategyConfig, + ) -> None: + payload = { + "maturity_stage": " ", + "industry": 42, + "competitive_position": "leader", + } + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = (_entry(json.dumps(payload)),) + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "growth" + assert ctx.industry == "technology" + assert ctx.competitive_position == "leader" class TestCompositeContextProvider: """Tests for CompositeContextProvider.""" @pytest.mark.unit - def test_returns_first_success( + async def test_returns_first_success( self, default_strategy_config: StrategyConfig, ) -> None: provider = CompositeContextProvider( providers=(ConfigContextProvider(),), ) - ctx = provider.provide(config=default_strategy_config) + ctx = await provider.provide(config=default_strategy_config) assert ctx.maturity_stage == "growth" @pytest.mark.unit @@ -77,32 +217,32 @@ def test_empty_providers_raises(self) -> None: CompositeContextProvider(providers=()) @pytest.mark.unit - def test_falls_back_on_first_provider_failure( + async def test_falls_back_on_first_provider_failure( self, default_strategy_config: StrategyConfig, ) -> None: """When first provider fails, second provider is used.""" class FailingProvider: - def provide(self, *, config: StrategyConfig) -> StrategicContext: + async def provide(self, *, config: StrategyConfig) -> StrategicContext: msg = "provider failed" raise RuntimeError(msg) provider = CompositeContextProvider( providers=(FailingProvider(), ConfigContextProvider()), ) - ctx = provider.provide(config=default_strategy_config) + ctx = await provider.provide(config=default_strategy_config) assert ctx.maturity_stage == "growth" @pytest.mark.unit - def test_all_providers_fail_raises_runtime_error( + async def test_all_providers_fail_raises_runtime_error( self, default_strategy_config: StrategyConfig, ) -> None: """When all providers fail, RuntimeError is raised.""" class FailingProvider: - def provide(self, *, config: StrategyConfig) -> StrategicContext: + async def provide(self, *, config: StrategyConfig) -> StrategicContext: msg = "provider failed" raise RuntimeError(msg) @@ -110,33 +250,45 @@ def provide(self, *, config: StrategyConfig) -> StrategicContext: providers=(FailingProvider(), FailingProvider()), ) with pytest.raises(RuntimeError, match="All context providers failed"): - provider.provide(config=default_strategy_config) + await provider.provide(config=default_strategy_config) class TestBuildContext: """Tests for the build_context convenience factory.""" @pytest.mark.unit - def test_config_source(self) -> None: + async def test_config_source(self) -> None: config = StrategyConfig( context=StrategicContextConfig(source=ContextSource.CONFIG), ) - ctx = build_context(config) + ctx = await build_context(config) assert ctx.maturity_stage == "growth" @pytest.mark.unit - def test_memory_source(self) -> None: + async def test_memory_source_without_backend_falls_back(self) -> None: config = StrategyConfig( context=StrategicContextConfig(source=ContextSource.MEMORY), ) - ctx = build_context(config) - # Falls back to config since memory is a placeholder. + ctx = await build_context(config) + # Without a memory backend the provider degrades to config. assert ctx.maturity_stage == "growth" @pytest.mark.unit - def test_composite_source(self) -> None: + async def test_memory_source_with_backend_applies_overrides(self) -> None: + config = StrategyConfig( + context=StrategicContextConfig(source=ContextSource.MEMORY), + ) + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = ( + _entry(json.dumps({"maturity_stage": "scaleup"})), + ) + ctx = await build_context(config, memory_backend=backend) + assert ctx.maturity_stage == "scaleup" + + @pytest.mark.unit + async def test_composite_source(self) -> None: config = StrategyConfig( context=StrategicContextConfig(source=ContextSource.COMPOSITE), ) - ctx = build_context(config) + ctx = await build_context(config) assert ctx.maturity_stage == "growth" From f5d31da9529b8d3986f390851fa59f92965c1f7d Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 22:26:25 +0200 Subject: [PATCH 07/15] test(evals): add prompt + behavioural eval suites for four surfaces Reuse the existing tests/evals/prompt/_harness.py primitives to add contract, fingerprint, and call-site coverage for surfaces the audit flagged as unevalued: - supervisor_router_prompt: temperature=0.0 pinned config check, pinned fingerprints for the routing and retry system prompts, and a call- site assertion that both provider.complete() invocations pass the pinned _ROUTING_COMPLETION_CONFIG. - health_judge_behaviour: heuristic HealthJudge has no LLM prompt, so the eval is a labelled set of (termination_reason, has_recovery, signals) inputs grading the resulting EscalationCause at 100% accuracy, plus severity tier checks at threshold and 2x threshold. - code_modification_prompt: temperature stays config-driven, the _SYSTEM_PROMPT fingerprint is pinned, and the prompt carries the untrusted-content directive. - procedural_memory_prompt: same shape as code_modification, plus a call-site check that propose() passes the pre-built self._completion_config rather than a fresh inline construction. LLM grader coverage already lives in tests/evals/prompt/test_rubric_grader_prompt.py. --- .../prompt/test_code_modification_prompt.py | 81 +++++++++ .../prompt/test_health_judge_behaviour.py | 171 ++++++++++++++++++ .../prompt/test_procedural_memory_prompt.py | 99 ++++++++++ .../prompt/test_supervisor_router_prompt.py | 83 +++++++++ 4 files changed, 434 insertions(+) create mode 100644 tests/evals/prompt/test_code_modification_prompt.py create mode 100644 tests/evals/prompt/test_health_judge_behaviour.py create mode 100644 tests/evals/prompt/test_procedural_memory_prompt.py create mode 100644 tests/evals/prompt/test_supervisor_router_prompt.py diff --git a/tests/evals/prompt/test_code_modification_prompt.py b/tests/evals/prompt/test_code_modification_prompt.py new file mode 100644 index 0000000000..81155e6f0a --- /dev/null +++ b/tests/evals/prompt/test_code_modification_prompt.py @@ -0,0 +1,81 @@ +"""Prompt eval: code modification strategy temperature + prompt drift. + +Verifies the deterministic-property contract for the meta self- +improvement code-modification strategy: + +1. The CompletionConfig sent to the LLM picks temperature + max_tokens + from ``CodeModificationConfig`` so they remain runtime-tunable + (ops can tighten temperature in production), not silently hardcoded. +2. ``_SYSTEM_PROMPT`` bytes haven't drifted: silent prompt edits must + either bump the pinned fingerprint here OR add a labelled regression + example. + +Reference: ``synthorg.meta.strategies.code_modification``. +""" + +import inspect + +import pytest + +from tests.evals.prompt._harness import fingerprint_prompt + +# Pinned SHA-256[:16] of ``_SYSTEM_PROMPT``. Bump when the prompt +# changes intentionally, paired with new behavioural coverage. +_PINNED_CODE_MOD_PROMPT_FP = "6ae360befb461ca9" + + +@pytest.mark.unit +class TestCodeModificationPromptContract: + """Guard rails for the code modification strategy prompt surface.""" + + def test_temperature_is_config_driven(self) -> None: + """The CompletionConfig must read ``temperature`` from config. + + We deliberately avoid pinning the value to ``0.0`` here because + operations may tune the temperature for code generation in + production. The contract is "config-driven, not literal". + """ + from synthorg.meta.strategies.code_modification import ( + CodeModificationStrategy, + ) + + source = inspect.getsource(CodeModificationStrategy) + assert "temperature=self._code_config.temperature" in source, ( + "CodeModificationStrategy must build CompletionConfig from " + "self._code_config.temperature so the value stays runtime-" + "tunable rather than hardcoded" + ) + assert "max_tokens=self._code_config.max_tokens" in source, ( + "CodeModificationStrategy must read max_tokens from " + "self._code_config so token budgets stay tunable" + ) + + def test_system_prompt_fingerprint_is_pinned(self) -> None: + """Detect silent drift in the code-modification ``_SYSTEM_PROMPT``.""" + from synthorg.meta.strategies import code_modification + + fp = fingerprint_prompt(code_modification._SYSTEM_PROMPT) + assert fp == _PINNED_CODE_MOD_PROMPT_FP, ( + f"code_modification._SYSTEM_PROMPT drifted: got {fp!r}, " + f"expected {_PINNED_CODE_MOD_PROMPT_FP!r}. Update the pin " + "and add coverage for the new behaviour." + ) + + def test_system_prompt_includes_anti_injection_directive(self) -> None: + """The prompt must include the untrusted-content directive. + + SynthOrg's prompt-safety contract requires every system prompt + that consumes operator-controlled or task-derived text to + include an explicit untrusted-content directive. The + code-modification strategy formats triggering rules + signal + contexts into the user message, so the directive is mandatory. + """ + from synthorg.meta.strategies import code_modification + + # The exact directive bytes are produced by + # ``untrusted_content_directive`` and may evolve, so we look + # for the stable opening phrase rather than the full body. + assert "untrusted" in code_modification._SYSTEM_PROMPT.lower(), ( + "_SYSTEM_PROMPT must include the untrusted-content " + "directive (no 'untrusted' substring detected)" + ) diff --git a/tests/evals/prompt/test_health_judge_behaviour.py b/tests/evals/prompt/test_health_judge_behaviour.py new file mode 100644 index 0000000000..1ecefedb46 --- /dev/null +++ b/tests/evals/prompt/test_health_judge_behaviour.py @@ -0,0 +1,171 @@ +"""Behavioural eval: HealthJudge ticket emission. + +HealthJudge is heuristic, not LLM-backed, so a "prompt eval" doesn't +apply. The corresponding deterministic-property eval is a labelled +behavioural set: ``(termination_reason / quality_signals / has_recovery) +-> expected EscalationCause | None``. Any heuristic regression that +flips a labelled outcome fails the suite. + +Reference implementation: ``synthorg.engine.health.judge.HealthJudge``. +""" + +import pytest + +from synthorg.engine.health.judge import HealthJudge +from synthorg.engine.health.models import EscalationCause +from synthorg.engine.loop_protocol import TerminationReason +from synthorg.engine.quality.models import StepQuality, StepQualitySignal +from tests.evals.prompt._harness import ( + LabelledExample, + assert_accuracy_at_least, + run_grader, +) + + +def _signal(quality: StepQuality, *, step_index: int = 0) -> StepQualitySignal: + return StepQualitySignal( + quality=quality, + confidence=0.9, + reason=f"eval-{quality.value}", + step_index=step_index, + turn_range=(1, 1), + ) + + +_INCORRECT = _signal(StepQuality.INCORRECT) +_CORRECT = _signal(StepQuality.CORRECT) + + +@pytest.mark.unit +class TestHealthJudgeBehaviour: + """Labelled-input behavioural eval for HealthJudge.emit_ticket.""" + + EXAMPLES: tuple[LabelledExample, ...] = ( + LabelledExample( + name="stagnation_emits_stagnation", + inp={ + "termination_reason": TerminationReason.STAGNATION, + "has_recovery": False, + "quality_signals": (), + }, + expected=EscalationCause.STAGNATION, + ), + LabelledExample( + name="error_with_recovery_emits_repeated_failure", + inp={ + "termination_reason": TerminationReason.ERROR, + "has_recovery": True, + "quality_signals": (), + }, + expected=EscalationCause.REPEATED_FAILURE, + ), + LabelledExample( + name="error_without_recovery_emits_nothing", + inp={ + "termination_reason": TerminationReason.ERROR, + "has_recovery": False, + "quality_signals": (), + }, + expected=None, + ), + LabelledExample( + name="three_trailing_incorrect_emits_quality_degradation", + inp={ + "termination_reason": TerminationReason.COMPLETED, + "has_recovery": False, + "quality_signals": (_INCORRECT, _INCORRECT, _INCORRECT), + }, + expected=EscalationCause.QUALITY_DEGRADATION, + ), + LabelledExample( + name="two_trailing_incorrect_below_threshold", + inp={ + "termination_reason": TerminationReason.COMPLETED, + "has_recovery": False, + "quality_signals": (_INCORRECT, _INCORRECT), + }, + expected=None, + ), + LabelledExample( + name="trailing_correct_breaks_streak", + inp={ + "termination_reason": TerminationReason.COMPLETED, + "has_recovery": False, + "quality_signals": (_INCORRECT, _INCORRECT, _INCORRECT, _CORRECT), + }, + expected=None, + ), + LabelledExample( + name="completed_clean_signals_emits_nothing", + inp={ + "termination_reason": TerminationReason.COMPLETED, + "has_recovery": False, + "quality_signals": (_CORRECT, _CORRECT), + }, + expected=None, + ), + LabelledExample( + name="six_trailing_incorrect_escalates_to_critical", + inp={ + "termination_reason": TerminationReason.COMPLETED, + "has_recovery": False, + "quality_signals": (_INCORRECT,) * 6, + }, + # The cause is still QUALITY_DEGRADATION; the ``critical`` + # severity is checked separately in + # ``test_critical_severity_at_double_threshold`` below so + # this labelled set stays uniform. + expected=EscalationCause.QUALITY_DEGRADATION, + ), + ) + + def test_emit_ticket_matches_labelled_examples(self) -> None: + """Every labelled (input, expected_cause) pair grades correctly.""" + judge = HealthJudge() + + def _grade(actual_input: object, expected: object) -> bool: + assert isinstance(actual_input, dict) + ticket = judge.emit_ticket( + termination_reason=actual_input["termination_reason"], + has_recovery=actual_input["has_recovery"], + quality_signals=actual_input["quality_signals"], + agent_id="agent-eval", + task_id="task-eval", + ) + actual_cause = ticket.cause if ticket is not None else None + return actual_cause == expected + + outcome = run_grader(self.EXAMPLES, _grade) + # 100% accuracy: every labelled outcome must match. The whole + # suite exists to catch heuristic regressions, not soft drift. + assert_accuracy_at_least(outcome, 1.0) + + def test_critical_severity_at_double_threshold(self) -> None: + """At >= 2x threshold trailing INCORRECT, severity is CRITICAL.""" + from synthorg.engine.health.models import EscalationSeverity + + judge = HealthJudge(quality_degradation_threshold=3) + ticket = judge.emit_ticket( + termination_reason=TerminationReason.COMPLETED, + has_recovery=False, + quality_signals=(_INCORRECT,) * 6, + agent_id="agent-eval", + task_id="task-eval", + ) + assert ticket is not None + assert ticket.severity == EscalationSeverity.CRITICAL + + def test_high_severity_below_double_threshold(self) -> None: + """Just at threshold but below 2x => severity is HIGH.""" + from synthorg.engine.health.models import EscalationSeverity + + judge = HealthJudge(quality_degradation_threshold=3) + ticket = judge.emit_ticket( + termination_reason=TerminationReason.COMPLETED, + has_recovery=False, + quality_signals=(_INCORRECT,) * 3, + agent_id="agent-eval", + task_id="task-eval", + ) + assert ticket is not None + assert ticket.severity == EscalationSeverity.HIGH diff --git a/tests/evals/prompt/test_procedural_memory_prompt.py b/tests/evals/prompt/test_procedural_memory_prompt.py new file mode 100644 index 0000000000..aa5bb6559c --- /dev/null +++ b/tests/evals/prompt/test_procedural_memory_prompt.py @@ -0,0 +1,99 @@ +"""Prompt eval: procedural-memory proposer temperature + prompt drift. + +Verifies the deterministic-property contract for the procedural-memory +proposer LLM call: + +1. The CompletionConfig built at construction time reads + ``temperature`` + ``max_tokens`` from ``ProceduralMemoryConfig`` so + the values stay runtime-tunable rather than silently hardcoded. +2. ``_SYSTEM_PROMPT`` bytes haven't drifted: silent prompt edits must + either bump the pinned fingerprint here OR ship new behavioural + coverage proving the new prompt still satisfies the contract. +3. The proposer passes its module-level ``self._completion_config`` to + ``provider.complete``; a future refactor that constructs a fresh + CompletionConfig inline fails the call-site assertion. + +Reference: ``synthorg.memory.procedural.proposer``. +""" + +import inspect + +import pytest + +from tests.evals.prompt._harness import fingerprint_prompt + +# Pinned SHA-256[:16] of ``_SYSTEM_PROMPT``. Bump when the prompt +# changes intentionally, paired with new behavioural coverage. +_PINNED_PROCEDURAL_PROMPT_FP = "c1cdefcde471733a" + + +@pytest.mark.unit +class TestProceduralMemoryPromptContract: + """Guard rails for the procedural-memory proposer prompt surface.""" + + def test_completion_config_is_built_from_config(self) -> None: + """The proposer must build CompletionConfig from runtime config. + + We deliberately don't pin the value to ``0.0``: ops can tune + the temperature for failure analysis. The contract is that + the CompletionConfig is built from ``ProceduralMemoryConfig``, + not hardcoded. + """ + from synthorg.memory.procedural.proposer import ProceduralMemoryProposer + + source = inspect.getsource(ProceduralMemoryProposer) + assert "temperature=config.temperature" in source, ( + "ProceduralMemoryProposer must build CompletionConfig from " + "config.temperature so the value stays runtime-tunable" + ) + assert "max_tokens=config.max_tokens" in source, ( + "ProceduralMemoryProposer must read max_tokens from " + "ProceduralMemoryConfig so token budgets stay tunable" + ) + + def test_system_prompt_fingerprint_is_pinned(self) -> None: + """Detect silent drift in the proposer ``_SYSTEM_PROMPT``.""" + from synthorg.memory.procedural import proposer + + fp = fingerprint_prompt(proposer._SYSTEM_PROMPT) + assert fp == _PINNED_PROCEDURAL_PROMPT_FP, ( + f"procedural memory _SYSTEM_PROMPT drifted: got {fp!r}, " + f"expected {_PINNED_PROCEDURAL_PROMPT_FP!r}. Update the " + "pin and add coverage for the new behaviour." + ) + + def test_system_prompt_includes_anti_injection_directive(self) -> None: + """The prompt must include the untrusted-content directive. + + SynthOrg's prompt-safety contract requires every system prompt + that consumes attacker-controllable text (task descriptions, + error messages, tool outputs) to include an explicit + untrusted-content directive. The proposer formats all three + into the user message. + """ + from synthorg.memory.procedural import proposer + + # Stable opening phrase rather than the full directive bytes + # (those are produced by ``untrusted_content_directive`` and + # may evolve as the prompt-safety contract tightens). + assert "untrusted" in proposer._SYSTEM_PROMPT.lower(), ( + "_SYSTEM_PROMPT must include the untrusted-content " + "directive (no 'untrusted' substring detected)" + ) + + def test_propose_call_site_uses_pinned_completion_config(self) -> None: + """The propose() method must pass ``self._completion_config``. + + The construction-time check above proves the field is built + correctly. This call-site check refuses any future refactor + that constructs a fresh CompletionConfig inline. + """ + from synthorg.memory.procedural.proposer import ProceduralMemoryProposer + + source = inspect.getsource(ProceduralMemoryProposer.propose) + assert "config=self._completion_config" in source, ( + "ProceduralMemoryProposer.propose must pass " + "config=self._completion_config to provider.complete; a " + "fresh inline CompletionConfig() construction would defeat " + "the runtime-tunable contract" + ) diff --git a/tests/evals/prompt/test_supervisor_router_prompt.py b/tests/evals/prompt/test_supervisor_router_prompt.py new file mode 100644 index 0000000000..52b2d63f17 --- /dev/null +++ b/tests/evals/prompt/test_supervisor_router_prompt.py @@ -0,0 +1,83 @@ +"""Prompt eval: supervisor router temperature + prompt drift. + +Asserts the deterministic properties of the hierarchical retrieval +supervisor router: + +1. ``_ROUTING_COMPLETION_CONFIG.temperature`` is pinned to ``0.0`` so + routing decisions are reproducible across CI shards. +2. The bytes of ``_ROUTING_SYSTEM_PROMPT`` and ``_RETRY_SYSTEM_PROMPT`` + haven't silently drifted: intentional prompt edits must update the + pinned fingerprints in this file. +3. The two production call sites pass the pinned config to + ``provider.complete``; a future refactor that constructs a fresh + config inline fails the call-site test. +""" + +import inspect + +import pytest + +from tests.evals.prompt._harness import fingerprint_prompt + +# Pinned SHA-256[:16] hashes of the routing + retry prompt bodies. Update +# these when the prompts change intentionally; pair the bump with new +# labelled-example coverage so the regression net stays meaningful. +_PINNED_ROUTING_PROMPT_FP = "8a89b74f71c5db10" +_PINNED_RETRY_PROMPT_FP = "f00d41e5150e98c8" + + +@pytest.mark.unit +class TestSupervisorRouterPromptContract: + """Guard rails for the supervisor router prompt surface.""" + + def test_temperature_is_zero(self) -> None: + """Routing + retry must run at temperature=0 for determinism.""" + from synthorg.memory.retrieval.hierarchical import supervisor + + runtime_config = supervisor._ROUTING_COMPLETION_CONFIG + assert runtime_config.temperature == 0.0, ( + "_ROUTING_COMPLETION_CONFIG.temperature must equal 0.0 at " + f"runtime; got {runtime_config.temperature!r}" + ) + + def test_routing_prompt_fingerprint_is_pinned(self) -> None: + """Detect silent drift in ``_ROUTING_SYSTEM_PROMPT``.""" + from synthorg.memory.retrieval.hierarchical import supervisor + + fp = fingerprint_prompt(supervisor._ROUTING_SYSTEM_PROMPT) + assert fp == _PINNED_ROUTING_PROMPT_FP, ( + f"supervisor router routing prompt drifted: got {fp!r}, " + f"expected {_PINNED_ROUTING_PROMPT_FP!r}. Update the pin " + "and add coverage for the new behaviour." + ) + + def test_retry_prompt_fingerprint_is_pinned(self) -> None: + """Detect silent drift in ``_RETRY_SYSTEM_PROMPT``.""" + from synthorg.memory.retrieval.hierarchical import supervisor + + fp = fingerprint_prompt(supervisor._RETRY_SYSTEM_PROMPT) + assert fp == _PINNED_RETRY_PROMPT_FP, ( + f"supervisor router retry prompt drifted: got {fp!r}, " + f"expected {_PINNED_RETRY_PROMPT_FP!r}. Update the pin " + "and add coverage for the new behaviour." + ) + + def test_call_sites_use_pinned_config(self) -> None: + """Both LLM call sites must pass the module-level pinned config. + + AST-level check is intentionally simple (``config=`` keyword + bound to the constant name) because the surface is small and + any cleverer refactor that constructs a fresh CompletionConfig + inline should be caught by ``test_temperature_is_zero`` + regressing or this assertion failing. + """ + from synthorg.memory.retrieval.hierarchical import supervisor + + source = inspect.getsource(supervisor) + # Two distinct call sites: ``_route_via_llm`` and + # ``_evaluate_via_llm``. Each must pass the pinned config. + occurrences = source.count("config=_ROUTING_COMPLETION_CONFIG") + assert occurrences >= 2, ( + "expected both supervisor.complete() call sites to pass " + f"config=_ROUTING_COMPLETION_CONFIG; found {occurrences}" + ) From 4589cd6cf18042b0439d37d54bce29ef907317e3 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 22:42:22 +0200 Subject: [PATCH 08/15] refactor(types): propagate JsonValue narrowing through meta strategies Followup to the JsonValue tightening: update the meta proposal-builder strategy methods (config_tuning / architecture / prompt_tuning) to take ctx as dict[str, JsonValue] instead of dict[str, object] so the value flowing in from RuleMatch.signal_context type-checks. Update affected test helpers to match. Regenerate scripts/mock_spec_baseline.txt to account for the line-number shifts introduced by the new 'from pydantic import JsonValue' import in two test files (and to drop stale entries for tests that now use spec= on their MagicMocks). --- scripts/mock_spec_baseline.txt | 68 ++----------------- src/synthorg/meta/strategies/architecture.py | 6 +- src/synthorg/meta/strategies/config_tuning.py | 14 ++-- src/synthorg/meta/strategies/prompt_tuning.py | 4 +- .../middleware/test_coordination_protocol.py | 2 +- .../meta/rollout/test_inverse_dispatch.py | 3 +- .../meta/test_code_modification_strategy.py | 3 +- tests/unit/meta/test_strategies.py | 5 +- 8 files changed, 30 insertions(+), 75 deletions(-) diff --git a/scripts/mock_spec_baseline.txt b/scripts/mock_spec_baseline.txt index 19316370b1..f06d99d38a 100644 --- a/scripts/mock_spec_baseline.txt +++ b/scripts/mock_spec_baseline.txt @@ -120,53 +120,6 @@ tests/integration/integrations/test_controllers.py:1087:18 tests/integration/integrations/test_controllers.py:1088:27 tests/integration/integrations/test_controllers.py:1105:18 tests/integration/integrations/test_controllers.py:1106:31 -tests/integration/integrations/test_oauth_flows.py:51:11 -tests/integration/integrations/test_oauth_flows.py:54:28 -tests/integration/integrations/test_oauth_flows.py:84:22 -tests/integration/integrations/test_oauth_flows.py:119:22 -tests/integration/integrations/test_oauth_flows.py:155:21 -tests/integration/integrations/test_oauth_flows.py:156:25 -tests/integration/integrations/test_oauth_flows.py:157:28 -tests/integration/integrations/test_oauth_flows.py:158:35 -tests/integration/integrations/test_oauth_flows.py:172:18 -tests/integration/integrations/test_oauth_flows.py:173:31 -tests/integration/integrations/test_oauth_flows.py:180:34 -tests/integration/integrations/test_oauth_flows.py:187:37 -tests/integration/integrations/test_oauth_flows.py:188:25 -tests/integration/integrations/test_oauth_flows.py:190:20 -tests/integration/integrations/test_oauth_flows.py:191:34 -tests/integration/integrations/test_oauth_flows.py:192:25 -tests/integration/integrations/test_oauth_flows.py:219:21 -tests/integration/integrations/test_oauth_flows.py:220:25 -tests/integration/integrations/test_oauth_flows.py:221:28 -tests/integration/integrations/test_oauth_flows.py:222:18 -tests/integration/integrations/test_oauth_flows.py:239:21 -tests/integration/integrations/test_oauth_flows.py:240:25 -tests/integration/integrations/test_oauth_flows.py:241:28 -tests/integration/integrations/test_oauth_flows.py:242:18 -tests/integration/integrations/test_oauth_flows.py:243:31 -tests/integration/integrations/test_oauth_flows.py:250:34 -tests/integration/integrations/test_oauth_flows.py:286:25 -tests/integration/integrations/test_oauth_flows.py:287:28 -tests/integration/integrations/test_oauth_flows.py:288:35 -tests/integration/integrations/test_oauth_flows.py:291:31 -tests/integration/integrations/test_oauth_flows.py:292:34 -tests/integration/integrations/test_oauth_flows.py:293:37 -tests/integration/integrations/test_oauth_flows.py:294:25 -tests/integration/integrations/test_oauth_flows.py:297:34 -tests/integration/integrations/test_oauth_flows.py:337:25 -tests/integration/integrations/test_oauth_flows.py:338:28 -tests/integration/integrations/test_oauth_flows.py:339:35 -tests/integration/integrations/test_oauth_flows.py:342:31 -tests/integration/integrations/test_oauth_flows.py:349:34 -tests/integration/integrations/test_oauth_flows.py:356:37 -tests/integration/integrations/test_oauth_flows.py:357:25 -tests/integration/integrations/test_oauth_flows.py:360:34 -tests/integration/integrations/test_oauth_flows.py:361:25 -tests/integration/integrations/test_oauth_flows.py:398:22 -tests/integration/integrations/test_oauth_flows.py:450:22 -tests/integration/integrations/test_oauth_flows.py:620:22 -tests/integration/integrations/test_oauth_flows.py:753:22 tests/integration/integrations/test_rate_limiter_shared_state.py:114:18 tests/integration/mcp/test_tool_surface.py:59:11 tests/integration/mcp/test_tool_surface.py:60:22 @@ -2287,13 +2240,13 @@ tests/unit/meta/test_code_applier.py:119:16 tests/unit/meta/test_code_applier.py:210:27 tests/unit/meta/test_code_applier.py:250:29 tests/unit/meta/test_code_applier.py:373:25 -tests/unit/meta/test_code_modification_strategy.py:102:15 -tests/unit/meta/test_code_modification_strategy.py:103:20 -tests/unit/meta/test_code_modification_strategy.py:105:24 -tests/unit/meta/test_code_modification_strategy.py:249:19 -tests/unit/meta/test_code_modification_strategy.py:250:28 -tests/unit/meta/test_code_modification_strategy.py:392:19 -tests/unit/meta/test_code_modification_strategy.py:393:28 +tests/unit/meta/test_code_modification_strategy.py:103:15 +tests/unit/meta/test_code_modification_strategy.py:104:20 +tests/unit/meta/test_code_modification_strategy.py:106:24 +tests/unit/meta/test_code_modification_strategy.py:250:19 +tests/unit/meta/test_code_modification_strategy.py:251:28 +tests/unit/meta/test_code_modification_strategy.py:393:19 +tests/unit/meta/test_code_modification_strategy.py:394:28 tests/unit/meta/test_config_loader.py:20:14 tests/unit/meta/test_config_loader.py:23:18 tests/unit/meta/test_config_loader.py:73:14 @@ -2553,13 +2506,6 @@ tests/unit/providers/test_discovery.py:40:13 tests/unit/providers/test_discovery.py:45:24 tests/unit/providers/test_discovery.py:46:23 tests/unit/providers/test_family.py:11:13 -tests/unit/providers/test_health_prober.py:96:22 -tests/unit/providers/test_health_prober.py:98:30 -tests/unit/providers/test_health_prober.py:100:28 -tests/unit/providers/test_health_prober.py:102:30 -tests/unit/providers/test_health_prober.py:103:33 -tests/unit/providers/test_health_prober.py:104:32 -tests/unit/providers/test_health_prober.py:255:24 tests/unit/providers/test_protocol.py:456:32 tests/unit/scripts/test_generate_comparison.py:191:13 tests/unit/scripts/test_generate_comparison.py:193:14 diff --git a/src/synthorg/meta/strategies/architecture.py b/src/synthorg/meta/strategies/architecture.py index 87e9bd33fb..35624e0370 100644 --- a/src/synthorg/meta/strategies/architecture.py +++ b/src/synthorg/meta/strategies/architecture.py @@ -7,6 +7,8 @@ from typing import TYPE_CHECKING from uuid import uuid4 +from pydantic import JsonValue # noqa: TC002 + from synthorg.meta.models import ( ArchitectureChange, ImprovementProposal, @@ -95,7 +97,7 @@ def _build_proposal( def _propose_team_restructure( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], ) -> ImprovementProposal: return ImprovementProposal( id=uuid4(), @@ -141,7 +143,7 @@ def _propose_team_restructure( def _propose_specialist_role( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], ) -> ImprovementProposal: return ImprovementProposal( id=uuid4(), diff --git a/src/synthorg/meta/strategies/config_tuning.py b/src/synthorg/meta/strategies/config_tuning.py index 2b259b1010..ce39745eaf 100644 --- a/src/synthorg/meta/strategies/config_tuning.py +++ b/src/synthorg/meta/strategies/config_tuning.py @@ -10,6 +10,8 @@ from typing import TYPE_CHECKING from uuid import uuid4 +from pydantic import JsonValue # noqa: TC002 + from synthorg.meta.models import ( ConfigChange, ImprovementProposal, @@ -113,7 +115,7 @@ def _build_proposal( def _propose_quality_fix( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], snapshot: OrgSignalSnapshot, ) -> ImprovementProposal: _ = snapshot @@ -162,7 +164,7 @@ def _propose_quality_fix( def _propose_success_rate_fix( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], snapshot: OrgSignalSnapshot, ) -> ImprovementProposal: _ = snapshot @@ -207,7 +209,7 @@ def _propose_success_rate_fix( def _propose_budget_fix( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], snapshot: OrgSignalSnapshot, ) -> ImprovementProposal: _ = snapshot @@ -257,7 +259,7 @@ def _propose_budget_fix( def _propose_coordination_cost_fix( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], ) -> ImprovementProposal: return ImprovementProposal( id=uuid4(), @@ -303,7 +305,7 @@ def _propose_coordination_cost_fix( def _propose_overhead_fix( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], ) -> ImprovementProposal: return ImprovementProposal( id=uuid4(), @@ -347,7 +349,7 @@ def _propose_overhead_fix( def _propose_scaling_fix( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], ) -> ImprovementProposal: return ImprovementProposal( id=uuid4(), diff --git a/src/synthorg/meta/strategies/prompt_tuning.py b/src/synthorg/meta/strategies/prompt_tuning.py index 1862ba19bd..d93f335449 100644 --- a/src/synthorg/meta/strategies/prompt_tuning.py +++ b/src/synthorg/meta/strategies/prompt_tuning.py @@ -8,6 +8,8 @@ from typing import TYPE_CHECKING from uuid import uuid4 +from pydantic import JsonValue # noqa: TC002 + from synthorg.meta.models import ( EvolutionMode, ImprovementProposal, @@ -147,7 +149,7 @@ def _propose_quality_principle( def _propose_error_awareness( self, - ctx: dict[str, object], + ctx: dict[str, JsonValue], mode: EvolutionMode, ) -> ImprovementProposal: return ImprovementProposal( diff --git a/tests/unit/engine/middleware/test_coordination_protocol.py b/tests/unit/engine/middleware/test_coordination_protocol.py index 8a8cacc996..ac45275acf 100644 --- a/tests/unit/engine/middleware/test_coordination_protocol.py +++ b/tests/unit/engine/middleware/test_coordination_protocol.py @@ -298,7 +298,7 @@ class TestCoordinationMiddlewareContext: def test_frozen(self) -> None: ctx = _mw_context() with pytest.raises(ValidationError): - ctx.decomposition_result = "bad" # type: ignore[misc] + ctx.decomposition_result = "bad" # type: ignore[misc,assignment] def test_with_metadata(self) -> None: ctx = _mw_context() diff --git a/tests/unit/meta/rollout/test_inverse_dispatch.py b/tests/unit/meta/rollout/test_inverse_dispatch.py index 67ebff5db9..e3e9b5418f 100644 --- a/tests/unit/meta/rollout/test_inverse_dispatch.py +++ b/tests/unit/meta/rollout/test_inverse_dispatch.py @@ -1,6 +1,7 @@ """Tests for rollback handler protocol and default implementations.""" import pytest +from pydantic import JsonValue from synthorg.core.types import NotBlankStr from synthorg.meta.models import RollbackOperation @@ -56,7 +57,7 @@ def _op( *, op_type: str, target: str, - previous_value: object = None, + previous_value: JsonValue = None, description: str = "test op", ) -> RollbackOperation: return RollbackOperation( diff --git a/tests/unit/meta/test_code_modification_strategy.py b/tests/unit/meta/test_code_modification_strategy.py index 5606fddfec..f1b4a652f1 100644 --- a/tests/unit/meta/test_code_modification_strategy.py +++ b/tests/unit/meta/test_code_modification_strategy.py @@ -4,6 +4,7 @@ from unittest.mock import AsyncMock import pytest +from pydantic import JsonValue from synthorg.meta.config import CodeModificationConfig, SelfImprovementConfig from synthorg.meta.models import ( @@ -63,7 +64,7 @@ def _snap() -> OrgSignalSnapshot: def _rule( name: str = "quality_declining", altitudes: tuple[ProposalAltitude, ...] = (ProposalAltitude.CODE_MODIFICATION,), - ctx: dict[str, object] | None = None, + ctx: dict[str, JsonValue] | None = None, ) -> RuleMatch: return RuleMatch( rule_name=name, diff --git a/tests/unit/meta/test_strategies.py b/tests/unit/meta/test_strategies.py index a398f0a102..679f361926 100644 --- a/tests/unit/meta/test_strategies.py +++ b/tests/unit/meta/test_strategies.py @@ -1,6 +1,7 @@ """Unit tests for meta-loop improvement strategies.""" import pytest +from pydantic import JsonValue from synthorg.meta.config import SelfImprovementConfig from synthorg.meta.models import ( @@ -62,7 +63,7 @@ def _snap() -> OrgSignalSnapshot: def _rule( name: str, altitudes: tuple[ProposalAltitude, ...], - ctx: dict[str, object] | None = None, + ctx: dict[str, JsonValue] | None = None, ) -> RuleMatch: return RuleMatch( rule_name=name, @@ -152,7 +153,7 @@ async def test_unknown_rule_returns_none(self) -> None: async def test_all_known_rules_produce_proposals(self) -> None: s = ConfigTuningStrategy(config=_DEFAULT_CONFIG) - known: list[tuple[str, dict[str, object]]] = [ + known: list[tuple[str, dict[str, JsonValue]]] = [ ("quality_declining", {"avg_quality": 4.0, "agent_count": 10}), ("success_rate_drop", {"avg_success_rate": 0.6}), ("budget_overrun", {"days_until_exhausted": 7, "total_spend": 150}), From e42f22b6f5d8945d7c89cf5bd498d0a2dcbff879 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 22:54:14 +0200 Subject: [PATCH 09/15] chore(magic-numbers): lint-allow pre-existing default-arg constants The pre-push magic-numbers gate flags pre-existing safe defaults in any file touched by this PR. Annotate the three call sites (SupervisorRouter.max_workers_per_query, SupervisorRouter.max_retry_count, PostgresApprovalRepository.list_items.limit, SqliteApprovalRepository. list_items.limit) with lint-allow comments rather than migrate them to settings/definitions/, which is out of scope for this cleanup PR. --- src/synthorg/memory/retrieval/hierarchical/supervisor.py | 4 ++-- src/synthorg/persistence/postgres/approval_repo.py | 2 +- src/synthorg/persistence/sqlite/approval_repo.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/synthorg/memory/retrieval/hierarchical/supervisor.py b/src/synthorg/memory/retrieval/hierarchical/supervisor.py index 4375fcf1ac..db0ef60602 100644 --- a/src/synthorg/memory/retrieval/hierarchical/supervisor.py +++ b/src/synthorg/memory/retrieval/hierarchical/supervisor.py @@ -104,9 +104,9 @@ def __init__( # noqa: PLR0913 *, provider: CompletionProvider, model: NotBlankStr, - max_workers_per_query: int = 2, + max_workers_per_query: int = 2, # lint-allow: magic-numbers -- bounded fan-out reflective_retry_enabled: bool = True, - max_retry_count: int = 2, + max_retry_count: int = 2, # lint-allow: magic-numbers -- bounded retry budget quality_threshold: float = _DEFAULT_QUALITY_THRESHOLD, cost_tracker: CostTracker | None = None, ) -> None: diff --git a/src/synthorg/persistence/postgres/approval_repo.py b/src/synthorg/persistence/postgres/approval_repo.py index 03614dccc3..af34238df2 100644 --- a/src/synthorg/persistence/postgres/approval_repo.py +++ b/src/synthorg/persistence/postgres/approval_repo.py @@ -392,7 +392,7 @@ async def list_items( status: ApprovalStatus | None = None, risk_level: ApprovalRiskLevel | None = None, action_type: NotBlankStr | None = None, - limit: int = 100, + limit: int = 100, # lint-allow: magic-numbers -- default page size offset: int = 0, ) -> tuple[ApprovalItem, ...]: """List approval items with optional filters (paginated).""" diff --git a/src/synthorg/persistence/sqlite/approval_repo.py b/src/synthorg/persistence/sqlite/approval_repo.py index 208d8d63cc..3d7de5cea5 100644 --- a/src/synthorg/persistence/sqlite/approval_repo.py +++ b/src/synthorg/persistence/sqlite/approval_repo.py @@ -450,7 +450,7 @@ async def list_items( status: ApprovalStatus | None = None, risk_level: ApprovalRiskLevel | None = None, action_type: NotBlankStr | None = None, - limit: int = 100, + limit: int = 100, # lint-allow: magic-numbers -- default page size offset: int = 0, ) -> tuple[ApprovalItem, ...]: """List approval items with optional filters (paginated, newest-first). From e7f4524c3bc52bed17e977f9f017822a1ce570e5 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 23:08:12 +0200 Subject: [PATCH 10/15] chore(magic-numbers): lint-allow pre-existing default page-size and bootstrap interval Same pre-push gate as the previous commit, this round flagging pre-existing defaults in api/app.py:_DEFAULT_TIMEOUT_CHECK_INTERVAL_SECONDS and the four list_* paginated controllers in api/controllers/meta.py (default page size 50). Annotate with lint-allow rather than migrate to settings/definitions/, which is out of scope for this cleanup PR. --- src/synthorg/api/app.py | 4 +++- src/synthorg/api/controllers/meta.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/synthorg/api/app.py b/src/synthorg/api/app.py index 030436a238..24c8dd19f2 100644 --- a/src/synthorg/api/app.py +++ b/src/synthorg/api/app.py @@ -142,7 +142,9 @@ # Update both sites together if the default ever changes; otherwise a # bootstrap value will silently disagree with operator-editable # overrides resolved through ``ConfigResolver``. -_DEFAULT_TIMEOUT_CHECK_INTERVAL_SECONDS = 60.0 +_DEFAULT_TIMEOUT_CHECK_INTERVAL_SECONDS = ( + 60.0 # lint-allow: magic-numbers -- bootstrap default mirrored by ConfigResolver +) def _build_default_approval_timeout_scheduler( diff --git a/src/synthorg/api/controllers/meta.py b/src/synthorg/api/controllers/meta.py index 16fcb98be5..34140255ca 100644 --- a/src/synthorg/api/controllers/meta.py +++ b/src/synthorg/api/controllers/meta.py @@ -86,7 +86,7 @@ async def list_rules( self, state: State, cursor: CursorParam = None, - limit: CursorLimit = 50, + limit: CursorLimit = 50, # lint-allow: magic-numbers -- default page size ) -> PaginatedResponse[dict[str, Any]]: """List all signal rules (built-in + custom) with status. @@ -139,7 +139,7 @@ async def list_mcp_tools( self, state: State, cursor: CursorParam = None, - limit: CursorLimit = 50, + limit: CursorLimit = 50, # lint-allow: magic-numbers -- default page size ) -> PaginatedResponse[dict[str, str]]: """List available MCP signal tools (paginated). @@ -181,7 +181,7 @@ async def list_ab_tests( self, state: State, cursor: CursorParam = None, - limit: CursorLimit = 50, + limit: CursorLimit = 50, # lint-allow: magic-numbers -- default page size ) -> PaginatedResponse[dict[str, Any]]: """List active A/B tests with status and current metrics. @@ -231,7 +231,7 @@ async def list_proposals( self, state: State, cursor: CursorParam = None, - limit: CursorLimit = 50, + limit: CursorLimit = 50, # lint-allow: magic-numbers -- default page size ) -> PaginatedResponse[dict[str, Any]]: """List improvement proposals from the approval store. From 8a92e5c5be0c79acef049f812c07a712f58c1c4a Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 23:23:56 +0200 Subject: [PATCH 11/15] fix(observability): close SEC-1 logger.exception leak in chief_of_staff chat Two pre-PR review fixes: - chief_of_staff/chat.py:253: replace logger.exception(COS_CHAT_FAILED) with logger.error(..., error_type=type(exc).__name__, error=safe_error_description(exc)) so LiteLLM provider exception strings (which can carry api_key / auth_token data) are no longer written verbatim into the structured log via exc_info=True. - api/controllers/meta.py:329-345: read app_state.chief_of_staff_chat and app_state.signals_service into local variables once before the None check so the typed accessors aren't called twice across the ServiceUnavailableError gate. --- src/synthorg/api/controllers/meta.py | 14 +++++++++----- src/synthorg/meta/chief_of_staff/chat.py | 10 +++++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/synthorg/api/controllers/meta.py b/src/synthorg/api/controllers/meta.py index 34140255ca..dc7f5e8eae 100644 --- a/src/synthorg/api/controllers/meta.py +++ b/src/synthorg/api/controllers/meta.py @@ -327,19 +327,23 @@ async def chat( Chat response with answer, sources, and confidence. """ app_state = state.app_state - if not app_state.has_chief_of_staff_chat: + chat_backend = ( + app_state.chief_of_staff_chat if app_state.has_chief_of_staff_chat else None + ) + if chat_backend is None: msg = ( "Chief of Staff chat is not configured. Enable " "``meta.chief_of_staff.chat_enabled`` in settings and " "ensure an LLM provider is registered." ) raise ServiceUnavailableError(msg) - if not app_state.has_signals_service: + signals_service = ( + app_state.signals_service if app_state.has_signals_service else None + ) + if signals_service is None: msg = "SignalsService is not configured; cannot build a snapshot." raise ServiceUnavailableError(msg) - - chat_backend = app_state.chief_of_staff_chat - snapshot = await app_state.signals_service.get_org_snapshot() + snapshot = await signals_service.get_org_snapshot() query = ChatQuery( question=data.question, proposal_id=data.proposal_id, diff --git a/src/synthorg/meta/chief_of_staff/chat.py b/src/synthorg/meta/chief_of_staff/chat.py index 8b2c21a1e0..8d7dd131ee 100644 --- a/src/synthorg/meta/chief_of_staff/chat.py +++ b/src/synthorg/meta/chief_of_staff/chat.py @@ -35,7 +35,7 @@ ImprovementProposal, OrgSignalSnapshot, ) -from synthorg.observability import get_logger +from synthorg.observability import get_logger, safe_error_description from synthorg.observability.events.chief_of_staff import ( COS_CHAT_FAILED, COS_CHAT_QUERY, @@ -249,8 +249,12 @@ async def _call_llm( self._config.chat_model, config=config, ) - except Exception: - logger.exception(COS_CHAT_FAILED) + except Exception as exc: + logger.error( + COS_CHAT_FAILED, + error_type=type(exc).__name__, + error=safe_error_description(exc), + ) raise answer = (response.content or "").strip() if not answer: From b9002c769a117f6b556b49fc8c6daf4f5391769e Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 23:45:24 +0200 Subject: [PATCH 12/15] fix: babysit round 1, 3 findings (3 coderabbit) - app.py: guard _wire_chief_of_staff_chat with has_chief_of_staff_chat for lifespan-restart idempotence - engine/strategy/context.py: comment ContextSource.COMPOSITE scaffolding intent - hr/promotion/config.py: annotate seniority_model_map as Mapping[str, str] to match MappingProxyType runtime wrap Skipped 2 gemini-code-assist findings (context.py:121, 201) -- claimed comma-separated except is a SyntaxError, but project targets Python 3.14 with PEP 758 unparenthesized multi-exception clauses (no 'as' binding). Per project CLAUDE.md and CodeRabbit's repo config: 'except A, B:' is correct and mandatory. --- src/synthorg/api/app.py | 5 +++++ src/synthorg/engine/strategy/context.py | 7 +++++++ src/synthorg/hr/promotion/config.py | 3 ++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/synthorg/api/app.py b/src/synthorg/api/app.py index 24c8dd19f2..f08fc0e9cd 100644 --- a/src/synthorg/api/app.py +++ b/src/synthorg/api/app.py @@ -966,6 +966,11 @@ async def _wire_chief_of_staff_chat() -> None: # ``chief_of_staff.chat_enabled`` AND a provider is registered. # When unwired, ``POST /meta/chat`` surfaces 503 rather than the # silent placeholder it returned previously. + # Idempotent: a re-entry of lifespan startup against the same + # ``AppState`` (e.g. ASGI restart in tests) would otherwise make + # the one-shot ``set_chief_of_staff_chat`` raise. + if app_state.has_chief_of_staff_chat: + return if provider_registry is None: return from synthorg.meta.config import ( # noqa: PLC0415 diff --git a/src/synthorg/engine/strategy/context.py b/src/synthorg/engine/strategy/context.py index 5c43b9494b..fe59be23ee 100644 --- a/src/synthorg/engine/strategy/context.py +++ b/src/synthorg/engine/strategy/context.py @@ -244,6 +244,13 @@ async def build_context( memory_backend=memory_backend, ) elif config.context.source == ContextSource.COMPOSITE: + # Scaffolding for future multi-provider chains (e.g. policy / + # market-data overrides layered on top of memory). Today the + # tuple holds only ``MemoryContextProvider`` -- which already + # falls back to ``ConfigContextProvider`` -- so the composite + # wrapper is a no-op semantically. Keep it so adding new + # providers is a one-line tuple extension rather than a control + # flow change. provider = CompositeContextProvider( providers=( MemoryContextProvider( diff --git a/src/synthorg/hr/promotion/config.py b/src/synthorg/hr/promotion/config.py index 9960745447..3f97bba15a 100644 --- a/src/synthorg/hr/promotion/config.py +++ b/src/synthorg/hr/promotion/config.py @@ -5,6 +5,7 @@ """ import copy +from collections.abc import Mapping # noqa: TC003 from types import MappingProxyType from typing import Self @@ -77,7 +78,7 @@ class ModelMappingConfig(BaseModel): default=True, description="Whether model follows seniority level", ) - seniority_model_map: dict[str, str] = Field( + seniority_model_map: Mapping[str, str] = Field( default_factory=dict, description="Explicit seniority level to model ID overrides " "(wrapped as MappingProxyType after validation)", From e5d94ac83068e508769abbd70ec60b48baef3ce3 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sun, 10 May 2026 00:19:48 +0200 Subject: [PATCH 13/15] fix: babysit round 2, 7 findings (7 coderabbit) - meta.py: log META_CHAT_DEPENDENCY_UNAVAILABLE warning before raising 503 from /meta/chat (chat backend and signals service guards). - engine/strategy/context.py: drop redundant STRATEGY_CONTEXT_BUILT log on no-backend branch (fallback emits its own); validate memory payload via _StrategicContextOverridesArgs Pydantic model with strict and extra=ignore (typed boundary, kept in-module to avoid an api into engine layer crossing). - observability/events/meta.py: add META_CHAT_DEPENDENCY_UNAVAILABLE event constant. - app_builders.py: hoist CostTracker, ChiefOfStaffChat, ChiefOfStaffConfig, and ProviderRegistry imports out of TYPE_CHECKING for PEP 649 runtime resolution (matches app_helpers.py precedent). - test_code_modification_prompt.py: AST match for CompletionConfig kwargs instead of substring contains. - test_supervisor_router_prompt.py: AST walk for complete kwarg call sites instead of source.count. - test_context.py: rename to test_falls_back_when_any_field_is_blank_or_non_string for whole-payload-rejection contract; add test_ignores_unknown_payload_fields and test_composite_source_with_backend_applies_overrides. Skipped 1 coderabbit finding (test_appliers.py:815-828, add tuple coverage for tool_access) as factually wrong: ArchitectureChange.payload is JSON-typed and the upstream Pydantic validator rejects tuples with 'input was not a valid JSON value' before reaching the list-or-tuple branch in _validate_tool_access. Tuple inputs are unreachable in practice; the defensive isinstance check is dead code. --- src/synthorg/api/app_builders.py | 26 +++++++--- src/synthorg/api/controllers/meta.py | 17 ++++++- src/synthorg/engine/strategy/context.py | 46 +++++++++++------ src/synthorg/observability/events/meta.py | 11 +++++ .../prompt/test_code_modification_prompt.py | 36 ++++++++++++-- .../prompt/test_supervisor_router_prompt.py | 31 +++++++++--- tests/unit/engine/strategy/test_context.py | 49 +++++++++++++++++-- tests/unit/meta/test_appliers.py | 7 +++ 8 files changed, 187 insertions(+), 36 deletions(-) diff --git a/src/synthorg/api/app_builders.py b/src/synthorg/api/app_builders.py index 4a1f6217dc..8aec467bb8 100644 --- a/src/synthorg/api/app_builders.py +++ b/src/synthorg/api/app_builders.py @@ -10,23 +10,39 @@ from pathlib import Path, PurePath from typing import TYPE_CHECKING +# These four types appear in the public signatures of +# ``build_chief_of_staff_chat`` and ``_resolve_llm_judge_strategy``. +# Under PEP 649 lazy annotations, ``typing.get_type_hints()`` resolves +# annotation names against module globals at introspection time, so the +# imports must be runtime-resolvable; keeping them under TYPE_CHECKING +# would raise ``NameError`` for any caller (Litestar's plugin loader, +# doc generators, type checkers) that introspects the annotation +# surface. Same pattern as ``app_helpers.py``'s ChannelsPlugin. +from synthorg.budget.tracker import ( + CostTracker, # noqa: TC001 -- runtime-resolvable annotation for PEP 649 +) +from synthorg.meta.chief_of_staff.chat import ( + ChiefOfStaffChat, +) +from synthorg.meta.chief_of_staff.config import ( + ChiefOfStaffConfig, # noqa: TC001 -- runtime-resolvable annotation for PEP 649 +) from synthorg.observability import get_logger, safe_error_description from synthorg.observability.config import DEFAULT_SINKS, LogConfig from synthorg.observability.events.api import ( API_APP_STARTUP, API_MEMORY_DIR_TMPROOT_FALLBACK, ) +from synthorg.providers.registry import ( + ProviderRegistry, # noqa: TC001 -- runtime-resolvable annotation for PEP 649 +) from synthorg.telemetry import TelemetryCollector, TelemetryConfig if TYPE_CHECKING: - from synthorg.budget.tracker import CostTracker from synthorg.config.schema import RootConfig from synthorg.hr.performance.config import PerformanceConfig from synthorg.hr.performance.quality_protocol import QualityScoringStrategy from synthorg.hr.performance.tracker import PerformanceTracker - from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat - from synthorg.meta.chief_of_staff.config import ChiefOfStaffConfig - from synthorg.providers.registry import ProviderRegistry from synthorg.security.trust.service import TrustService logger = get_logger(__name__) @@ -137,8 +153,6 @@ def build_chief_of_staff_chat( judge: the first registered provider, since the chat model name in config is provider-agnostic. """ - from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat # noqa: PLC0415 - if not chief_of_staff_config.chat_enabled: return None diff --git a/src/synthorg/api/controllers/meta.py b/src/synthorg/api/controllers/meta.py index dc7f5e8eae..5b8b92c59c 100644 --- a/src/synthorg/api/controllers/meta.py +++ b/src/synthorg/api/controllers/meta.py @@ -21,7 +21,10 @@ from synthorg.meta.mcp.server import get_server_config from synthorg.meta.mcp.tools import get_tool_definitions from synthorg.observability import get_logger, safe_error_description -from synthorg.observability.events.meta import META_CUSTOM_RULE_LIST_FAILED +from synthorg.observability.events.meta import ( + META_CHAT_DEPENDENCY_UNAVAILABLE, + META_CUSTOM_RULE_LIST_FAILED, +) class ChatRequest(BaseModel): @@ -331,6 +334,13 @@ async def chat( app_state.chief_of_staff_chat if app_state.has_chief_of_staff_chat else None ) if chat_backend is None: + logger.warning( + META_CHAT_DEPENDENCY_UNAVAILABLE, + dependency="chief_of_staff_chat", + hint=( + "Set meta.chief_of_staff.chat_enabled and register an LLM provider." + ), + ) msg = ( "Chief of Staff chat is not configured. Enable " "``meta.chief_of_staff.chat_enabled`` in settings and " @@ -341,6 +351,11 @@ async def chat( app_state.signals_service if app_state.has_signals_service else None ) if signals_service is None: + logger.warning( + META_CHAT_DEPENDENCY_UNAVAILABLE, + dependency="signals_service", + hint="SignalsService must be wired during AppState startup.", + ) msg = "SignalsService is not configured; cannot build a snapshot." raise ServiceUnavailableError(msg) snapshot = await signals_service.get_org_snapshot() diff --git a/src/synthorg/engine/strategy/context.py b/src/synthorg/engine/strategy/context.py index fe59be23ee..124796e0ce 100644 --- a/src/synthorg/engine/strategy/context.py +++ b/src/synthorg/engine/strategy/context.py @@ -9,6 +9,8 @@ import json from typing import Protocol, runtime_checkable +from pydantic import BaseModel, ConfigDict, ValidationError + from synthorg.core.enums import MemoryCategory from synthorg.core.types import NotBlankStr from synthorg.engine.strategy.models import StrategicContext, StrategyConfig @@ -29,11 +31,27 @@ _STRATEGIC_CONTEXT_TAG: NotBlankStr = NotBlankStr("strategic-context") """Tag the memory backend filters on for strategic-context entries.""" -_OVERRIDABLE_FIELDS: tuple[str, ...] = ( - "maturity_stage", - "industry", - "competitive_position", -) + +class _StrategicContextOverridesArgs(BaseModel): + """Typed-boundary validator for memory-stored strategic-context overrides. + + The memory backend yields untrusted JSON; this args model is the + boundary that turns that payload into typed overrides. Behaves like + :func:`synthorg.api.boundary.parse_typed` (validate, log on failure, + re-raise) but lives in-module so ``engine.strategy`` does not depend + on the ``api`` layer. + + Each override field is ``NotBlankStr`` so blank / non-string values + reject the payload entirely; callers fall back to the no-override + path on :class:`pydantic.ValidationError`. ``extra="ignore"`` keeps + the boundary forward-compatible with future enrichment fields. + """ + + model_config = ConfigDict(frozen=True, extra="ignore", strict=True) + + maturity_stage: NotBlankStr | None = None + industry: NotBlankStr | None = None + competitive_position: NotBlankStr | None = None @runtime_checkable @@ -106,7 +124,6 @@ def __init__( async def provide(self, *, config: StrategyConfig) -> StrategicContext: # noqa: PLR0911 """Layer memory-stored overrides on top of the fallback context.""" if self._memory_backend is None: - logger.debug(STRATEGY_CONTEXT_BUILT, source="memory_no_backend") return await self._fallback.provide(config=config) try: @@ -138,7 +155,7 @@ async def provide(self, *, config: StrategyConfig) -> StrategicContext: # noqa: return await self._fallback.provide(config=config) try: - payload = json.loads(entries[0].content) + decoded = json.loads(entries[0].content) except json.JSONDecodeError as exc: logger.warning( STRATEGY_CONTEXT_PROVIDER_FAILED, @@ -149,21 +166,20 @@ async def provide(self, *, config: StrategyConfig) -> StrategicContext: # noqa: ) return await self._fallback.provide(config=config) - if not isinstance(payload, dict): + try: + args = _StrategicContextOverridesArgs.model_validate(decoded) + except ValidationError as exc: logger.warning( STRATEGY_CONTEXT_PROVIDER_FAILED, provider_name="MemoryContextProvider", - stage="parse", - reason="content_not_object", + stage="validate", + error_type=type(exc).__name__, + error=safe_error_description(exc), ) return await self._fallback.provide(config=config) + overrides = args.model_dump(exclude_none=True) fallback_ctx = await self._fallback.provide(config=config) - overrides: dict[str, str] = {} - for field_name in _OVERRIDABLE_FIELDS: - value = payload.get(field_name) - if isinstance(value, str) and value.strip(): - overrides[field_name] = value if not overrides: return fallback_ctx logger.info( diff --git a/src/synthorg/observability/events/meta.py b/src/synthorg/observability/events/meta.py index cc547b334c..70ee73ac55 100644 --- a/src/synthorg/observability/events/meta.py +++ b/src/synthorg/observability/events/meta.py @@ -170,3 +170,14 @@ # defaults. Always logged at WARNING so operators can audit silent # fallbacks. META_SELF_IMPROVEMENT_LOAD_FAILED: Final[str] = "meta.self_improvement.load_failed" + +# -- Chief of Staff chat endpoint diagnostics ------------------------------ + +# Emitted at WARNING when ``POST /meta/chat`` cannot be served because a +# required service (chat backend or signals service) is not wired into +# the AppState. These guards run hand-rolled rather than through +# ``AppState._require_service`` because the chat endpoint must surface +# 503 (service unavailable) rather than the generic 500 the central +# accessor produces -- but operators still need the same level of audit +# visibility. +META_CHAT_DEPENDENCY_UNAVAILABLE: Final[str] = "meta.chat.dependency_unavailable" diff --git a/tests/evals/prompt/test_code_modification_prompt.py b/tests/evals/prompt/test_code_modification_prompt.py index 81155e6f0a..c4f9d587f6 100644 --- a/tests/evals/prompt/test_code_modification_prompt.py +++ b/tests/evals/prompt/test_code_modification_prompt.py @@ -13,6 +13,7 @@ Reference: ``synthorg.meta.strategies.code_modification``. """ +import ast import inspect import pytest @@ -34,20 +35,45 @@ def test_temperature_is_config_driven(self) -> None: We deliberately avoid pinning the value to ``0.0`` here because operations may tune the temperature for code generation in production. The contract is "config-driven, not literal". + + Uses AST matching rather than substring search so a refactor + that splits the keyword across lines, adds a comment, or aliases + ``self._code_config`` cannot quietly slip past the gate. """ from synthorg.meta.strategies.code_modification import ( CodeModificationStrategy, ) source = inspect.getsource(CodeModificationStrategy) - assert "temperature=self._code_config.temperature" in source, ( - "CodeModificationStrategy must build CompletionConfig from " + tree = ast.parse(source) + config_calls = [ + node + for node in ast.walk(tree) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "CompletionConfig" + ] + assert config_calls, ( + "expected at least one CompletionConfig(...) call in " + "CodeModificationStrategy" + ) + call = config_calls[0] + assert any( + kw.arg == "temperature" + and ast.unparse(kw.value) == "self._code_config.temperature" + for kw in call.keywords + ), ( + "CompletionConfig.temperature must come from " "self._code_config.temperature so the value stays runtime-" "tunable rather than hardcoded" ) - assert "max_tokens=self._code_config.max_tokens" in source, ( - "CodeModificationStrategy must read max_tokens from " - "self._code_config so token budgets stay tunable" + assert any( + kw.arg == "max_tokens" + and ast.unparse(kw.value) == "self._code_config.max_tokens" + for kw in call.keywords + ), ( + "CompletionConfig.max_tokens must come from " + "self._code_config.max_tokens so token budgets stay tunable" ) def test_system_prompt_fingerprint_is_pinned(self) -> None: diff --git a/tests/evals/prompt/test_supervisor_router_prompt.py b/tests/evals/prompt/test_supervisor_router_prompt.py index 52b2d63f17..b0e4b46c13 100644 --- a/tests/evals/prompt/test_supervisor_router_prompt.py +++ b/tests/evals/prompt/test_supervisor_router_prompt.py @@ -13,6 +13,7 @@ config inline fails the call-site test. """ +import ast import inspect import pytest @@ -65,18 +66,36 @@ def test_retry_prompt_fingerprint_is_pinned(self) -> None: def test_call_sites_use_pinned_config(self) -> None: """Both LLM call sites must pass the module-level pinned config. - AST-level check is intentionally simple (``config=`` keyword - bound to the constant name) because the surface is small and - any cleverer refactor that constructs a fresh CompletionConfig - inline should be caught by ``test_temperature_is_zero`` - regressing or this assertion failing. + Walks the AST for every ``.complete(...)`` call (or bare + ``complete(...)`` if a future refactor unwraps the attribute + access) and counts those whose ``config=`` keyword binds to the + ``_ROUTING_COMPLETION_CONFIG`` Name node. Substring counting + gave false positives on stray references in comments or + docstrings; this AST check only counts real call-site bindings. """ from synthorg.memory.retrieval.hierarchical import supervisor source = inspect.getsource(supervisor) + tree = ast.parse(source) + occurrences = 0 + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + func = node.func + is_complete_call = ( + isinstance(func, ast.Attribute) and func.attr == "complete" + ) or (isinstance(func, ast.Name) and func.id == "complete") + if not is_complete_call: + continue + if any( + kw.arg == "config" + and isinstance(kw.value, ast.Name) + and kw.value.id == "_ROUTING_COMPLETION_CONFIG" + for kw in node.keywords + ): + occurrences += 1 # Two distinct call sites: ``_route_via_llm`` and # ``_evaluate_via_llm``. Each must pass the pinned config. - occurrences = source.count("config=_ROUTING_COMPLETION_CONFIG") assert occurrences >= 2, ( "expected both supervisor.complete() call sites to pass " f"config=_ROUTING_COMPLETION_CONFIG; found {occurrences}" diff --git a/tests/unit/engine/strategy/test_context.py b/tests/unit/engine/strategy/test_context.py index 363ad9f10b..59859dddf0 100644 --- a/tests/unit/engine/strategy/test_context.py +++ b/tests/unit/engine/strategy/test_context.py @@ -176,10 +176,15 @@ async def test_applies_partial_overrides( assert ctx.competitive_position == "challenger" @pytest.mark.unit - async def test_ignores_blank_or_non_string_overrides( + async def test_falls_back_when_any_field_is_blank_or_non_string( self, default_strategy_config: StrategyConfig, ) -> None: + # Strict args-model validation rejects the whole payload as soon + # as any field is blank or non-string, so even the otherwise- + # valid ``competitive_position`` is dropped. This is the right + # contract for boundary validation: garbage in -> total fall + # back, no partial application. payload = { "maturity_stage": " ", "industry": 42, @@ -194,7 +199,28 @@ async def test_ignores_blank_or_non_string_overrides( ctx = await provider.provide(config=default_strategy_config) assert ctx.maturity_stage == "growth" assert ctx.industry == "technology" - assert ctx.competitive_position == "leader" + assert ctx.competitive_position == "challenger" + + @pytest.mark.unit + async def test_ignores_unknown_payload_fields( + self, + default_strategy_config: StrategyConfig, + ) -> None: + # Forward-compatible: extra fields the args model doesn't know + # about are dropped silently, the rest still apply. + payload = { + "maturity_stage": "scaleup", + "future_field": {"experimental": True}, + } + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = (_entry(json.dumps(payload)),) + provider = MemoryContextProvider( + fallback=ConfigContextProvider(), + memory_backend=backend, + ) + ctx = await provider.provide(config=default_strategy_config) + assert ctx.maturity_stage == "scaleup" + assert ctx.industry == "technology" class TestCompositeContextProvider: @@ -286,9 +312,26 @@ async def test_memory_source_with_backend_applies_overrides(self) -> None: assert ctx.maturity_stage == "scaleup" @pytest.mark.unit - async def test_composite_source(self) -> None: + async def test_composite_source_without_backend_falls_back(self) -> None: config = StrategyConfig( context=StrategicContextConfig(source=ContextSource.COMPOSITE), ) ctx = await build_context(config) assert ctx.maturity_stage == "growth" + + @pytest.mark.unit + async def test_composite_source_with_backend_applies_overrides(self) -> None: + # Pins the COMPOSITE branch end-to-end with a real backend so a + # regression in ``CompositeContextProvider`` (e.g. wrapping the + # wrong fallback chain) fails this test rather than slipping + # through under the no-backend degraded path. + config = StrategyConfig( + context=StrategicContextConfig(source=ContextSource.COMPOSITE), + ) + backend = AsyncMock(spec=MemoryBackend) + backend.retrieve.return_value = ( + _entry(json.dumps({"maturity_stage": "scaleup"})), + ) + ctx = await build_context(config, memory_backend=backend) + assert ctx.maturity_stage == "scaleup" + assert ctx.industry == "technology" diff --git a/tests/unit/meta/test_appliers.py b/tests/unit/meta/test_appliers.py index 57c32062a8..533eb8e47f 100644 --- a/tests/unit/meta/test_appliers.py +++ b/tests/unit/meta/test_appliers.py @@ -813,6 +813,13 @@ async def test_dry_run_tool_access_rejects( assert expected in (result.error_message or "") async def test_dry_run_tool_access_valid_passes(self) -> None: + # ``ArchitectureChange.payload`` is JSON-typed, so the + # ``_validate_tool_access`` ``isinstance(list | tuple)`` branch + # is unreachable for tuples in practice -- the upstream Pydantic + # boundary rejects them with ``input was not a valid JSON + # value`` before the applier sees the payload. Only the list + # path is tested because that's the only path real callers can + # exercise. applier = ArchitectureApplier(context=_FakeArchContext()) proposal = _proposal_architecture( _arch( From 1f708612a41ec0d0efae21fb4e0fafbe7a2e2f6c Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sun, 10 May 2026 00:50:34 +0200 Subject: [PATCH 14/15] fix: babysit round 3, 7 findings (1 ci, 6 coderabbit) - app_builders.py: revert TYPE_CHECKING hoist from round 2 (the hoist created a circular import 'cannot import name CostRecord from partially initialized module synthorg.budget.cost_record' that broke OpenAPI export, Build Web Assets, Build Backend, Lighthouse, Build Preview, and the CI Pass umbrella check); document why the imports stay lazy. - observability/events/meta.py: correct META_CHAT_DEPENDENCY_UNAVAILABLE rationale (AppState._require_service raises ServiceUnavailableError 503, not generic 500). - api/controllers/meta.py: set status_code=200 on POST /chat (query-only endpoint, no resource creation). - api/state_services_facades_mcp3.py: correct mixin docstring from five to six services (chief_of_staff_chat). - engine/strategy/context.py: route memory payload validation through synthorg.api.boundary.parse_typed under the memory.strategic_context boundary label, so failures emit API_BOUNDARY_VALIDATION_FAILED alongside the provider's own STRATEGY_CONTEXT_PROVIDER_FAILED log. - tests/unit/api/controllers/test_meta_chat.py: assert chat_mock.ask was awaited with a ChatQuery carrying question and proposal_id and alert_id, and that the snapshot is the one signals_mock returned; update status_code expectation to 200. - tests/evals/prompt/test_procedural_memory_prompt.py: AST hardening for both CompletionConfig kwargs and the propose call site (matches the pattern used in test_code_modification_prompt.py and test_supervisor_router_prompt.py); textwrap.dedent the method source before ast.parse. --- src/synthorg/api/app_builders.py | 37 ++++++------- src/synthorg/api/controllers/meta.py | 5 ++ .../api/state_services_facades_mcp3.py | 7 ++- src/synthorg/engine/strategy/context.py | 18 ++++-- src/synthorg/observability/events/meta.py | 8 +-- .../prompt/test_procedural_memory_prompt.py | 55 +++++++++++++++++-- tests/unit/api/controllers/test_meta_chat.py | 26 +++++++-- 7 files changed, 116 insertions(+), 40 deletions(-) diff --git a/src/synthorg/api/app_builders.py b/src/synthorg/api/app_builders.py index 8aec467bb8..1dd5a5bf9d 100644 --- a/src/synthorg/api/app_builders.py +++ b/src/synthorg/api/app_builders.py @@ -10,39 +10,34 @@ from pathlib import Path, PurePath from typing import TYPE_CHECKING -# These four types appear in the public signatures of -# ``build_chief_of_staff_chat`` and ``_resolve_llm_judge_strategy``. -# Under PEP 649 lazy annotations, ``typing.get_type_hints()`` resolves -# annotation names against module globals at introspection time, so the -# imports must be runtime-resolvable; keeping them under TYPE_CHECKING -# would raise ``NameError`` for any caller (Litestar's plugin loader, -# doc generators, type checkers) that introspects the annotation -# surface. Same pattern as ``app_helpers.py``'s ChannelsPlugin. -from synthorg.budget.tracker import ( - CostTracker, # noqa: TC001 -- runtime-resolvable annotation for PEP 649 -) -from synthorg.meta.chief_of_staff.chat import ( - ChiefOfStaffChat, -) -from synthorg.meta.chief_of_staff.config import ( - ChiefOfStaffConfig, # noqa: TC001 -- runtime-resolvable annotation for PEP 649 -) from synthorg.observability import get_logger, safe_error_description from synthorg.observability.config import DEFAULT_SINKS, LogConfig from synthorg.observability.events.api import ( API_APP_STARTUP, API_MEMORY_DIR_TMPROOT_FALLBACK, ) -from synthorg.providers.registry import ( - ProviderRegistry, # noqa: TC001 -- runtime-resolvable annotation for PEP 649 -) from synthorg.telemetry import TelemetryCollector, TelemetryConfig +# All four of ``CostTracker`` / ``ChiefOfStaffChat`` / ``ChiefOfStaffConfig`` +# / ``ProviderRegistry`` are imported lazily under TYPE_CHECKING. Hoisting +# them to runtime imports created a circular import via the budget / +# observability chain (``cannot import name 'CostRecord' from partially +# initialized module 'synthorg.budget.cost_record'``). Under PEP 649 the +# annotations are stored as code objects and only evaluated when +# ``typing.get_type_hints()`` runs against this module -- which Litestar's +# route discovery does for handler signatures, not for the helpers below +# (private prefix or non-handler). ``ChiefOfStaffChat`` is also imported +# in-function below for the constructor call site, so the runtime +# constructor reference is independent of the annotation surface. if TYPE_CHECKING: + from synthorg.budget.tracker import CostTracker from synthorg.config.schema import RootConfig from synthorg.hr.performance.config import PerformanceConfig from synthorg.hr.performance.quality_protocol import QualityScoringStrategy from synthorg.hr.performance.tracker import PerformanceTracker + from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat + from synthorg.meta.chief_of_staff.config import ChiefOfStaffConfig + from synthorg.providers.registry import ProviderRegistry from synthorg.security.trust.service import TrustService logger = get_logger(__name__) @@ -153,6 +148,8 @@ def build_chief_of_staff_chat( judge: the first registered provider, since the chat model name in config is provider-agnostic. """ + from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat # noqa: PLC0415 + if not chief_of_staff_config.chat_enabled: return None diff --git a/src/synthorg/api/controllers/meta.py b/src/synthorg/api/controllers/meta.py index 5b8b92c59c..24f7817e98 100644 --- a/src/synthorg/api/controllers/meta.py +++ b/src/synthorg/api/controllers/meta.py @@ -305,6 +305,11 @@ async def get_signals( @post( "/chat", + # Query-only endpoint: routes a question to ChiefOfStaffChat + # and returns the computed answer. No server resource is + # created, so 200 OK is the right status; without this Litestar + # defaults POST handlers to 201 Created. + status_code=200, guards=[ require_org_mutation(), per_op_rate_limit_from_policy("meta.chat", key="user"), diff --git a/src/synthorg/api/state_services_facades_mcp3.py b/src/synthorg/api/state_services_facades_mcp3.py index 9d3881c82b..d350197e0b 100644 --- a/src/synthorg/api/state_services_facades_mcp3.py +++ b/src/synthorg/api/state_services_facades_mcp3.py @@ -39,7 +39,12 @@ class _MetaMcp3FacadesMixin: - """Facade accessors for the five META-MCP-3 services.""" + """Facade accessors for the six META-MCP-3 services. + + Covers the five original META-MCP-3 surfaces plus + ``chief_of_staff_chat``, the LLM-backed chat backend wired in for + ``POST /meta/chat``. + """ _set_once: Any diff --git a/src/synthorg/engine/strategy/context.py b/src/synthorg/engine/strategy/context.py index 124796e0ce..fdc5001b3c 100644 --- a/src/synthorg/engine/strategy/context.py +++ b/src/synthorg/engine/strategy/context.py @@ -11,6 +11,7 @@ from pydantic import BaseModel, ConfigDict, ValidationError +from synthorg.api.boundary import parse_typed from synthorg.core.enums import MemoryCategory from synthorg.core.types import NotBlankStr from synthorg.engine.strategy.models import StrategicContext, StrategyConfig @@ -33,13 +34,14 @@ class _StrategicContextOverridesArgs(BaseModel): - """Typed-boundary validator for memory-stored strategic-context overrides. + """Typed-boundary args model for memory-stored context overrides. The memory backend yields untrusted JSON; this args model is the - boundary that turns that payload into typed overrides. Behaves like - :func:`synthorg.api.boundary.parse_typed` (validate, log on failure, - re-raise) but lives in-module so ``engine.strategy`` does not depend - on the ``api`` layer. + boundary contract that turns that payload into typed overrides. + Validated via :func:`synthorg.api.boundary.parse_typed` under the + ``memory.strategic_context`` boundary label so failures emit the + standard ``API_BOUNDARY_VALIDATION_FAILED`` log alongside the + provider's own ``STRATEGY_CONTEXT_PROVIDER_FAILED`` log. Each override field is ``NotBlankStr`` so blank / non-string values reject the payload entirely; callers fall back to the no-override @@ -167,7 +169,11 @@ async def provide(self, *, config: StrategyConfig) -> StrategicContext: # noqa: return await self._fallback.provide(config=config) try: - args = _StrategicContextOverridesArgs.model_validate(decoded) + args = parse_typed( + "memory.strategic_context", + decoded, + _StrategicContextOverridesArgs, + ) except ValidationError as exc: logger.warning( STRATEGY_CONTEXT_PROVIDER_FAILED, diff --git a/src/synthorg/observability/events/meta.py b/src/synthorg/observability/events/meta.py index 70ee73ac55..a963c67e7e 100644 --- a/src/synthorg/observability/events/meta.py +++ b/src/synthorg/observability/events/meta.py @@ -176,8 +176,8 @@ # Emitted at WARNING when ``POST /meta/chat`` cannot be served because a # required service (chat backend or signals service) is not wired into # the AppState. These guards run hand-rolled rather than through -# ``AppState._require_service`` because the chat endpoint must surface -# 503 (service unavailable) rather than the generic 500 the central -# accessor produces -- but operators still need the same level of audit -# visibility. +# ``AppState._require_service`` so the endpoint can emit +# ``META_CHAT_DEPENDENCY_UNAVAILABLE`` with dependency-specific context +# (which dependency is missing, plus a fix hint) before +# ``ServiceUnavailableError`` propagates and the response surfaces 503. META_CHAT_DEPENDENCY_UNAVAILABLE: Final[str] = "meta.chat.dependency_unavailable" diff --git a/tests/evals/prompt/test_procedural_memory_prompt.py b/tests/evals/prompt/test_procedural_memory_prompt.py index aa5bb6559c..f321388f93 100644 --- a/tests/evals/prompt/test_procedural_memory_prompt.py +++ b/tests/evals/prompt/test_procedural_memory_prompt.py @@ -16,7 +16,9 @@ Reference: ``synthorg.memory.procedural.proposer``. """ +import ast import inspect +import textwrap import pytest @@ -38,15 +40,38 @@ def test_completion_config_is_built_from_config(self) -> None: the temperature for failure analysis. The contract is that the CompletionConfig is built from ``ProceduralMemoryConfig``, not hardcoded. + + Uses AST matching rather than substring search so a refactor + that splits the keyword across lines, adds a comment, or + aliases ``config`` cannot quietly slip past the gate. """ from synthorg.memory.procedural.proposer import ProceduralMemoryProposer source = inspect.getsource(ProceduralMemoryProposer) - assert "temperature=config.temperature" in source, ( + tree = ast.parse(source) + config_calls = [ + node + for node in ast.walk(tree) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "CompletionConfig" + ] + assert config_calls, ( + "expected at least one CompletionConfig(...) call in " + "ProceduralMemoryProposer" + ) + call = config_calls[0] + assert any( + kw.arg == "temperature" and ast.unparse(kw.value) == "config.temperature" + for kw in call.keywords + ), ( "ProceduralMemoryProposer must build CompletionConfig from " "config.temperature so the value stays runtime-tunable" ) - assert "max_tokens=config.max_tokens" in source, ( + assert any( + kw.arg == "max_tokens" and ast.unparse(kw.value) == "config.max_tokens" + for kw in call.keywords + ), ( "ProceduralMemoryProposer must read max_tokens from " "ProceduralMemoryConfig so token budgets stay tunable" ) @@ -86,12 +111,32 @@ def test_propose_call_site_uses_pinned_completion_config(self) -> None: The construction-time check above proves the field is built correctly. This call-site check refuses any future refactor - that constructs a fresh CompletionConfig inline. + that constructs a fresh CompletionConfig inline. AST walk + instead of substring so a stray reference in a comment or + docstring cannot satisfy the assertion. """ from synthorg.memory.procedural.proposer import ProceduralMemoryProposer - source = inspect.getsource(ProceduralMemoryProposer.propose) - assert "config=self._completion_config" in source, ( + # ``inspect.getsource`` of a method preserves the class-body + # indentation, which ``ast.parse`` rejects as an unexpected + # indent. Dedent before parsing. + source = textwrap.dedent(inspect.getsource(ProceduralMemoryProposer.propose)) + tree = ast.parse(source) + complete_calls = [ + node + for node in ast.walk(tree) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Attribute) + and node.func.attr == "complete" + ] + assert any( + any( + kw.arg == "config" + and ast.unparse(kw.value) == "self._completion_config" + for kw in call.keywords + ) + for call in complete_calls + ), ( "ProceduralMemoryProposer.propose must pass " "config=self._completion_config to provider.complete; a " "fresh inline CompletionConfig() construction would defeat " diff --git a/tests/unit/api/controllers/test_meta_chat.py b/tests/unit/api/controllers/test_meta_chat.py index 4031ac8f49..87a0661963 100644 --- a/tests/unit/api/controllers/test_meta_chat.py +++ b/tests/unit/api/controllers/test_meta_chat.py @@ -7,7 +7,7 @@ from litestar.testing import TestClient from synthorg.meta.chief_of_staff.chat import ChiefOfStaffChat -from synthorg.meta.chief_of_staff.models import ChatResponse +from synthorg.meta.chief_of_staff.models import ChatQuery, ChatResponse from synthorg.meta.models import ( OrgBudgetSummary, OrgCoordinationSummary, @@ -80,13 +80,16 @@ async def test_returns_chat_payload_when_wired( ) -> None: """Wired chat backend returns the answer + sources + confidence.""" chat_mock = AsyncMock(spec=ChiefOfStaffChat) + proposal_id = "11111111-1111-1111-1111-111111111111" + alert_id = "22222222-2222-2222-2222-222222222222" chat_mock.ask.return_value = ChatResponse( answer="Quality is up 5%.", sources=("performance",), confidence=0.8, ) signals_mock = AsyncMock(spec=SignalsService) - signals_mock.get_org_snapshot.return_value = _empty_snapshot() + expected_snapshot = _empty_snapshot() + signals_mock.get_org_snapshot.return_value = expected_snapshot app_state = test_client.app.state.app_state chat_original = app_state._chief_of_staff_chat @@ -97,14 +100,29 @@ async def test_returns_chat_payload_when_wired( resp = test_client.post( _BASE, headers=_HEADERS, - json={"question": "How is quality trending?"}, + json={ + "question": "How is quality trending?", + "proposal_id": proposal_id, + "alert_id": alert_id, + }, ) - assert resp.status_code == 201 + assert resp.status_code == 200 body = resp.json() assert body["success"] is True assert body["data"]["answer"] == "Quality is up 5%." assert body["data"]["sources"] == ["performance"] assert body["data"]["confidence"] == pytest.approx(0.8) + # Without these the controller could silently drop + # ``proposal_id`` / ``alert_id`` or swap the ``ask()`` args + # and the payload-only checks above would still pass. + signals_mock.get_org_snapshot.assert_awaited_once_with() + chat_mock.ask.assert_awaited_once() + asked_query, asked_snapshot = chat_mock.ask.await_args.args + assert isinstance(asked_query, ChatQuery) + assert asked_query.question == "How is quality trending?" + assert str(asked_query.proposal_id) == proposal_id + assert str(asked_query.alert_id) == alert_id + assert asked_snapshot is expected_snapshot finally: app_state._chief_of_staff_chat = chat_original app_state._signals_service = signals_original From 27100346cc2f963a34db4411709ef29416d8e591 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sun, 10 May 2026 01:11:48 +0200 Subject: [PATCH 15/15] fix: babysit round 4, 2 findings (2 coderabbit) test_procedural_memory_prompt.py AST hardening: search all CompletionConfig calls (not just config_calls[0]) for one carrying both temperature=config.temperature and max_tokens=config.max_tokens; restrict provider.complete matching to nodes whose receiver is provider/self.provider/self._provider so an unrelated .complete() call can't satisfy the assertion. --- .../prompt/test_procedural_memory_prompt.py | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/evals/prompt/test_procedural_memory_prompt.py b/tests/evals/prompt/test_procedural_memory_prompt.py index f321388f93..ea53cb3727 100644 --- a/tests/evals/prompt/test_procedural_memory_prompt.py +++ b/tests/evals/prompt/test_procedural_memory_prompt.py @@ -60,20 +60,26 @@ def test_completion_config_is_built_from_config(self) -> None: "expected at least one CompletionConfig(...) call in " "ProceduralMemoryProposer" ) - call = config_calls[0] + + def _kw(call: ast.Call, name: str) -> str | None: + for kw in call.keywords: + if kw.arg == name: + return ast.unparse(kw.value) + return None + + # Match against any CompletionConfig call carrying BOTH + # required kwargs. Selecting ``config_calls[0]`` would + # silently break if a future refactor introduced a second + # CompletionConfig earlier in the class (e.g. for a retry + # path), so search the full set. assert any( - kw.arg == "temperature" and ast.unparse(kw.value) == "config.temperature" - for kw in call.keywords + _kw(call, "temperature") == "config.temperature" + and _kw(call, "max_tokens") == "config.max_tokens" + for call in config_calls ), ( "ProceduralMemoryProposer must build CompletionConfig from " - "config.temperature so the value stays runtime-tunable" - ) - assert any( - kw.arg == "max_tokens" and ast.unparse(kw.value) == "config.max_tokens" - for kw in call.keywords - ), ( - "ProceduralMemoryProposer must read max_tokens from " - "ProceduralMemoryConfig so token budgets stay tunable" + "config.temperature and config.max_tokens so values remain " + "runtime-tunable" ) def test_system_prompt_fingerprint_is_pinned(self) -> None: @@ -122,20 +128,31 @@ def test_propose_call_site_uses_pinned_completion_config(self) -> None: # indent. Dedent before parsing. source = textwrap.dedent(inspect.getsource(ProceduralMemoryProposer.propose)) tree = ast.parse(source) - complete_calls = [ + # Narrow to provider call sites only -- accept the obvious + # ``provider``, ``self.provider``, and ``self._provider`` + # receivers. Without this, an unrelated ``foo.complete(...)`` + # could satisfy the assertion even after a refactor that drops + # ``self._completion_config`` from the actual provider call. + provider_complete_calls = [ node for node in ast.walk(tree) if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute) and node.func.attr == "complete" + and ast.unparse(node.func.value) + in {"provider", "self.provider", "self._provider"} ] + assert provider_complete_calls, ( + "expected at least one provider.complete(...) call in " + "ProceduralMemoryProposer.propose" + ) assert any( any( kw.arg == "config" and ast.unparse(kw.value) == "self._completion_config" for kw in call.keywords ) - for call in complete_calls + for call in provider_complete_calls ), ( "ProceduralMemoryProposer.propose must pass " "config=self._completion_config to provider.complete; a "