From 137ff3f24d0d770a12432d23ec13a49d429e8a68 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:05:53 +0200 Subject: [PATCH 1/5] test: wire real LiteLLM provider in single-agent e2e test --- tests/e2e/test_single_agent_e2e.py | 69 +++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py index 381efae878..1442ff578f 100644 --- a/tests/e2e/test_single_agent_e2e.py +++ b/tests/e2e/test_single_agent_e2e.py @@ -14,10 +14,12 @@ from pathlib import Path from synthorg.budget.tracker import CostTracker -from synthorg.core.agent import ToolPermissions +from synthorg.config.provider_schema import ProviderConfig, ProviderModelConfig +from synthorg.core.agent import ModelConfig, ToolPermissions from synthorg.core.enums import TaskStatus, ToolAccessLevel from synthorg.engine.agent_engine import AgentEngine from synthorg.engine.loop_protocol import TerminationReason +from synthorg.providers.drivers.litellm_driver import LiteLLMDriver from synthorg.providers.enums import MessageRole from synthorg.providers.models import ToolCall from synthorg.tools.file_system.write_file import WriteFileTool @@ -381,22 +383,69 @@ class TestRealLLMIntegration: """Optional smoke test with a real LLM provider. Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI. - Currently a placeholder -- all methods skip until a real provider - is configured via environment variables. + Each method also requires REAL_LLM_MODEL, REAL_LLM_PROVIDER, and + REAL_LLM_API_KEY so the test can construct a LiteLLM-backed + provider without leaning on app-startup config wiring. """ async def test_real_provider_text_completion(self) -> None: - """Minimal text-only task with a real provider. - - Placeholder for real provider integration; additional - configuration scaffolding is required before this can be enabled. - """ + """Minimal text-only task end-to-end through ``LiteLLMDriver``.""" provider_model = os.environ.get("REAL_LLM_MODEL") if not provider_model: pytest.skip( "Set REAL_LLM_MODEL to a valid model ID " "(e.g. 'example-large-001') to run this test" ) - pytest.skip( - f"Real LLM provider integration not yet wired -- model={provider_model}" + provider_name = os.environ.get("REAL_LLM_PROVIDER") + if not provider_name: + pytest.skip( + "Set REAL_LLM_PROVIDER to a LiteLLM routing key " + "(e.g. 'example-provider') to run this test" + ) + api_key = os.environ.get("REAL_LLM_API_KEY") + if not api_key: + pytest.skip( + "Set REAL_LLM_API_KEY to the provider's API key to run this test" + ) + + provider_config = ProviderConfig( + litellm_provider=provider_name, + api_key=api_key, + models=(ProviderModelConfig(id=provider_model),), ) + provider = LiteLLMDriver(provider_name, provider_config) + + cost_tracker = CostTracker() + identity = make_e2e_identity().model_copy( + update={ + "model": ModelConfig( + provider=provider_name, + model_id=provider_model, + ), + }, + ) + task = make_e2e_task( + identity=identity, + title="Real LLM smoke test", + description="Reply with the single word 'ack'.", + ) + + engine = AgentEngine( + provider=provider, + cost_tracker=cost_tracker, + ) + result = await engine.run( + identity=identity, + task=task, + max_turns=2, + ) + + # Real provider produced a successful single-turn completion. + assert result.is_success is True + assert result.termination_reason == TerminationReason.COMPLETED + assert result.completion_summary + # ``>= 0`` (not ``> 0``) so a local zero-cost preset still passes. + assert result.total_cost >= 0 + assert await cost_tracker.get_record_count() == result.total_turns + assert result.task_id == task.id + assert result.duration_seconds > 0 From 5694937738d786528d29f2933c8eba7dff45da53 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:08:13 +0200 Subject: [PATCH 2/5] test: route meta-cycle proposal approval through real ApprovalStore --- tests/integration/meta/test_meta_cycle.py | 69 ++++++++++++++++++++--- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/tests/integration/meta/test_meta_cycle.py b/tests/integration/meta/test_meta_cycle.py index 1add3a1d80..e428d04bbb 100644 --- a/tests/integration/meta/test_meta_cycle.py +++ b/tests/integration/meta/test_meta_cycle.py @@ -4,9 +4,14 @@ guards -> approval -> rollout -> regression detection. """ +from datetime import UTC, datetime +from uuid import NAMESPACE_URL, uuid5 + import pytest from synthorg.api.approval_store import ApprovalStore +from synthorg.core.enums import ApprovalStatus +from synthorg.core.types import NotBlankStr from synthorg.meta.config import SelfImprovementConfig from synthorg.meta.models import ( OrgBudgetSummary, @@ -107,11 +112,23 @@ async def test_budget_overrun_produces_critical_proposal( assert "budget_overrun" in sources async def test_proposal_rollout_succeeds(self) -> None: - """Scenario: approved proposal -> rollout -> success.""" + """Scenario: approved proposal -> rollout -> success. + + Routes the approval through the real ``ApprovalStore``: the + guard registers an ``ApprovalItem`` during ``run_cycle``, the + test approves it via ``save_if_pending`` (mirroring the API / + MCP approval handlers), and the proposal handed to + ``execute_rollout`` carries the decision metadata that came + back from the store. A regression in + ``ApprovalGateGuard.evaluate`` (e.g. a different deterministic + approval id, or failure to register) makes this test fail at + the ``item is not None`` assert. + """ async def snapshot_builder() -> OrgSignalSnapshot: return _snap(quality=7.5, success=0.85) + approval_store = ApprovalStore() svc = SelfImprovementService( config=SelfImprovementConfig( enabled=True, @@ -119,7 +136,7 @@ async def snapshot_builder() -> OrgSignalSnapshot: ), clock=FakeClock(), snapshot_builder=snapshot_builder, - approval_store=ApprovalStore(), + approval_store=approval_store, ) proposals = await svc.run_cycle(_snap(quality=4.0)) assert len(proposals) >= 1 @@ -129,17 +146,53 @@ async def snapshot_builder() -> OrgSignalSnapshot: if p.source_rule == "quality_declining" and p.altitude == ProposalAltitude.CONFIG_TUNING ) - # Simulate approval via model_copy because the ApprovalStore - # approval flow is not yet integrated in the test harness. - approved = proposal.model_copy( + + # The guard derives the approval id deterministically from + # the proposal id; mirror that derivation so a regression in + # the guard surfaces as a missing approval item rather than a + # silently bypassed flow. + approval_id = NotBlankStr( + str(uuid5(NAMESPACE_URL, f"proposal:{proposal.id}")), + ) + item = await approval_store.get(approval_id) + assert item is not None, ( + "ApprovalGateGuard did not register an approval item for " + "the proposal during run_cycle" + ) + assert item.status == ApprovalStatus.PENDING + + # Approve via the real store: ``save_if_pending`` is the same + # first-writer-wins path the API and MCP approval handlers + # take, so a regression in the store's concurrency model also + # surfaces here. + decided_at = datetime.now(UTC) + decided = item.model_copy( update={ - "status": ProposalStatus.APPROVED, - "decided_at": proposal.proposed_at, + "status": ApprovalStatus.APPROVED, + "decided_at": decided_at, "decided_by": "test-approver", "decision_reason": "Integration test approval", }, ) - result = await svc.execute_rollout(approved) + saved = await approval_store.save_if_pending(decided) + assert saved is not None, "Approval store rejected the pending decision" + assert saved.status == ApprovalStatus.APPROVED + + # Hand ``execute_rollout`` a proposal whose APPROVED state + # mirrors the store-resident decision. The proposal-side + # ``model_copy`` is mechanical -- there is no + # ``ApprovalItem -> ImprovementProposal`` adapter -- but the + # decision metadata flows from the real ``ApprovalStore`` + # round-trip, not from a free-standing test mutation. + approved_proposal = proposal.model_copy( + update={ + "status": ProposalStatus.APPROVED, + "decided_at": saved.decided_at, + "decided_by": saved.decided_by, + "decision_reason": saved.decision_reason, + }, + ) + result = await svc.execute_rollout(approved_proposal) assert result.outcome == RolloutOutcome.SUCCESS async def test_disabled_altitude_blocks_proposals(self) -> None: From 8bcd1b5bfc6bb4daa730b3ccea29ba186c82cfd1 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:38:12 +0200 Subject: [PATCH 3/5] test: support local LLM providers in real-LLM e2e test Make REAL_LLM_API_KEY optional so the test can target local providers (Ollama, LM Studio, vLLM) reached via REAL_LLM_BASE_URL. At least one of the two must be set; both unset still skips cleanly with a documented reason. --- tests/e2e/test_single_agent_e2e.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py index 1442ff578f..188dc34094 100644 --- a/tests/e2e/test_single_agent_e2e.py +++ b/tests/e2e/test_single_agent_e2e.py @@ -383,9 +383,13 @@ class TestRealLLMIntegration: """Optional smoke test with a real LLM provider. Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI. - Each method also requires REAL_LLM_MODEL, REAL_LLM_PROVIDER, and - REAL_LLM_API_KEY so the test can construct a LiteLLM-backed - provider without leaning on app-startup config wiring. + Each method also requires REAL_LLM_MODEL and REAL_LLM_PROVIDER so + the test can construct a LiteLLM-backed provider without leaning + on app-startup config wiring. Authentication is configured via at + least one of REAL_LLM_API_KEY (hosted providers) or + REAL_LLM_BASE_URL (local providers like Ollama / LM Studio / + vLLM); when both are set the API key wins as before, when only the + base URL is set the call goes unauthenticated. """ async def test_real_provider_text_completion(self) -> None: @@ -402,15 +406,18 @@ async def test_real_provider_text_completion(self) -> None: "Set REAL_LLM_PROVIDER to a LiteLLM routing key " "(e.g. 'example-provider') to run this test" ) - api_key = os.environ.get("REAL_LLM_API_KEY") - if not api_key: + api_key = os.environ.get("REAL_LLM_API_KEY") or None + base_url = os.environ.get("REAL_LLM_BASE_URL") or None + if api_key is None and base_url is None: pytest.skip( - "Set REAL_LLM_API_KEY to the provider's API key to run this test" + "Set REAL_LLM_API_KEY (hosted) or REAL_LLM_BASE_URL " + "(local provider) to run this test" ) provider_config = ProviderConfig( litellm_provider=provider_name, api_key=api_key, + base_url=base_url, models=(ProviderModelConfig(id=provider_model),), ) provider = LiteLLMDriver(provider_name, provider_config) From ca6ab9a616028625dfe38b4bf21732f2fd140572 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 21:47:45 +0200 Subject: [PATCH 4/5] test: clarify auth-config docstring + comment in real-LLM test --- tests/e2e/test_single_agent_e2e.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py index 188dc34094..3b65f8fa52 100644 --- a/tests/e2e/test_single_agent_e2e.py +++ b/tests/e2e/test_single_agent_e2e.py @@ -388,8 +388,8 @@ class TestRealLLMIntegration: on app-startup config wiring. Authentication is configured via at least one of REAL_LLM_API_KEY (hosted providers) or REAL_LLM_BASE_URL (local providers like Ollama / LM Studio / - vLLM); when both are set the API key wins as before, when only the - base URL is set the call goes unauthenticated. + vLLM); when both are set, both are forwarded to LiteLLM (api_key + for auth, base_url as the request endpoint). """ async def test_real_provider_text_completion(self) -> None: @@ -406,6 +406,9 @@ async def test_real_provider_text_completion(self) -> None: "Set REAL_LLM_PROVIDER to a LiteLLM routing key " "(e.g. 'example-provider') to run this test" ) + # Normalise empty-string env vars to None so ProviderConfig's + # NotBlankStr fields accept the value; an exported-but-empty + # var is treated as "unset" rather than rejected at construct. api_key = os.environ.get("REAL_LLM_API_KEY") or None base_url = os.environ.get("REAL_LLM_BASE_URL") or None if api_key is None and base_url is None: From e64d1dfb7c52244a40b8193d15c675bcb21697be Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sat, 9 May 2026 22:03:27 +0200 Subject: [PATCH 5/5] fix: babysit round 1, 3 findings (1 coderabbit, 2 gemini) - tests/integration/meta/test_meta_cycle.py: share a single FakeClock between ApprovalStore and SelfImprovementService and source decided_at from clock.now() so time-dependent paths in the store (expiration, decision timestamps) stay in lockstep with the service's simulated time (Gemini #3213807073, #3213807075). - tests/e2e/test_single_agent_e2e.py: drop vendor-branded names (LiteLLM, Ollama, LM Studio, vLLM) from docstrings and skip messages in TestRealLLMIntegration; use vendor-agnostic phrasing (configured provider driver, local / self-hosted providers, provider routing key) per CLAUDE.md vendor-agnostic naming (CodeRabbit #3213808360). The LiteLLMDriver class identifier is retained: it is the project-internal wrapper around the third-party litellm package, which CLAUDE.md exempts under third-party imports. --- tests/e2e/test_single_agent_e2e.py | 16 ++++++++-------- tests/integration/meta/test_meta_cycle.py | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py index 3b65f8fa52..72bcb22913 100644 --- a/tests/e2e/test_single_agent_e2e.py +++ b/tests/e2e/test_single_agent_e2e.py @@ -384,16 +384,16 @@ class TestRealLLMIntegration: Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI. Each method also requires REAL_LLM_MODEL and REAL_LLM_PROVIDER so - the test can construct a LiteLLM-backed provider without leaning - on app-startup config wiring. Authentication is configured via at - least one of REAL_LLM_API_KEY (hosted providers) or - REAL_LLM_BASE_URL (local providers like Ollama / LM Studio / - vLLM); when both are set, both are forwarded to LiteLLM (api_key - for auth, base_url as the request endpoint). + the test can construct a configured provider driver without + leaning on app-startup config wiring. Authentication is configured + via at least one of REAL_LLM_API_KEY (hosted providers) or + REAL_LLM_BASE_URL (local / self-hosted providers); when both are + set, both are forwarded to the provider driver (api_key for auth, + base_url as the request endpoint). """ async def test_real_provider_text_completion(self) -> None: - """Minimal text-only task end-to-end through ``LiteLLMDriver``.""" + """Minimal text-only task end-to-end through the configured provider driver.""" provider_model = os.environ.get("REAL_LLM_MODEL") if not provider_model: pytest.skip( @@ -403,7 +403,7 @@ async def test_real_provider_text_completion(self) -> None: provider_name = os.environ.get("REAL_LLM_PROVIDER") if not provider_name: pytest.skip( - "Set REAL_LLM_PROVIDER to a LiteLLM routing key " + "Set REAL_LLM_PROVIDER to a provider routing key " "(e.g. 'example-provider') to run this test" ) # Normalise empty-string env vars to None so ProviderConfig's diff --git a/tests/integration/meta/test_meta_cycle.py b/tests/integration/meta/test_meta_cycle.py index e428d04bbb..82f618706f 100644 --- a/tests/integration/meta/test_meta_cycle.py +++ b/tests/integration/meta/test_meta_cycle.py @@ -4,7 +4,6 @@ guards -> approval -> rollout -> regression detection. """ -from datetime import UTC, datetime from uuid import NAMESPACE_URL, uuid5 import pytest @@ -128,13 +127,14 @@ async def test_proposal_rollout_succeeds(self) -> None: async def snapshot_builder() -> OrgSignalSnapshot: return _snap(quality=7.5, success=0.85) - approval_store = ApprovalStore() + clock = FakeClock() + approval_store = ApprovalStore(clock=clock) svc = SelfImprovementService( config=SelfImprovementConfig( enabled=True, config_tuning_enabled=True, ), - clock=FakeClock(), + clock=clock, snapshot_builder=snapshot_builder, approval_store=approval_store, ) @@ -165,7 +165,7 @@ async def snapshot_builder() -> OrgSignalSnapshot: # first-writer-wins path the API and MCP approval handlers # take, so a regression in the store's concurrency model also # surfaces here. - decided_at = datetime.now(UTC) + decided_at = clock.now() decided = item.model_copy( update={ "status": ApprovalStatus.APPROVED,