From 137ff3f24d0d770a12432d23ec13a49d429e8a68 Mon Sep 17 00:00:00 2001
From: Aurelio <19254254+Aureliolo@users.noreply.github.com>
Date: Sat, 9 May 2026 21:05:53 +0200
Subject: [PATCH 1/5] test: wire real LiteLLM provider in single-agent e2e test

---
 tests/e2e/test_single_agent_e2e.py | 69 +++++++++++++++++++++++++-----
 1 file changed, 59 insertions(+), 10 deletions(-)

diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py
index 381efae878..1442ff578f 100644
--- a/tests/e2e/test_single_agent_e2e.py
+++ b/tests/e2e/test_single_agent_e2e.py
@@ -14,10 +14,12 @@
     from pathlib import Path
 
 from synthorg.budget.tracker import CostTracker
-from synthorg.core.agent import ToolPermissions
+from synthorg.config.provider_schema import ProviderConfig, ProviderModelConfig
+from synthorg.core.agent import ModelConfig, ToolPermissions
 from synthorg.core.enums import TaskStatus, ToolAccessLevel
 from synthorg.engine.agent_engine import AgentEngine
 from synthorg.engine.loop_protocol import TerminationReason
+from synthorg.providers.drivers.litellm_driver import LiteLLMDriver
 from synthorg.providers.enums import MessageRole
 from synthorg.providers.models import ToolCall
 from synthorg.tools.file_system.write_file import WriteFileTool
@@ -381,22 +383,69 @@ class TestRealLLMIntegration:
     """Optional smoke test with a real LLM provider.
 
     Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI.
-    Currently a placeholder -- all methods skip until a real provider
-    is configured via environment variables.
+    Each method also requires REAL_LLM_MODEL, REAL_LLM_PROVIDER, and
+    REAL_LLM_API_KEY so the test can construct a LiteLLM-backed
+    provider without leaning on app-startup config wiring.
     """
 
     async def test_real_provider_text_completion(self) -> None:
-        """Minimal text-only task with a real provider.
-
-        Placeholder for real provider integration; additional
-        configuration scaffolding is required before this can be enabled.
-        """
+        """Minimal text-only task end-to-end through ``LiteLLMDriver``."""
         provider_model = os.environ.get("REAL_LLM_MODEL")
         if not provider_model:
             pytest.skip(
                 "Set REAL_LLM_MODEL to a valid model ID "
                 "(e.g. 'example-large-001') to run this test"
             )
-        pytest.skip(
-            f"Real LLM provider integration not yet wired -- model={provider_model}"
+        provider_name = os.environ.get("REAL_LLM_PROVIDER")
+        if not provider_name:
+            pytest.skip(
+                "Set REAL_LLM_PROVIDER to a LiteLLM routing key "
+                "(e.g. 'example-provider') to run this test"
+            )
+        api_key = os.environ.get("REAL_LLM_API_KEY")
+        if not api_key:
+            pytest.skip(
+                "Set REAL_LLM_API_KEY to the provider's API key to run this test"
+            )
+
+        provider_config = ProviderConfig(
+            litellm_provider=provider_name,
+            api_key=api_key,
+            models=(ProviderModelConfig(id=provider_model),),
         )
+        provider = LiteLLMDriver(provider_name, provider_config)
+
+        cost_tracker = CostTracker()
+        identity = make_e2e_identity().model_copy(
+            update={
+                "model": ModelConfig(
+                    provider=provider_name,
+                    model_id=provider_model,
+                ),
+            },
+        )
+        task = make_e2e_task(
+            identity=identity,
+            title="Real LLM smoke test",
+            description="Reply with the single word 'ack'.",
+        )
+
+        engine = AgentEngine(
+            provider=provider,
+            cost_tracker=cost_tracker,
+        )
+        result = await engine.run(
+            identity=identity,
+            task=task,
+            max_turns=2,
+        )
+
+        # Real provider produced a successful completion within max_turns.
+        assert result.is_success is True
+        assert result.termination_reason == TerminationReason.COMPLETED
+        assert result.completion_summary
+        # ``>= 0`` (not ``> 0``) so a local zero-cost preset still passes.
+        assert result.total_cost >= 0
+        assert await cost_tracker.get_record_count() == result.total_turns
+        assert result.task_id == task.id
+        assert result.duration_seconds > 0

From 5694937738d786528d29f2933c8eba7dff45da53 Mon Sep 17 00:00:00 2001
From: Aurelio <19254254+Aureliolo@users.noreply.github.com>
Date: Sat, 9 May 2026 21:08:13 +0200
Subject: [PATCH 2/5] test: route meta-cycle proposal approval through real
 ApprovalStore

---
 tests/integration/meta/test_meta_cycle.py | 69 ++++++++++++++++++++---
 1 file changed, 61 insertions(+), 8 deletions(-)

diff --git a/tests/integration/meta/test_meta_cycle.py b/tests/integration/meta/test_meta_cycle.py
index 1add3a1d80..e428d04bbb 100644
--- a/tests/integration/meta/test_meta_cycle.py
+++ b/tests/integration/meta/test_meta_cycle.py
@@ -4,9 +4,14 @@
 guards -> approval -> rollout -> regression detection.
 """
 
+from datetime import UTC, datetime
+from uuid import NAMESPACE_URL, uuid5
+
 import pytest
 
 from synthorg.api.approval_store import ApprovalStore
+from synthorg.core.enums import ApprovalStatus
+from synthorg.core.types import NotBlankStr
 from synthorg.meta.config import SelfImprovementConfig
 from synthorg.meta.models import (
     OrgBudgetSummary,
@@ -107,11 +112,23 @@ async def test_budget_overrun_produces_critical_proposal(
         assert "budget_overrun" in sources
 
     async def test_proposal_rollout_succeeds(self) -> None:
-        """Scenario: approved proposal -> rollout -> success."""
+        """Scenario: approved proposal -> rollout -> success.
+
+        Routes the approval through the real ``ApprovalStore``: the
+        guard registers an ``ApprovalItem`` during ``run_cycle``, the
+        test approves it via ``save_if_pending`` (mirroring the API /
+        MCP approval handlers), and the proposal handed to
+        ``execute_rollout`` carries the decision metadata that came
+        back from the store. A regression in
+        ``ApprovalGateGuard.evaluate`` (e.g. a different deterministic
+        approval id, or failure to register) makes this test fail at
+        the ``item is not None`` assert.
+        """
 
         async def snapshot_builder() -> OrgSignalSnapshot:
             return _snap(quality=7.5, success=0.85)
 
+        approval_store = ApprovalStore()
         svc = SelfImprovementService(
             config=SelfImprovementConfig(
                 enabled=True,
@@ -119,7 +136,7 @@ async def snapshot_builder() -> OrgSignalSnapshot:
             ),
             clock=FakeClock(),
             snapshot_builder=snapshot_builder,
-            approval_store=ApprovalStore(),
+            approval_store=approval_store,
         )
         proposals = await svc.run_cycle(_snap(quality=4.0))
         assert len(proposals) >= 1
@@ -129,17 +146,53 @@ async def snapshot_builder() -> OrgSignalSnapshot:
             if p.source_rule == "quality_declining"
             and p.altitude == ProposalAltitude.CONFIG_TUNING
         )
-        # Simulate approval via model_copy because the ApprovalStore
-        # approval flow is not yet integrated in the test harness.
-        approved = proposal.model_copy(
+
+        # The guard derives the approval id deterministically from
+        # the proposal id; mirror that derivation so a regression in
+        # the guard surfaces as a missing approval item rather than a
+        # silently bypassed flow.
+        approval_id = NotBlankStr(
+            str(uuid5(NAMESPACE_URL, f"proposal:{proposal.id}")),
+        )
+        item = await approval_store.get(approval_id)
+        assert item is not None, (
+            "ApprovalGateGuard did not register an approval item for "
+            "the proposal during run_cycle"
+        )
+        assert item.status == ApprovalStatus.PENDING
+
+        # Approve via the real store: ``save_if_pending`` is the same
+        # first-writer-wins path the API and MCP approval handlers
+        # take, so a regression in the store's concurrency model also
+        # surfaces here.
+        decided_at = datetime.now(UTC)
+        decided = item.model_copy(
             update={
-                "status": ProposalStatus.APPROVED,
-                "decided_at": proposal.proposed_at,
+                "status": ApprovalStatus.APPROVED,
+                "decided_at": decided_at,
                 "decided_by": "test-approver",
                 "decision_reason": "Integration test approval",
             },
         )
-        result = await svc.execute_rollout(approved)
+        saved = await approval_store.save_if_pending(decided)
+        assert saved is not None, "Approval store rejected the pending decision"
+        assert saved.status == ApprovalStatus.APPROVED
+
+        # Hand ``execute_rollout`` a proposal whose APPROVED state
+        # mirrors the store-resident decision. The proposal-side
+        # ``model_copy`` is mechanical -- there is no
+        # ``ApprovalItem -> ImprovementProposal`` adapter -- but the
+        # decision metadata flows from the real ``ApprovalStore``
+        # round-trip, not from a free-standing test mutation.
+        approved_proposal = proposal.model_copy(
+            update={
+                "status": ProposalStatus.APPROVED,
+                "decided_at": saved.decided_at,
+                "decided_by": saved.decided_by,
+                "decision_reason": saved.decision_reason,
+            },
+        )
+        result = await svc.execute_rollout(approved_proposal)
         assert result.outcome == RolloutOutcome.SUCCESS
 
     async def test_disabled_altitude_blocks_proposals(self) -> None:

From 8bcd1b5bfc6bb4daa730b3ccea29ba186c82cfd1 Mon Sep 17 00:00:00 2001
From: Aurelio <19254254+Aureliolo@users.noreply.github.com>
Date: Sat, 9 May 2026 21:38:12 +0200
Subject: [PATCH 3/5] test: support local LLM providers in real-LLM e2e test

Make REAL_LLM_API_KEY optional so the test can target local
providers (Ollama, LM Studio, vLLM) reached via REAL_LLM_BASE_URL.
At least one of the two must be set; both unset still skips
cleanly with a documented reason.
---
 tests/e2e/test_single_agent_e2e.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py
index 1442ff578f..188dc34094 100644
--- a/tests/e2e/test_single_agent_e2e.py
+++ b/tests/e2e/test_single_agent_e2e.py
@@ -383,9 +383,13 @@ class TestRealLLMIntegration:
     """Optional smoke test with a real LLM provider.
 
     Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI.
-    Each method also requires REAL_LLM_MODEL, REAL_LLM_PROVIDER, and
-    REAL_LLM_API_KEY so the test can construct a LiteLLM-backed
-    provider without leaning on app-startup config wiring.
+    Each method also requires REAL_LLM_MODEL and REAL_LLM_PROVIDER so
+    the test can construct a LiteLLM-backed provider without leaning
+    on app-startup config wiring. Authentication is configured via at
+    least one of REAL_LLM_API_KEY (hosted providers) or
+    REAL_LLM_BASE_URL (local providers like Ollama / LM Studio /
+    vLLM); when both are set the API key wins as before, when only the
+    base URL is set the call goes unauthenticated.
     """
 
     async def test_real_provider_text_completion(self) -> None:
@@ -402,15 +406,18 @@ async def test_real_provider_text_completion(self) -> None:
                 "Set REAL_LLM_PROVIDER to a LiteLLM routing key "
                 "(e.g. 'example-provider') to run this test"
             )
-        api_key = os.environ.get("REAL_LLM_API_KEY")
-        if not api_key:
+        api_key = os.environ.get("REAL_LLM_API_KEY") or None
+        base_url = os.environ.get("REAL_LLM_BASE_URL") or None
+        if api_key is None and base_url is None:
             pytest.skip(
-                "Set REAL_LLM_API_KEY to the provider's API key to run this test"
+                "Set REAL_LLM_API_KEY (hosted) or REAL_LLM_BASE_URL "
+                "(local provider) to run this test"
             )
 
         provider_config = ProviderConfig(
             litellm_provider=provider_name,
             api_key=api_key,
+            base_url=base_url,
             models=(ProviderModelConfig(id=provider_model),),
         )
         provider = LiteLLMDriver(provider_name, provider_config)

From ca6ab9a616028625dfe38b4bf21732f2fd140572 Mon Sep 17 00:00:00 2001
From: Aurelio <19254254+Aureliolo@users.noreply.github.com>
Date: Sat, 9 May 2026 21:47:45 +0200
Subject: [PATCH 4/5] test: clarify auth-config docstring + comment in real-LLM
 test

---
 tests/e2e/test_single_agent_e2e.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py
index 188dc34094..3b65f8fa52 100644
--- a/tests/e2e/test_single_agent_e2e.py
+++ b/tests/e2e/test_single_agent_e2e.py
@@ -388,8 +388,8 @@ class TestRealLLMIntegration:
     on app-startup config wiring. Authentication is configured via at
     least one of REAL_LLM_API_KEY (hosted providers) or
     REAL_LLM_BASE_URL (local providers like Ollama / LM Studio /
-    vLLM); when both are set the API key wins as before, when only the
-    base URL is set the call goes unauthenticated.
+    vLLM); when both are set, both are forwarded to LiteLLM (api_key
+    for auth, base_url as the request endpoint).
     """
 
     async def test_real_provider_text_completion(self) -> None:
@@ -406,6 +406,9 @@ async def test_real_provider_text_completion(self) -> None:
                 "Set REAL_LLM_PROVIDER to a LiteLLM routing key "
                 "(e.g. 'example-provider') to run this test"
             )
+        # Normalise empty-string env vars to None so ProviderConfig's
+        # NotBlankStr fields accept the value; an exported-but-empty
+        # var is treated as "unset" rather than rejected at construct.
         api_key = os.environ.get("REAL_LLM_API_KEY") or None
         base_url = os.environ.get("REAL_LLM_BASE_URL") or None
         if api_key is None and base_url is None:

From e64d1dfb7c52244a40b8193d15c675bcb21697be Mon Sep 17 00:00:00 2001
From: Aurelio <19254254+Aureliolo@users.noreply.github.com>
Date: Sat, 9 May 2026 22:03:27 +0200
Subject: [PATCH 5/5] fix: babysit round 1, 3 findings (1 coderabbit, 2 gemini)

- tests/integration/meta/test_meta_cycle.py: share a single FakeClock
  between ApprovalStore and SelfImprovementService and source
  decided_at from clock.now() so time-dependent paths in the store
  (expiration, decision timestamps) stay in lockstep with the
  service's simulated time (Gemini #3213807073, #3213807075).
- tests/e2e/test_single_agent_e2e.py: drop vendor-branded names
  (LiteLLM, Ollama, LM Studio, vLLM) from docstrings and skip
  messages in TestRealLLMIntegration; use vendor-agnostic phrasing
  (configured provider driver, local / self-hosted providers,
  provider routing key) per CLAUDE.md vendor-agnostic naming
  (CodeRabbit #3213808360). The LiteLLMDriver class identifier is
  retained: it is the project-internal wrapper around the
  third-party litellm package, which CLAUDE.md exempts under
  third-party imports.
---
 tests/e2e/test_single_agent_e2e.py        | 16 ++++++++--------
 tests/integration/meta/test_meta_cycle.py |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py
index 3b65f8fa52..72bcb22913 100644
--- a/tests/e2e/test_single_agent_e2e.py
+++ b/tests/e2e/test_single_agent_e2e.py
@@ -384,16 +384,16 @@ class TestRealLLMIntegration:
 
     Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI.
     Each method also requires REAL_LLM_MODEL and REAL_LLM_PROVIDER so
-    the test can construct a LiteLLM-backed provider without leaning
-    on app-startup config wiring. Authentication is configured via at
-    least one of REAL_LLM_API_KEY (hosted providers) or
-    REAL_LLM_BASE_URL (local providers like Ollama / LM Studio /
-    vLLM); when both are set, both are forwarded to LiteLLM (api_key
-    for auth, base_url as the request endpoint).
+    the test can construct a configured provider driver without
+    leaning on app-startup config wiring. The connection is configured
+    via at least one of REAL_LLM_API_KEY (hosted providers) or
+    REAL_LLM_BASE_URL (local / self-hosted providers); when both are
+    set, both are forwarded to the provider driver (api_key for auth,
+    base_url as the request endpoint).
     """
 
     async def test_real_provider_text_completion(self) -> None:
-        """Minimal text-only task end-to-end through ``LiteLLMDriver``."""
+        """Minimal text-only task end-to-end through the configured provider driver."""
         provider_model = os.environ.get("REAL_LLM_MODEL")
         if not provider_model:
             pytest.skip(
@@ -403,7 +403,7 @@ async def test_real_provider_text_completion(self) -> None:
         provider_name = os.environ.get("REAL_LLM_PROVIDER")
         if not provider_name:
             pytest.skip(
-                "Set REAL_LLM_PROVIDER to a LiteLLM routing key "
+                "Set REAL_LLM_PROVIDER to a provider routing key "
                 "(e.g. 'example-provider') to run this test"
             )
         # Normalise empty-string env vars to None so ProviderConfig's
diff --git a/tests/integration/meta/test_meta_cycle.py b/tests/integration/meta/test_meta_cycle.py
index e428d04bbb..82f618706f 100644
--- a/tests/integration/meta/test_meta_cycle.py
+++ b/tests/integration/meta/test_meta_cycle.py
@@ -4,7 +4,6 @@
 guards -> approval -> rollout -> regression detection.
 """
 
-from datetime import UTC, datetime
 from uuid import NAMESPACE_URL, uuid5
 
 import pytest
@@ -128,13 +127,14 @@ async def test_proposal_rollout_succeeds(self) -> None:
         async def snapshot_builder() -> OrgSignalSnapshot:
             return _snap(quality=7.5, success=0.85)
 
-        approval_store = ApprovalStore()
+        clock = FakeClock()
+        approval_store = ApprovalStore(clock=clock)
         svc = SelfImprovementService(
             config=SelfImprovementConfig(
                 enabled=True,
                 config_tuning_enabled=True,
             ),
-            clock=FakeClock(),
+            clock=clock,
             snapshot_builder=snapshot_builder,
             approval_store=approval_store,
         )
@@ -165,7 +165,7 @@ async def snapshot_builder() -> OrgSignalSnapshot:
         # first-writer-wins path the API and MCP approval handlers
         # take, so a regression in the store's concurrency model also
         # surfaces here.
-        decided_at = datetime.now(UTC)
+        decided_at = clock.now()
         decided = item.model_copy(
             update={
                 "status": ApprovalStatus.APPROVED,