Aureliolo · Aureliolo · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
@@ -14,10 +14,12 @@
     from pathlib import Path
 
 from synthorg.budget.tracker import CostTracker
-from synthorg.core.agent import ToolPermissions
+from synthorg.config.provider_schema import ProviderConfig, ProviderModelConfig
+from synthorg.core.agent import ModelConfig, ToolPermissions
 from synthorg.core.enums import TaskStatus, ToolAccessLevel
 from synthorg.engine.agent_engine import AgentEngine
 from synthorg.engine.loop_protocol import TerminationReason
+from synthorg.providers.drivers.litellm_driver import LiteLLMDriver
 from synthorg.providers.enums import MessageRole
 from synthorg.providers.models import ToolCall
 from synthorg.tools.file_system.write_file import WriteFileTool
@@ -381,22 +383,79 @@ class TestRealLLMIntegration:
     """Optional smoke test with a real LLM provider.
 
     Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI.
-    Currently a placeholder -- all methods skip until a real provider
-    is configured via environment variables.
+    Each method also requires REAL_LLM_MODEL and REAL_LLM_PROVIDER so
+    the test can construct a configured provider driver without
+    leaning on app-startup config wiring. Provider access is configured
+    via at least one of REAL_LLM_API_KEY (hosted providers) or
+    REAL_LLM_BASE_URL (local / self-hosted providers); when both are
+    set, both are forwarded to the provider driver (api_key for auth,
+    base_url as the request endpoint).
     """
 
     async def test_real_provider_text_completion(self) -> None:
-        """Minimal text-only task with a real provider.
-
-        Placeholder for real provider integration; additional
-        configuration scaffolding is required before this can be enabled.
-        """
+        """Minimal text-only task end-to-end through the configured provider driver."""
         provider_model = os.environ.get("REAL_LLM_MODEL")
         if not provider_model:
             pytest.skip(
                 "Set REAL_LLM_MODEL to a valid model ID "
                 "(e.g. 'example-large-001') to run this test"
             )
-        pytest.skip(
-            f"Real LLM provider integration not yet wired -- model={provider_model}"
+        provider_name = os.environ.get("REAL_LLM_PROVIDER")
+        if not provider_name:
+            pytest.skip(
+                "Set REAL_LLM_PROVIDER to a provider routing key "
+                "(e.g. 'example-provider') to run this test"
+            )
+        # Normalise empty-string env vars to None so ProviderConfig's
+        # NotBlankStr fields accept the value; an exported-but-empty
+        # var is treated as "unset" rather than rejected at construct.
+        api_key = os.environ.get("REAL_LLM_API_KEY") or None
+        base_url = os.environ.get("REAL_LLM_BASE_URL") or None
+        if api_key is None and base_url is None:
+            pytest.skip(
+                "Set REAL_LLM_API_KEY (hosted) or REAL_LLM_BASE_URL "
+                "(local provider) to run this test"
+            )
+
+        provider_config = ProviderConfig(
+            litellm_provider=provider_name,
+            api_key=api_key,
+            base_url=base_url,
+            models=(ProviderModelConfig(id=provider_model),),
         )
+        provider = LiteLLMDriver(provider_name, provider_config)
+
+        cost_tracker = CostTracker()
+        identity = make_e2e_identity().model_copy(
+            update={
+                "model": ModelConfig(
+                    provider=provider_name,
+                    model_id=provider_model,
+                ),
+            },
+        )
+        task = make_e2e_task(
+            identity=identity,
+            title="Real LLM smoke test",
+            description="Reply with the single word 'ack'.",
+        )
+
+        engine = AgentEngine(
+            provider=provider,
+            cost_tracker=cost_tracker,
+        )
+        result = await engine.run(
+            identity=identity,
+            task=task,
+            max_turns=2,
+        )
+
+        # Real provider produced a successful completion within max_turns.
+        assert result.is_success is True
+        assert result.termination_reason == TerminationReason.COMPLETED
+        assert result.completion_summary
+        # ``>= 0`` (not ``> 0``) so a local zero-cost preset still passes.
+        assert result.total_cost >= 0
+        assert await cost_tracker.get_record_count() == result.total_turns
+        assert result.task_id == task.id
+        assert result.duration_seconds > 0
@@ -4,9 +4,13 @@
 guards -> approval -> rollout -> regression detection.
 """
 
+from uuid import NAMESPACE_URL, uuid5
+
 import pytest
 
 from synthorg.api.approval_store import ApprovalStore
+from synthorg.core.enums import ApprovalStatus
+from synthorg.core.types import NotBlankStr
 from synthorg.meta.config import SelfImprovementConfig
 from synthorg.meta.models import (
     OrgBudgetSummary,
@@ -107,19 +111,32 @@ async def test_budget_overrun_produces_critical_proposal(
         assert "budget_overrun" in sources
 
     async def test_proposal_rollout_succeeds(self) -> None:
-        """Scenario: approved proposal -> rollout -> success."""
+        """Scenario: approved proposal -> rollout -> success.
+
+        Routes the approval through the real ``ApprovalStore``: the
+        guard registers an ``ApprovalItem`` during ``run_cycle``, the
+        test approves it via ``save_if_pending`` (mirroring the API /
+        MCP approval handlers), and the proposal handed to
+        ``execute_rollout`` carries the decision metadata that came
+        back from the store. A regression in
+        ``ApprovalGateGuard.evaluate`` (e.g. a different deterministic
+        approval id, or failure to register) makes this test fail at
+        the ``item is not None`` assert.
+        """
 
         async def snapshot_builder() -> OrgSignalSnapshot:
             return _snap(quality=7.5, success=0.85)
 
+        clock = FakeClock()
+        approval_store = ApprovalStore(clock=clock)
         svc = SelfImprovementService(
             config=SelfImprovementConfig(
                 enabled=True,
                 config_tuning_enabled=True,
             ),
-            clock=FakeClock(),
+            clock=clock,
             snapshot_builder=snapshot_builder,
-            approval_store=ApprovalStore(),
+            approval_store=approval_store,
         )
         proposals = await svc.run_cycle(_snap(quality=4.0))
         assert len(proposals) >= 1
@@ -129,17 +146,53 @@ async def snapshot_builder() -> OrgSignalSnapshot:
             if p.source_rule == "quality_declining"
             and p.altitude == ProposalAltitude.CONFIG_TUNING
         )
-        # Simulate approval via model_copy because the ApprovalStore
-        # approval flow is not yet integrated in the test harness.
-        approved = proposal.model_copy(
+
+        # The guard derives the approval id deterministically from
+        # the proposal id; mirror that derivation so a regression in
+        # the guard surfaces as a missing approval item rather than a
+        # silently bypassed flow.
+        approval_id = NotBlankStr(
+            str(uuid5(NAMESPACE_URL, f"proposal:{proposal.id}")),
+        )
+        item = await approval_store.get(approval_id)
+        assert item is not None, (
+            "ApprovalGateGuard did not register an approval item for "
+            "the proposal during run_cycle"
+        )
+        assert item.status == ApprovalStatus.PENDING
+
+        # Approve via the real store: ``save_if_pending`` is the same
+        # first-writer-wins path the API and MCP approval handlers
+        # take, so a regression in that pending -> approved write path
+        # also surfaces here.
+        decided_at = clock.now()
+        decided = item.model_copy(
             update={
-                "status": ProposalStatus.APPROVED,
-                "decided_at": proposal.proposed_at,
+                "status": ApprovalStatus.APPROVED,
+                "decided_at": decided_at,
                 "decided_by": "test-approver",
                 "decision_reason": "Integration test approval",
             },
         )
-        result = await svc.execute_rollout(approved)
+        saved = await approval_store.save_if_pending(decided)
+        assert saved is not None, "Approval store rejected the pending decision"
+        assert saved.status == ApprovalStatus.APPROVED
+
+        # Hand ``execute_rollout`` a proposal whose APPROVED state
+        # mirrors the store-resident decision. The proposal-side
+        # ``model_copy`` is mechanical -- there is no
+        # ``ApprovalItem -> ImprovementProposal`` adapter -- but the
+        # decision metadata flows from the real ``ApprovalStore``
+        # round-trip, not from a free-standing test mutation.
+        approved_proposal = proposal.model_copy(
+            update={
+                "status": ProposalStatus.APPROVED,
+                "decided_at": saved.decided_at,
+                "decided_by": saved.decided_by,
+                "decision_reason": saved.decision_reason,
+            },
+        )
+        result = await svc.execute_rollout(approved_proposal)
         assert result.outcome == RolloutOutcome.SUCCESS
 
     async def test_disabled_altitude_blocks_proposals(self) -> None: