diff --git a/tests/e2e/test_single_agent_e2e.py b/tests/e2e/test_single_agent_e2e.py index 381efae878..72bcb22913 100644 --- a/tests/e2e/test_single_agent_e2e.py +++ b/tests/e2e/test_single_agent_e2e.py @@ -14,10 +14,12 @@ from pathlib import Path from synthorg.budget.tracker import CostTracker -from synthorg.core.agent import ToolPermissions +from synthorg.config.provider_schema import ProviderConfig, ProviderModelConfig +from synthorg.core.agent import ModelConfig, ToolPermissions from synthorg.core.enums import TaskStatus, ToolAccessLevel from synthorg.engine.agent_engine import AgentEngine from synthorg.engine.loop_protocol import TerminationReason +from synthorg.providers.drivers.litellm_driver import LiteLLMDriver from synthorg.providers.enums import MessageRole from synthorg.providers.models import ToolCall from synthorg.tools.file_system.write_file import WriteFileTool @@ -381,22 +383,79 @@ class TestRealLLMIntegration: """Optional smoke test with a real LLM provider. Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI. - Currently a placeholder -- all methods skip until a real provider - is configured via environment variables. + Each method also requires REAL_LLM_MODEL and REAL_LLM_PROVIDER so + the test can construct a configured provider driver without + leaning on app-startup config wiring. Authentication is configured + via at least one of REAL_LLM_API_KEY (hosted providers) or + REAL_LLM_BASE_URL (local / self-hosted providers); when both are + set, both are forwarded to the provider driver (api_key for auth, + base_url as the request endpoint). """ async def test_real_provider_text_completion(self) -> None: - """Minimal text-only task with a real provider. - - Placeholder for real provider integration; additional - configuration scaffolding is required before this can be enabled. - """ + """Minimal text-only task end-to-end through the configured provider driver.""" provider_model = os.environ.get("REAL_LLM_MODEL") if not provider_model: pytest.skip( "Set REAL_LLM_MODEL to a valid model ID " "(e.g. 'example-large-001') to run this test" ) - pytest.skip( - f"Real LLM provider integration not yet wired -- model={provider_model}" + provider_name = os.environ.get("REAL_LLM_PROVIDER") + if not provider_name: + pytest.skip( + "Set REAL_LLM_PROVIDER to a provider routing key " + "(e.g. 'example-provider') to run this test" + ) + # Normalise empty-string env vars to None so ProviderConfig's + # NotBlankStr fields accept the value; an exported-but-empty + # var is treated as "unset" rather than rejected at construct. + api_key = os.environ.get("REAL_LLM_API_KEY") or None + base_url = os.environ.get("REAL_LLM_BASE_URL") or None + if api_key is None and base_url is None: + pytest.skip( + "Set REAL_LLM_API_KEY (hosted) or REAL_LLM_BASE_URL " + "(local provider) to run this test" + ) + + provider_config = ProviderConfig( + litellm_provider=provider_name, + api_key=api_key, + base_url=base_url, + models=(ProviderModelConfig(id=provider_model),), ) + provider = LiteLLMDriver(provider_name, provider_config) + + cost_tracker = CostTracker() + identity = make_e2e_identity().model_copy( + update={ + "model": ModelConfig( + provider=provider_name, + model_id=provider_model, + ), + }, + ) + task = make_e2e_task( + identity=identity, + title="Real LLM smoke test", + description="Reply with the single word 'ack'.", + ) + + engine = AgentEngine( + provider=provider, + cost_tracker=cost_tracker, + ) + result = await engine.run( + identity=identity, + task=task, + max_turns=2, + ) + + # Real provider produced a successful single-turn completion. + assert result.is_success is True + assert result.termination_reason == TerminationReason.COMPLETED + assert result.completion_summary + # ``>= 0`` (not ``> 0``) so a local zero-cost preset still passes. + assert result.total_cost >= 0 + assert await cost_tracker.get_record_count() == result.total_turns + assert result.task_id == task.id + assert result.duration_seconds > 0 diff --git a/tests/integration/meta/test_meta_cycle.py b/tests/integration/meta/test_meta_cycle.py index 1add3a1d80..82f618706f 100644 --- a/tests/integration/meta/test_meta_cycle.py +++ b/tests/integration/meta/test_meta_cycle.py @@ -4,9 +4,13 @@ guards -> approval -> rollout -> regression detection. """ +from uuid import NAMESPACE_URL, uuid5 + import pytest from synthorg.api.approval_store import ApprovalStore +from synthorg.core.enums import ApprovalStatus +from synthorg.core.types import NotBlankStr from synthorg.meta.config import SelfImprovementConfig from synthorg.meta.models import ( OrgBudgetSummary, @@ -107,19 +111,32 @@ async def test_budget_overrun_produces_critical_proposal( assert "budget_overrun" in sources async def test_proposal_rollout_succeeds(self) -> None: - """Scenario: approved proposal -> rollout -> success.""" + """Scenario: approved proposal -> rollout -> success. + + Routes the approval through the real ``ApprovalStore``: the + guard registers an ``ApprovalItem`` during ``run_cycle``, the + test approves it via ``save_if_pending`` (mirroring the API / + MCP approval handlers), and the proposal handed to + ``execute_rollout`` carries the decision metadata that came + back from the store. A regression in + ``ApprovalGateGuard.evaluate`` (e.g. a different deterministic + approval id, or failure to register) makes this test fail at + the ``item is not None`` assert. + """ async def snapshot_builder() -> OrgSignalSnapshot: return _snap(quality=7.5, success=0.85) + clock = FakeClock() + approval_store = ApprovalStore(clock=clock) svc = SelfImprovementService( config=SelfImprovementConfig( enabled=True, config_tuning_enabled=True, ), - clock=FakeClock(), + clock=clock, snapshot_builder=snapshot_builder, - approval_store=ApprovalStore(), + approval_store=approval_store, ) proposals = await svc.run_cycle(_snap(quality=4.0)) assert len(proposals) >= 1 @@ -129,17 +146,53 @@ async def snapshot_builder() -> OrgSignalSnapshot: if p.source_rule == "quality_declining" and p.altitude == ProposalAltitude.CONFIG_TUNING ) - # Simulate approval via model_copy because the ApprovalStore - # approval flow is not yet integrated in the test harness. - approved = proposal.model_copy( + + # The guard derives the approval id deterministically from + # the proposal id; mirror that derivation so a regression in + # the guard surfaces as a missing approval item rather than a + # silently bypassed flow. + approval_id = NotBlankStr( + str(uuid5(NAMESPACE_URL, f"proposal:{proposal.id}")), + ) + item = await approval_store.get(approval_id) + assert item is not None, ( + "ApprovalGateGuard did not register an approval item for " + "the proposal during run_cycle" + ) + assert item.status == ApprovalStatus.PENDING + + # Approve via the real store: ``save_if_pending`` is the same + # first-writer-wins path the API and MCP approval handlers + # take, so a regression in the store's concurrency model also + # surfaces here. + decided_at = clock.now() + decided = item.model_copy( update={ - "status": ProposalStatus.APPROVED, - "decided_at": proposal.proposed_at, + "status": ApprovalStatus.APPROVED, + "decided_at": decided_at, "decided_by": "test-approver", "decision_reason": "Integration test approval", }, ) - result = await svc.execute_rollout(approved) + saved = await approval_store.save_if_pending(decided) + assert saved is not None, "Approval store rejected the pending decision" + assert saved.status == ApprovalStatus.APPROVED + + # Hand ``execute_rollout`` a proposal whose APPROVED state + # mirrors the store-resident decision. The proposal-side + # ``model_copy`` is mechanical -- there is no + # ``ApprovalItem -> ImprovementProposal`` adapter -- but the + # decision metadata flows from the real ``ApprovalStore`` + # round-trip, not from a free-standing test mutation. + approved_proposal = proposal.model_copy( + update={ + "status": ProposalStatus.APPROVED, + "decided_at": saved.decided_at, + "decided_by": saved.decided_by, + "decision_reason": saved.decision_reason, + }, + ) + result = await svc.execute_rollout(approved_proposal) assert result.outcome == RolloutOutcome.SUCCESS async def test_disabled_altitude_blocks_proposals(self) -> None: