Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 69 additions & 10 deletions tests/e2e/test_single_agent_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
from pathlib import Path

from synthorg.budget.tracker import CostTracker
from synthorg.core.agent import ToolPermissions
from synthorg.config.provider_schema import ProviderConfig, ProviderModelConfig
from synthorg.core.agent import ModelConfig, ToolPermissions
from synthorg.core.enums import TaskStatus, ToolAccessLevel
from synthorg.engine.agent_engine import AgentEngine
from synthorg.engine.loop_protocol import TerminationReason
from synthorg.providers.drivers.litellm_driver import LiteLLMDriver
from synthorg.providers.enums import MessageRole
from synthorg.providers.models import ToolCall
from synthorg.tools.file_system.write_file import WriteFileTool
Expand Down Expand Up @@ -381,22 +383,79 @@ class TestRealLLMIntegration:
"""Optional smoke test with a real LLM provider.

Skipped unless REAL_LLM_TEST=1 is set; not expected to run in CI.
Currently a placeholder -- all methods skip until a real provider
is configured via environment variables.
Each method also requires REAL_LLM_MODEL and REAL_LLM_PROVIDER so
the test can construct a configured provider driver without
leaning on app-startup config wiring. Authentication is configured
via at least one of REAL_LLM_API_KEY (hosted providers) or
REAL_LLM_BASE_URL (local / self-hosted providers); when both are
set, both are forwarded to the provider driver (api_key for auth,
base_url as the request endpoint).
"""

async def test_real_provider_text_completion(self) -> None:
"""Minimal text-only task with a real provider.

Placeholder for real provider integration; additional
configuration scaffolding is required before this can be enabled.
"""
"""Minimal text-only task end-to-end through the configured provider driver."""
provider_model = os.environ.get("REAL_LLM_MODEL")
if not provider_model:
pytest.skip(
"Set REAL_LLM_MODEL to a valid model ID "
"(e.g. 'example-large-001') to run this test"
)
pytest.skip(
f"Real LLM provider integration not yet wired -- model={provider_model}"
provider_name = os.environ.get("REAL_LLM_PROVIDER")
if not provider_name:
pytest.skip(
"Set REAL_LLM_PROVIDER to a provider routing key "
"(e.g. 'example-provider') to run this test"
)
# Normalise empty-string env vars to None so ProviderConfig's
# NotBlankStr fields accept the value; an exported-but-empty
# var is treated as "unset" rather than rejected at construction time.
api_key = os.environ.get("REAL_LLM_API_KEY") or None
base_url = os.environ.get("REAL_LLM_BASE_URL") or None
if api_key is None and base_url is None:
pytest.skip(
"Set REAL_LLM_API_KEY (hosted) or REAL_LLM_BASE_URL "
"(local provider) to run this test"
)

provider_config = ProviderConfig(
litellm_provider=provider_name,
api_key=api_key,
base_url=base_url,
models=(ProviderModelConfig(id=provider_model),),
)
provider = LiteLLMDriver(provider_name, provider_config)

cost_tracker = CostTracker()
identity = make_e2e_identity().model_copy(
update={
"model": ModelConfig(
provider=provider_name,
model_id=provider_model,
),
},
)
task = make_e2e_task(
identity=identity,
title="Real LLM smoke test",
description="Reply with the single word 'ack'.",
)

engine = AgentEngine(
provider=provider,
cost_tracker=cost_tracker,
)
result = await engine.run(
identity=identity,
task=task,
max_turns=2,
)

# Real provider produced a successful completion within the max_turns=2 budget (ideally in a single turn).
assert result.is_success is True
assert result.termination_reason == TerminationReason.COMPLETED
assert result.completion_summary
# ``>= 0`` (not ``> 0``) so a local zero-cost preset still passes.
assert result.total_cost >= 0
assert await cost_tracker.get_record_count() == result.total_turns
assert result.task_id == task.id
assert result.duration_seconds > 0
71 changes: 62 additions & 9 deletions tests/integration/meta/test_meta_cycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@
guards -> approval -> rollout -> regression detection.
"""

from uuid import NAMESPACE_URL, uuid5

import pytest

from synthorg.api.approval_store import ApprovalStore
from synthorg.core.enums import ApprovalStatus
from synthorg.core.types import NotBlankStr
from synthorg.meta.config import SelfImprovementConfig
from synthorg.meta.models import (
OrgBudgetSummary,
Expand Down Expand Up @@ -107,19 +111,32 @@ async def test_budget_overrun_produces_critical_proposal(
assert "budget_overrun" in sources

async def test_proposal_rollout_succeeds(self) -> None:
"""Scenario: approved proposal -> rollout -> success."""
"""Scenario: approved proposal -> rollout -> success.

Routes the approval through the real ``ApprovalStore``: the
guard registers an ``ApprovalItem`` during ``run_cycle``, the
test approves it via ``save_if_pending`` (mirroring the API /
MCP approval handlers), and the proposal handed to
``execute_rollout`` carries the decision metadata that came
back from the store. A regression in
``ApprovalGateGuard.evaluate`` (e.g. a different deterministic
approval id, or failure to register) makes this test fail at
the ``item is not None`` assert.
"""

async def snapshot_builder() -> OrgSignalSnapshot:
return _snap(quality=7.5, success=0.85)

clock = FakeClock()
approval_store = ApprovalStore(clock=clock)
svc = SelfImprovementService(
config=SelfImprovementConfig(
enabled=True,
config_tuning_enabled=True,
),
clock=FakeClock(),
clock=clock,
snapshot_builder=snapshot_builder,
approval_store=ApprovalStore(),
approval_store=approval_store,
)
proposals = await svc.run_cycle(_snap(quality=4.0))
assert len(proposals) >= 1
Expand All @@ -129,17 +146,53 @@ async def snapshot_builder() -> OrgSignalSnapshot:
if p.source_rule == "quality_declining"
and p.altitude == ProposalAltitude.CONFIG_TUNING
)
# Simulate approval via model_copy because the ApprovalStore
# approval flow is not yet integrated in the test harness.
approved = proposal.model_copy(

# The guard derives the approval id deterministically from
# the proposal id; mirror that derivation so a regression in
# the guard surfaces as a missing approval item rather than a
# silently bypassed flow.
approval_id = NotBlankStr(
str(uuid5(NAMESPACE_URL, f"proposal:{proposal.id}")),
)
item = await approval_store.get(approval_id)
assert item is not None, (
"ApprovalGateGuard did not register an approval item for "
"the proposal during run_cycle"
)
assert item.status == ApprovalStatus.PENDING

# Approve via the real store: ``save_if_pending`` is the same
# first-writer-wins path the API and MCP approval handlers
# take, so a regression in the store's concurrency model also
# surfaces here.
decided_at = clock.now()
decided = item.model_copy(
update={
"status": ProposalStatus.APPROVED,
"decided_at": proposal.proposed_at,
"status": ApprovalStatus.APPROVED,
"decided_at": decided_at,
"decided_by": "test-approver",
"decision_reason": "Integration test approval",
},
)
result = await svc.execute_rollout(approved)
saved = await approval_store.save_if_pending(decided)
assert saved is not None, "Approval store rejected the pending decision"
assert saved.status == ApprovalStatus.APPROVED

# Hand ``execute_rollout`` a proposal whose APPROVED state
# mirrors the store-resident decision. The proposal-side
# ``model_copy`` is mechanical -- there is no
# ``ApprovalItem -> ImprovementProposal`` adapter -- but the
# decision metadata flows from the real ``ApprovalStore``
# round-trip, not from a free-standing test mutation.
approved_proposal = proposal.model_copy(
update={
"status": ProposalStatus.APPROVED,
"decided_at": saved.decided_at,
"decided_by": saved.decided_by,
"decision_reason": saved.decision_reason,
},
)
result = await svc.execute_rollout(approved_proposal)
assert result.outcome == RolloutOutcome.SUCCESS

async def test_disabled_altitude_blocks_proposals(self) -> None:
Expand Down
Loading