diff --git a/DESIGN_SPEC.md b/DESIGN_SPEC.md index 76f3a04ffa..840eb1d55e 100644 --- a/DESIGN_SPEC.md +++ b/DESIGN_SPEC.md @@ -915,7 +915,7 @@ hybrid: `AgentEngine` (in `engine/agent_engine.py`) is the top-level entry point for running an agent on a task. It composes the execution loop with prompt construction, context management, tool invocation, and cost tracking into a single `run()` call. -**`async run(identity, task, completion_config?, max_turns?, memory_messages?) -> AgentRunResult`** +**`async run(identity, task, completion_config?, max_turns?, memory_messages?, timeout_seconds?) -> AgentRunResult`** Pipeline steps: @@ -925,15 +925,16 @@ Pipeline steps: 4. **Seed conversation** — injects system prompt, optional memory messages, and formatted task instruction as initial messages. 5. **Transition task** — `ASSIGNED` → `IN_PROGRESS` (pass-through if already `IN_PROGRESS`). 6. **Prepare tools and budget** — creates `ToolInvoker` from registry and `BudgetChecker` from task budget limit. -7. **Delegate to loop** — calls `ExecutionLoop.execute()` with context, provider, tool invoker, budget checker, and completion config. +7. **Delegate to loop** — calls `ExecutionLoop.execute()` with context, provider, tool invoker, budget checker, and completion config. If `timeout_seconds` is set, wraps the call in `asyncio.wait_for`; on expiry the run returns with `TerminationReason.ERROR` but cost recording and post-execution processing still occur. 8. **Record costs** — records accumulated `TokenUsage` to `CostTracker` (if available). Cost recording failures are logged but do not affect the result. -9. **Return result** — wraps `ExecutionResult` in `AgentRunResult` with engine-level metadata. +9. **Apply post-execution transitions** — on `COMPLETED` termination: IN_PROGRESS → IN_REVIEW → COMPLETED (two-hop auto-complete in M3; reviewers deferred to M4+). All other termination reasons leave the task in its current state. Transition failures are logged but do not discard the successful execution result. +10. **Return result** — wraps `ExecutionResult` in `AgentRunResult` with engine-level metadata. Error handling: `MemoryError` and `RecursionError` propagate unconditionally. All other exceptions are caught and wrapped in an `AgentRunResult` with `TerminationReason.ERROR`. Constructor accepts: `provider` (required), `execution_loop` (defaults to `ReactLoop`), `tool_registry`, `cost_tracker`. The `run()` method also accepts `memory_messages` — optional working memory to inject between the system prompt and task instruction (memory retrieval is M5; the engine provides the injection hook). -Logs structured events under the `execution.engine.*` namespace (10 constants in `events/execution.py`): creation, start, prompt built, completion, errors, invalid input, task transitions, and cost recording outcomes. +Logs structured events under the `execution.engine.*` namespace (12 constants in `events/execution.py`): creation, start, prompt built, completion, errors, invalid input, task transitions, cost recording outcomes, task metrics, and timeout. **`AgentRunResult`** — frozen Pydantic model wrapping `ExecutionResult` with engine metadata: @@ -941,7 +942,7 @@ Logs structured events under the `execution.engine.*` namespace (10 constants in - `system_prompt` — the `SystemPrompt` used for this run - `duration_seconds` — wall-clock run time - `agent_id`, `task_id` — identifiers -- Computed fields: `termination_reason`, `total_turns`, `total_cost_usd`, `is_success` +- Computed fields: `termination_reason`, `total_turns`, `total_cost_usd`, `is_success`, `completion_summary` ### 6.6 Agent Crash Recovery @@ -1463,12 +1464,15 @@ Every LLM provider call is tracked with comprehensive metadata for financial rep Every completion call produces a `CompletionResponse` with `TokenUsage` (token counts and cost). The engine layer creates a `CostRecord` (with agent/task context) and records it into `CostTracker` — the provider itself does not have agent/task context. In M3, the engine additionally logs **proxy overhead metrics** at task completion: -- `turns_per_task` — number of LLM turns to complete the task (from `AgentContext.turn_count`) +- `turns_per_task` — number of LLM turns to complete the task (from `AgentRunResult.total_turns`) - `tokens_per_task` — total tokens consumed (from `AgentContext.accumulated_cost.total_tokens`) -- `cost_per_task` — total USD cost (from `TaskExecution.accumulated_cost.cost_usd`) +- `cost_per_task` — total USD cost (from `AgentContext.accumulated_cost.cost_usd` via `AgentRunResult.total_cost_usd`) +- `duration_seconds` — wall-clock execution time in seconds (from `AgentRunResult.duration_seconds`) These are natural overhead indicators — a task consuming 15 turns and 50k tokens for a one-line fix signals a problem. +These metrics are captured in `TaskCompletionMetrics` (in `engine/metrics.py`), a frozen Pydantic model with a `from_run_result()` factory method. The engine logs these metrics at task completion via the `EXECUTION_ENGINE_TASK_METRICS` event. + #### M4: Call Categorization + Orchestration Ratio When multi-agent coordination exists, each `CostRecord` is tagged with a **call category**: @@ -2153,6 +2157,7 @@ ai-company/ │ │ ├── task_execution.py # TaskExecution + StatusTransition │ │ ├── context.py # AgentContext + AgentContextSnapshot │ │ ├── loop_protocol.py # ExecutionLoop protocol + result models +│ │ ├── metrics.py # TaskCompletionMetrics proxy overhead model │ │ ├── react_loop.py # ReAct loop implementation │ │ ├── run_result.py # AgentRunResult outcome model │ │ ├── agent_engine.py # Agent execution engine diff --git a/src/ai_company/engine/__init__.py b/src/ai_company/engine/__init__.py index 7e4c6a775a..08e39c58a8 100644 --- a/src/ai_company/engine/__init__.py +++ b/src/ai_company/engine/__init__.py @@ -26,6 +26,7 @@ TerminationReason, TurnRecord, ) +from ai_company.engine.metrics import TaskCompletionMetrics from ai_company.engine.prompt import ( DefaultTokenEstimator, PromptTokenEstimator, @@ -58,6 +59,7 @@ "ReactLoop", "StatusTransition", "SystemPrompt", + "TaskCompletionMetrics", "TaskExecution", "TerminationReason", "TurnRecord", diff --git a/src/ai_company/engine/agent_engine.py b/src/ai_company/engine/agent_engine.py index 5700359af1..bc8be0ef80 100644 --- a/src/ai_company/engine/agent_engine.py +++ b/src/ai_company/engine/agent_engine.py @@ -4,6 +4,8 @@ tool invocation, and budget tracking into a single ``run()`` entry point. """ +import asyncio +import contextlib import time from datetime import UTC, datetime from typing import TYPE_CHECKING @@ -17,6 +19,7 @@ TerminationReason, TurnRecord, ) +from ai_company.engine.metrics import TaskCompletionMetrics from ai_company.engine.prompt import SystemPrompt, build_system_prompt from ai_company.engine.react_loop import ReactLoop from ai_company.engine.run_result import AgentRunResult @@ -31,7 +34,9 @@ EXECUTION_ENGINE_INVALID_INPUT, EXECUTION_ENGINE_PROMPT_BUILT, EXECUTION_ENGINE_START, + EXECUTION_ENGINE_TASK_METRICS, EXECUTION_ENGINE_TASK_TRANSITION, + EXECUTION_ENGINE_TIMEOUT, ) from ai_company.providers.enums import MessageRole from ai_company.providers.models import ChatMessage @@ -90,7 +95,7 @@ def __init__( has_cost_tracker=self._cost_tracker is not None, ) - async def run( + async def run( # noqa: PLR0913 self, *, identity: AgentIdentity, @@ -98,6 +103,7 @@ async def run( completion_config: CompletionConfig | None = None, max_turns: int = DEFAULT_MAX_TURNS, memory_messages: tuple[ChatMessage, ...] = (), + timeout_seconds: float | None = None, ) -> AgentRunResult: """Execute an agent on a task. @@ -108,6 +114,10 @@ async def run( max_turns: Maximum LLM turns allowed (must be >= 1). memory_messages: Optional working memory messages to inject between the system prompt and task instruction. + timeout_seconds: Optional wall-clock timeout in seconds. + When exceeded, the execution loop is cancelled and the + run returns with ``TerminationReason.ERROR``. Cost + recording and post-execution processing still occur. Returns: ``AgentRunResult`` with execution outcome and metadata. @@ -118,23 +128,20 @@ async def run( Raises: ExecutionStateError: If pre-flight validation fails (agent not ACTIVE or task not ASSIGNED/IN_PROGRESS). - ValueError: If ``max_turns`` is less than 1. + ValueError: If ``max_turns`` is less than 1, or if + ``timeout_seconds`` is not positive. MemoryError: Re-raised unconditionally (non-recoverable). RecursionError: Re-raised unconditionally (non-recoverable). """ agent_id = str(identity.id) task_id = task.id - if max_turns < 1: - msg = f"max_turns must be >= 1, got {max_turns}" - logger.warning( - EXECUTION_ENGINE_INVALID_INPUT, - agent_id=agent_id, - task_id=task_id, - reason=msg, - ) - raise ValueError(msg) - + self._validate_run_inputs( + agent_id=agent_id, + task_id=task_id, + max_turns=max_turns, + timeout_seconds=timeout_seconds, + ) self._validate_agent(identity, agent_id) self._validate_task(task, agent_id, task_id) @@ -167,6 +174,7 @@ async def run( ctx=ctx, system_prompt=system_prompt, start=start, + timeout_seconds=timeout_seconds, ) except MemoryError, RecursionError: logger.error( @@ -200,8 +208,14 @@ async def _execute( # noqa: PLR0913 ctx: AgentContext, system_prompt: SystemPrompt, start: float, + timeout_seconds: float | None = None, ) -> AgentRunResult: - """Run the execution loop, record costs, and build result.""" + """Run execution loop, record costs, apply transitions, and build result. + + Orchestrates the full execution pipeline: loop execution (with + optional wall-clock timeout via ``asyncio.wait``), per-turn cost + recording, post-execution task transitions, and metrics logging. + """ budget_checker = _make_budget_checker(task) tool_invoker = self._make_tool_invoker() @@ -212,17 +226,25 @@ async def _execute( # noqa: PLR0913 estimated_tokens=system_prompt.estimated_tokens, ) - execution_result = await self._loop.execute( - context=ctx, - provider=self._provider, - tool_invoker=tool_invoker, - budget_checker=budget_checker, + execution_result = await self._run_loop_with_timeout( + ctx=ctx, + agent_id=agent_id, + task_id=task_id, completion_config=completion_config, + budget_checker=budget_checker, + tool_invoker=tool_invoker, + start=start, + timeout_seconds=timeout_seconds, ) - duration = time.monotonic() - start await self._record_costs(execution_result, identity, agent_id, task_id) + execution_result = self._apply_post_execution_transitions( + execution_result, + agent_id, + task_id, + ) + duration = time.monotonic() - start result = AgentRunResult( execution_result=execution_result, system_prompt=system_prompt, @@ -230,9 +252,73 @@ async def _execute( # noqa: PLR0913 agent_id=agent_id, task_id=task_id, ) - self._log_completion(result, execution_result, agent_id, task_id, duration) + try: + self._log_completion(result, agent_id, task_id, duration) + except Exception: + logger.exception( + EXECUTION_ENGINE_ERROR, + agent_id=agent_id, + task_id=task_id, + error="Completion logging failed", + ) return result + async def _run_loop_with_timeout( # noqa: PLR0913 + self, + *, + ctx: AgentContext, + agent_id: str, + task_id: str, + completion_config: CompletionConfig | None, + budget_checker: BudgetChecker | None, + tool_invoker: ToolInvoker | None, + start: float, + timeout_seconds: float | None, + ) -> ExecutionResult: + """Execute the loop, using ``asyncio.wait`` for timeout control. + + Uses ``asyncio.wait`` instead of ``asyncio.wait_for`` so that + ``TimeoutError`` raised inside the loop propagates normally + and is not conflated with the engine's wall-clock deadline. + """ + coro = self._loop.execute( + context=ctx, + provider=self._provider, + tool_invoker=tool_invoker, + budget_checker=budget_checker, + completion_config=completion_config, + ) + if timeout_seconds is None: + return await coro + + loop_task = asyncio.create_task(coro) + _done, pending = await asyncio.wait( + {loop_task}, + timeout=timeout_seconds, + ) + if not pending: + return loop_task.result() + + duration = time.monotonic() - start + error_msg = ( + f"Wall-clock timeout after {duration:.1f}s (limit: {timeout_seconds}s)" + ) + logger.warning( + EXECUTION_ENGINE_TIMEOUT, + agent_id=agent_id, + task_id=task_id, + duration_seconds=duration, + timeout_seconds=timeout_seconds, + ) + loop_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await loop_task + return ExecutionResult( + context=ctx, + termination_reason=TerminationReason.ERROR, + error_message=error_msg, + ) + # ── Setup ──────────────────────────────────────────────────── def _prepare_context( # noqa: PLR0913 @@ -275,6 +361,34 @@ def _prepare_context( # noqa: PLR0913 # ── Validation ─────────────────────────────────────────────── + def _validate_run_inputs( + self, + *, + agent_id: str, + task_id: str, + max_turns: int, + timeout_seconds: float | None, + ) -> None: + """Validate scalar ``run()`` arguments before execution.""" + if max_turns < 1: + msg = f"max_turns must be >= 1, got {max_turns}" + logger.warning( + EXECUTION_ENGINE_INVALID_INPUT, + agent_id=agent_id, + task_id=task_id, + reason=msg, + ) + raise ValueError(msg) + if timeout_seconds is not None and timeout_seconds <= 0: + msg = f"timeout_seconds must be > 0, got {timeout_seconds}" + logger.warning( + EXECUTION_ENGINE_INVALID_INPUT, + agent_id=agent_id, + task_id=task_id, + reason=msg, + ) + raise ValueError(msg) + def _validate_agent(self, identity: AgentIdentity, agent_id: str) -> None: """Raise if agent is not ACTIVE.""" if identity.status != AgentStatus.ACTIVE: @@ -295,7 +409,7 @@ def _validate_task( agent_id: str, task_id: str, ) -> None: - """Raise if task is not in an executable status.""" + """Raise if task is not executable or not assigned to this agent.""" if task.status not in _EXECUTABLE_STATUSES: msg = ( f"Task {task_id!r} has status {task.status.value!r}; " @@ -308,6 +422,18 @@ def _validate_task( reason=msg, ) raise ExecutionStateError(msg) + if task.assigned_to is not None and task.assigned_to != agent_id: + msg = ( + f"Task {task_id!r} is assigned to {task.assigned_to!r}, " + f"not to agent {agent_id!r}" + ) + logger.warning( + EXECUTION_ENGINE_INVALID_INPUT, + agent_id=agent_id, + task_id=task_id, + reason=msg, + ) + raise ExecutionStateError(msg) # ── Helpers ────────────────────────────────────────────────── @@ -341,6 +467,80 @@ def _transition_task_if_needed( ) return ctx + def _apply_post_execution_transitions( + self, + execution_result: ExecutionResult, + agent_id: str, + task_id: str, + ) -> ExecutionResult: + """Apply post-execution task transitions based on termination reason. + + Only ``TerminationReason.COMPLETED`` triggers transitions: + IN_PROGRESS → IN_REVIEW → COMPLETED (two-hop auto-complete). + All other reasons leave the task in its current state. + + Note: + The IN_REVIEW → COMPLETED auto-complete is M3 scaffolding + (no reviewers yet). Later milestones will gate COMPLETED + on reviewer approval. + + Transition failures are logged but do not discard the + successful execution result — a bookkeeping error must never + destroy the agent's work. + + Args: + execution_result: Result from the execution loop. + agent_id: Agent identifier for logging. + task_id: Task identifier for logging. + + Returns: + New ``ExecutionResult`` with updated context if transitions + were applied, or the original result unchanged. + """ + ctx = execution_result.context + if ctx.task_execution is None: + return execution_result + + if execution_result.termination_reason != TerminationReason.COMPLETED: + return execution_result + + try: + prev_status = ctx.task_execution.status + ctx = ctx.with_task_transition( + TaskStatus.IN_REVIEW, + reason="Agent completed execution", + ) + logger.info( + EXECUTION_ENGINE_TASK_TRANSITION, + agent_id=agent_id, + task_id=task_id, + from_status=prev_status.value, + to_status=TaskStatus.IN_REVIEW.value, + ) + # TODO(M4): Replace auto-complete with review gate + prev_status = ctx.task_execution.status # type: ignore[union-attr] + ctx = ctx.with_task_transition( + TaskStatus.COMPLETED, + reason="Auto-completed (no reviewers in M3)", + ) + logger.info( + EXECUTION_ENGINE_TASK_TRANSITION, + agent_id=agent_id, + task_id=task_id, + from_status=prev_status.value, + to_status=TaskStatus.COMPLETED.value, + ) + except (ValueError, ExecutionStateError) as exc: + logger.exception( + EXECUTION_ENGINE_ERROR, + agent_id=agent_id, + task_id=task_id, + error=f"Post-execution transition failed: {exc}", + ) + return execution_result + + return execution_result.model_copy(update={"context": ctx}) + def _make_tool_invoker(self) -> ToolInvoker | None: """Create a ToolInvoker from the registry, or None.""" if self._tool_registry is None: @@ -350,23 +550,35 @@ def _make_tool_invoker(self) -> ToolInvoker | None: def _log_completion( self, result: AgentRunResult, - execution_result: ExecutionResult, agent_id: str, task_id: str, duration: float, ) -> None: - """Log structured completion event for the finished run.""" + """Log structured completion event and proxy overhead metrics.""" + accumulated = result.execution_result.context.accumulated_cost logger.info( EXECUTION_ENGINE_COMPLETE, agent_id=agent_id, task_id=task_id, termination_reason=result.termination_reason.value, total_turns=result.total_turns, - total_tokens=execution_result.context.accumulated_cost.total_tokens, + total_tokens=accumulated.total_tokens, duration_seconds=duration, cost_usd=result.total_cost_usd, ) + metrics = TaskCompletionMetrics.from_run_result(result) + logger.info( + EXECUTION_ENGINE_TASK_METRICS, + agent_id=agent_id, + task_id=task_id, + termination_reason=result.termination_reason.value, + turns_per_task=metrics.turns_per_task, + tokens_per_task=metrics.tokens_per_task, + cost_per_task=metrics.cost_per_task, + duration_seconds=metrics.duration_seconds, + ) + async def _record_costs( self, result: ExecutionResult, diff --git a/src/ai_company/engine/metrics.py b/src/ai_company/engine/metrics.py new file mode 100644 index 0000000000..d8c9f9dfb4 --- /dev/null +++ b/src/ai_company/engine/metrics.py @@ -0,0 +1,75 @@ +"""Task completion metrics model. + +Proxy overhead metrics for an agent run, computed from +``AgentRunResult`` data per DESIGN_SPEC §10.5 (M3). +""" + +from typing import TYPE_CHECKING + +from pydantic import BaseModel, ConfigDict, Field + +from ai_company.core.types import NotBlankStr # noqa: TC001 + +if TYPE_CHECKING: + from ai_company.engine.run_result import AgentRunResult + + +class TaskCompletionMetrics(BaseModel): + """Proxy overhead metrics for an agent run (DESIGN_SPEC §10.5). + + Computed from ``AgentRunResult`` after execution to surface + orchestration overhead indicators (turns, tokens, cost, duration). + + Attributes: + task_id: Task identifier (``None`` for future taskless runs). + agent_id: Agent identifier (string form of UUID). + turns_per_task: Number of LLM turns to complete the task. + tokens_per_task: Total tokens consumed (input + output). + cost_per_task: Total USD cost for the task. + duration_seconds: Wall-clock execution time in seconds. + """ + + model_config = ConfigDict(frozen=True) + + task_id: NotBlankStr | None = Field( + default=None, + description="Task identifier", + ) + agent_id: NotBlankStr = Field(description="Agent identifier") + turns_per_task: int = Field( + ge=0, + description="Number of LLM turns to complete the task", + ) + tokens_per_task: int = Field( + ge=0, + description="Total tokens consumed (input + output)", + ) + cost_per_task: float = Field( + ge=0.0, + description="Total USD cost for the task", + ) + duration_seconds: float = Field( + ge=0.0, + description="Wall-clock execution time in seconds", + ) + + @classmethod + def from_run_result(cls, result: AgentRunResult) -> TaskCompletionMetrics: + """Build metrics from an agent run result. + + Args: + result: The ``AgentRunResult`` to extract metrics from. + + Returns: + New ``TaskCompletionMetrics`` with values extracted from + the result's execution context and metadata. + """ + accumulated = result.execution_result.context.accumulated_cost + return cls( + task_id=result.task_id, + agent_id=result.agent_id, + turns_per_task=result.total_turns, + tokens_per_task=accumulated.total_tokens, + cost_per_task=result.total_cost_usd, + duration_seconds=result.duration_seconds, + ) diff --git a/src/ai_company/engine/run_result.py b/src/ai_company/engine/run_result.py index 426bdb2d0f..595c3ca397 100644 --- a/src/ai_company/engine/run_result.py +++ b/src/ai_company/engine/run_result.py @@ -6,12 +6,14 @@ from pydantic import BaseModel, ConfigDict, Field, computed_field +from ai_company.core.artifact import Artifact # noqa: TC001 from ai_company.core.types import NotBlankStr # noqa: TC001 from ai_company.engine.loop_protocol import ( ExecutionResult, TerminationReason, ) from ai_company.engine.prompt import SystemPrompt # noqa: TC001 +from ai_company.providers.enums import MessageRole class AgentRunResult(BaseModel): @@ -46,6 +48,10 @@ class AgentRunResult(BaseModel): default=None, description="Task identifier, or None for future taskless runs", ) + produced_artifacts: tuple[Artifact, ...] = Field( + default=(), + description="Artifacts produced during execution", + ) # mypy does not yet model Pydantic's @computed_field + @property # combination correctly; the ignores are safe — Pydantic enforces @@ -82,3 +88,22 @@ def total_cost_usd(self) -> float: def is_success(self) -> bool: """True when termination reason is COMPLETED.""" return self.termination_reason == TerminationReason.COMPLETED + + @computed_field( # type: ignore[prop-decorator] + description="Last assistant message content as work summary", + ) + @property + def completion_summary(self) -> str | None: + """Extract the last assistant message content as a work summary. + + Walks the conversation in reverse to find the most recent + assistant message with non-empty text content. Tool-call-only + assistant messages (content is ``None`` or empty) are skipped. + + Returns: + The content string, or ``None`` if no qualifying message exists. + """ + for msg in reversed(self.execution_result.context.conversation): + if msg.role == MessageRole.ASSISTANT and msg.content: + return msg.content + return None diff --git a/src/ai_company/observability/events/execution.py b/src/ai_company/observability/events/execution.py index ab3977221d..30f50c220a 100644 --- a/src/ai_company/observability/events/execution.py +++ b/src/ai_company/observability/events/execution.py @@ -32,3 +32,5 @@ EXECUTION_ENGINE_COST_RECORDED: Final[str] = "execution.engine.cost_recorded" EXECUTION_ENGINE_COST_SKIPPED: Final[str] = "execution.engine.cost_skipped" EXECUTION_ENGINE_COST_FAILED: Final[str] = "execution.engine.cost_failed" +EXECUTION_ENGINE_TASK_METRICS: Final[str] = "execution.engine.task_metrics" +EXECUTION_ENGINE_TIMEOUT: Final[str] = "execution.engine.timeout" diff --git a/tests/integration/engine/test_agent_engine_integration.py b/tests/integration/engine/test_agent_engine_integration.py index 878a11403b..290a238761 100644 --- a/tests/integration/engine/test_agent_engine_integration.py +++ b/tests/integration/engine/test_agent_engine_integration.py @@ -202,7 +202,95 @@ async def test_full_tool_call_loop(self) -> None: assert tool_results[0].tool_result is not None assert tool_results[0].tool_result.content == "HELLO WORLD" - # Verify task transitioned to IN_PROGRESS + # Verify task auto-completed: ASSIGNED → IP → IR → COMPLETED te = result.execution_result.context.task_execution assert te is not None - assert te.status == TaskStatus.IN_PROGRESS + assert te.status == TaskStatus.COMPLETED + + +class TestAgentEngineFullLifecycle: + """Full task lifecycle: ASSIGNED → IN_PROGRESS → IN_REVIEW → COMPLETED.""" + + async def test_full_lifecycle_assigned_to_completed(self) -> None: + """Verify complete lifecycle with transitions, summary, and metrics.""" + identity = AgentIdentity( + id=uuid4(), + name="Lifecycle Agent", + role="Developer", + department="Engineering", + level=SeniorityLevel.MID, + hiring_date=date(2026, 1, 15), + personality=PersonalityConfig(traits=("analytical",)), + model=ModelConfig( + provider="test-provider", + model_id="test-model-001", + ), + ) + task = Task( + id="task-lifecycle", + title="Full lifecycle test", + description="Test the complete task lifecycle.", + type=TaskType.DEVELOPMENT, + priority=Priority.MEDIUM, + project="proj-001", + created_by="manager", + assigned_to=str(identity.id), + status=TaskStatus.ASSIGNED, + ) + + tool = UppercaseTool( + name="uppercase", + description="Converts text to uppercase.", + parameters_schema={ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Text to uppercase", + }, + }, + "required": ["text"], + }, + ) + registry = ToolRegistry([tool]) + provider = _ToolCallingProvider() + + engine = AgentEngine( + provider=provider, + tool_registry=registry, + ) + + result = await engine.run( + identity=identity, + task=task, + max_turns=5, + ) + + # Verify successful completion + assert result.is_success is True + assert result.termination_reason == TerminationReason.COMPLETED + + # Verify transition log: ASSIGNED→IP, IP→IR, IR→COMPLETED + te = result.execution_result.context.task_execution + assert te is not None + assert te.status == TaskStatus.COMPLETED + assert len(te.transition_log) == 3 + assert te.transition_log[0].to_status == TaskStatus.IN_PROGRESS + assert te.transition_log[1].to_status == TaskStatus.IN_REVIEW + assert te.transition_log[2].to_status == TaskStatus.COMPLETED + + # Verify completed_at is set + assert te.completed_at is not None + + # Verify completion_summary is non-empty + assert result.completion_summary is not None + assert len(result.completion_summary) > 0 + + # Verify TaskCompletionMetrics computable + from ai_company.engine.metrics import TaskCompletionMetrics + + metrics = TaskCompletionMetrics.from_run_result(result) + assert metrics.turns_per_task > 0 + assert metrics.tokens_per_task > 0 + assert metrics.cost_per_task > 0 + assert metrics.duration_seconds > 0 diff --git a/tests/unit/engine/conftest.py b/tests/unit/engine/conftest.py index 51a25f8e76..d7c0ccf918 100644 --- a/tests/unit/engine/conftest.py +++ b/tests/unit/engine/conftest.py @@ -92,7 +92,9 @@ def sample_role_with_description() -> Role: @pytest.fixture -def sample_task_with_criteria() -> Task: +def sample_task_with_criteria( + sample_agent_with_personality: AgentIdentity, +) -> Task: """Task with acceptance criteria and budget for prompt testing.""" return Task( id="task-prompt-001", @@ -109,7 +111,7 @@ def sample_task_with_criteria() -> Task: estimated_complexity=Complexity.MEDIUM, budget_limit=5.0, deadline="2026-04-01T00:00:00", - assigned_to="ada_lovelace", + assigned_to=str(sample_agent_with_personality.id), status=TaskStatus.ASSIGNED, ) diff --git a/tests/unit/engine/test_agent_engine.py b/tests/unit/engine/test_agent_engine.py index e587eb6293..121e6e0b34 100644 --- a/tests/unit/engine/test_agent_engine.py +++ b/tests/unit/engine/test_agent_engine.py @@ -115,10 +115,10 @@ async def test_assigned_transitions_to_in_progress( task=sample_task_with_criteria, ) - # The context inside the result should show IN_PROGRESS + # Successful run auto-completes: ASSIGNED → IP → IR → COMPLETED te = result.execution_result.context.task_execution assert te is not None - assert te.status == TaskStatus.IN_PROGRESS + assert te.status == TaskStatus.COMPLETED @pytest.mark.unit @@ -142,9 +142,10 @@ async def test_in_progress_accepted( ) assert result.is_success is True + # Successful run auto-completes: IP → IR → COMPLETED te = result.execution_result.context.task_execution assert te is not None - assert te.status == TaskStatus.IN_PROGRESS + assert te.status == TaskStatus.COMPLETED @pytest.mark.unit @@ -195,7 +196,7 @@ async def test_completed_task_raises( priority=Priority.MEDIUM, project="proj-001", created_by="manager", - assigned_to="someone", + assigned_to=str(sample_agent_with_personality.id), status=TaskStatus.COMPLETED, ) provider = mock_provider_factory([]) @@ -244,7 +245,7 @@ async def test_blocked_task_raises( type=TaskType.DEVELOPMENT, project="proj-001", created_by="manager", - assigned_to="someone", + assigned_to=str(sample_agent_with_personality.id), status=TaskStatus.BLOCKED, ) provider = mock_provider_factory([]) @@ -374,7 +375,7 @@ async def test_no_budget_limit_no_checker( type=TaskType.DEVELOPMENT, project="proj-001", created_by="manager", - assigned_to="someone", + assigned_to=str(sample_agent_with_personality.id), budget_limit=0.0, status=TaskStatus.ASSIGNED, ) @@ -451,7 +452,7 @@ async def test_zero_cost_not_recorded( type=TaskType.DEVELOPMENT, project="proj-001", created_by="manager", - assigned_to="someone", + assigned_to=str(sample_agent_with_personality.id), status=TaskStatus.ASSIGNED, ) response = _make_completion_response( @@ -481,7 +482,7 @@ async def test_free_provider_tokens_recorded( type=TaskType.DEVELOPMENT, project="proj-001", created_by="manager", - assigned_to="someone", + assigned_to=str(sample_agent_with_personality.id), status=TaskStatus.ASSIGNED, ) response = _make_completion_response( @@ -534,6 +535,10 @@ async def test_completion_config_forwarded( sample_agent_with_personality, task=sample_task_with_criteria, ) + ctx = ctx.with_task_transition( + TaskStatus.IN_PROGRESS, + reason="Engine starting execution", + ) mock_result = ExecutionResult( context=ctx, termination_reason=TerminationReason.COMPLETED, @@ -647,6 +652,12 @@ async def test_custom_loop_used( sample_agent_with_personality, task=sample_task_with_criteria, ) + # Mock must return context at IN_PROGRESS (as _prepare_context + # transitions ASSIGNED → IP before handing to the loop). + ctx = ctx.with_task_transition( + TaskStatus.IN_PROGRESS, + reason="Engine starting execution", + ) mock_result = ExecutionResult( context=ctx, termination_reason=TerminationReason.COMPLETED, diff --git a/tests/unit/engine/test_agent_engine_lifecycle.py b/tests/unit/engine/test_agent_engine_lifecycle.py new file mode 100644 index 0000000000..4651287c9e --- /dev/null +++ b/tests/unit/engine/test_agent_engine_lifecycle.py @@ -0,0 +1,452 @@ +"""Unit tests for AgentEngine post-execution transitions, timeout, and metrics.""" + +import asyncio +from typing import TYPE_CHECKING, Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from ai_company.core.agent import AgentIdentity # noqa: TC001 +from ai_company.core.enums import TaskStatus +from ai_company.core.task import Task # noqa: TC001 +from ai_company.engine.agent_engine import AgentEngine +from ai_company.engine.context import AgentContext +from ai_company.engine.loop_protocol import ( + ExecutionResult, + TerminationReason, +) + +if TYPE_CHECKING: + from .conftest import MockCompletionProvider + +from .conftest import make_completion_response as _make_completion_response + + +@pytest.mark.unit +class TestAgentEnginePostExecutionTransitions: + """Post-execution task transitions based on termination reason.""" + + async def test_completed_auto_completes_task( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """COMPLETED → two-hop: IP → IR → COMPLETED.""" + response = _make_completion_response() + provider = mock_provider_factory([response]) + engine = AgentEngine(provider=provider) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + te = result.execution_result.context.task_execution + assert te is not None + assert te.status == TaskStatus.COMPLETED + + async def test_completed_transition_log_has_three_entries( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """ASSIGNED→IP, IP→IR, IR→COMPLETED = 3 transitions.""" + response = _make_completion_response() + provider = mock_provider_factory([response]) + engine = AgentEngine(provider=provider) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + te = result.execution_result.context.task_execution + assert te is not None + assert len(te.transition_log) == 3 + assert te.transition_log[0].from_status == TaskStatus.ASSIGNED + assert te.transition_log[0].to_status == TaskStatus.IN_PROGRESS + assert te.transition_log[1].from_status == TaskStatus.IN_PROGRESS + assert te.transition_log[1].to_status == TaskStatus.IN_REVIEW + assert te.transition_log[2].from_status == TaskStatus.IN_REVIEW + assert te.transition_log[2].to_status == TaskStatus.COMPLETED + + async def test_completed_sets_completed_at( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + response = _make_completion_response() + provider = mock_provider_factory([response]) + engine = AgentEngine(provider=provider) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + te = result.execution_result.context.task_execution + assert te is not None + assert te.completed_at is not None + + async def test_max_turns_stays_in_progress( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + ctx = AgentContext.from_identity( + sample_agent_with_personality, + task=sample_task_with_criteria, + ) + # Simulate ASSIGNED→IP transition that _prepare_context does + ctx = ctx.with_task_transition( + TaskStatus.IN_PROGRESS, + reason="Engine starting execution", + ) + mock_result = ExecutionResult( + context=ctx, + termination_reason=TerminationReason.MAX_TURNS, + ) + mock_loop = MagicMock() + mock_loop.execute = AsyncMock(return_value=mock_result) + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider, execution_loop=mock_loop) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + te = result.execution_result.context.task_execution + assert te is not None + assert te.status == TaskStatus.IN_PROGRESS + + async def test_budget_exhausted_stays_in_progress( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + ctx = AgentContext.from_identity( + sample_agent_with_personality, + task=sample_task_with_criteria, + ) + ctx = ctx.with_task_transition( + TaskStatus.IN_PROGRESS, + reason="Engine starting execution", + ) + mock_result = ExecutionResult( + context=ctx, + termination_reason=TerminationReason.BUDGET_EXHAUSTED, + ) + mock_loop = MagicMock() + mock_loop.execute = AsyncMock(return_value=mock_result) + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider, execution_loop=mock_loop) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + te = result.execution_result.context.task_execution + assert te is not None + assert te.status == TaskStatus.IN_PROGRESS + + async def test_error_stays_in_progress( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + ctx = AgentContext.from_identity( + sample_agent_with_personality, + task=sample_task_with_criteria, + ) + ctx = ctx.with_task_transition( + TaskStatus.IN_PROGRESS, + reason="Engine starting execution", + ) + mock_result = ExecutionResult( + context=ctx, + termination_reason=TerminationReason.ERROR, + error_message="something failed", + ) + mock_loop = MagicMock() + mock_loop.execute = AsyncMock(return_value=mock_result) + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider, execution_loop=mock_loop) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + te = result.execution_result.context.task_execution + assert te is not None + assert te.status == TaskStatus.IN_PROGRESS + + async def test_no_task_execution_passes_through( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """No task_execution in context → transitions skipped.""" + ctx = AgentContext.from_identity(sample_agent_with_personality) + mock_result = ExecutionResult( + context=ctx, + termination_reason=TerminationReason.COMPLETED, + ) + mock_loop = MagicMock() + mock_loop.execute = AsyncMock(return_value=mock_result) + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider, execution_loop=mock_loop) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + assert result.execution_result.context.task_execution is None + + +@pytest.mark.unit +class TestAgentEngineTimeout: + """Wall-clock timeout support.""" + + async def test_timeout_produces_error_result( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """Slow provider triggers timeout → ERROR result.""" + + async def slow_execute(**kwargs: Any) -> ExecutionResult: + await asyncio.sleep(10) + msg = "Should not reach here" + raise AssertionError(msg) + + mock_loop = MagicMock() + mock_loop.execute = slow_execute + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider, execution_loop=mock_loop) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + timeout_seconds=0.1, + ) + + assert result.termination_reason == TerminationReason.ERROR + assert result.is_success is False + assert "timeout" in (result.execution_result.error_message or "").lower() + + async def test_no_timeout_by_default( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """Default run has no timeout.""" + response = _make_completion_response() + provider = mock_provider_factory([response]) + engine = AgentEngine(provider=provider) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + assert result.is_success is True + + async def test_zero_timeout_raises( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider) + + with pytest.raises(ValueError, match="timeout_seconds must be > 0"): + await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + timeout_seconds=0, + ) + + async def test_negative_timeout_raises( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider) + + with pytest.raises(ValueError, match="timeout_seconds must be > 0"): + await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + timeout_seconds=-1.0, + ) + + +@pytest.mark.unit +class TestAgentEngineCompletionMetrics: + """Proxy overhead metrics logged on completion.""" + + async def test_metrics_logged_on_completion( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """Successful run computes and logs TaskCompletionMetrics.""" + from ai_company.engine.metrics import TaskCompletionMetrics + + response = _make_completion_response(cost_usd=0.05) + provider = mock_provider_factory([response]) + engine = AgentEngine(provider=provider) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + # Metrics can be computed from the result + metrics = TaskCompletionMetrics.from_run_result(result) + assert metrics.turns_per_task == 1 + assert metrics.tokens_per_task > 0 + assert metrics.cost_per_task > 0 + assert metrics.duration_seconds > 0 + assert metrics.agent_id == str(sample_agent_with_personality.id) + assert metrics.task_id == sample_task_with_criteria.id + + +@pytest.mark.unit +class TestAgentEngineTimeoutEdgeCases: + """Edge cases for timeout behaviour.""" + + async def test_inner_timeout_propagates_without_engine_timeout( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """TimeoutError from inside the loop is treated as a fatal error.""" + + async def raises_timeout(**kwargs: Any) -> ExecutionResult: + msg = "inner timeout" + raise TimeoutError(msg) + + mock_loop = MagicMock() + mock_loop.execute = raises_timeout + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider, execution_loop=mock_loop) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + assert result.termination_reason == TerminationReason.ERROR + assert "TimeoutError" in (result.execution_result.error_message or "") + + async def test_timeout_records_no_costs( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """Timeout result has no turns, so no costs are recorded.""" + + async def slow_execute(**kwargs: Any) -> ExecutionResult: + await asyncio.sleep(10) + msg = "Should not reach here" + raise AssertionError(msg) + + mock_tracker = MagicMock() + mock_tracker.record = AsyncMock() + + mock_loop = MagicMock() + mock_loop.execute = slow_execute + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine( + provider=provider, + execution_loop=mock_loop, + cost_tracker=mock_tracker, + ) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + timeout_seconds=0.1, + ) + + assert result.termination_reason == TerminationReason.ERROR + mock_tracker.record.assert_not_called() + + +@pytest.mark.unit +class TestAgentEnginePostExecutionResilience: + """Post-execution transition failure resilience.""" + + async def test_transition_failure_preserves_result( + self, + sample_agent_with_personality: AgentIdentity, + sample_task_with_criteria: Task, + mock_provider_factory: type[MockCompletionProvider], + ) -> None: + """Transition failure preserves execution result unchanged.""" + ctx = AgentContext.from_identity( + sample_agent_with_personality, + task=sample_task_with_criteria, + ) + ctx = ctx.with_task_transition( + TaskStatus.IN_PROGRESS, + reason="Engine starting execution", + ) + te = ctx.task_execution + assert te is not None + bad_te = te.model_copy(update={"status": TaskStatus.CANCELLED}) + ctx_bad = ctx.model_copy(update={"task_execution": bad_te}) + + mock_result = ExecutionResult( + context=ctx_bad, + termination_reason=TerminationReason.COMPLETED, + ) + mock_loop = MagicMock() + mock_loop.execute = AsyncMock(return_value=mock_result) + mock_loop.get_loop_type = MagicMock(return_value="react") + + provider = mock_provider_factory([]) + engine = AgentEngine(provider=provider, execution_loop=mock_loop) + + result = await engine.run( + identity=sample_agent_with_personality, + task=sample_task_with_criteria, + ) + + te = result.execution_result.context.task_execution + assert te is not None + assert te.status == TaskStatus.CANCELLED diff --git a/tests/unit/engine/test_metrics.py b/tests/unit/engine/test_metrics.py new file mode 100644 index 0000000000..7257f50c2d --- /dev/null +++ b/tests/unit/engine/test_metrics.py @@ -0,0 +1,225 @@ +"""Unit tests for TaskCompletionMetrics model.""" + +import pytest +from pydantic import ValidationError + +from ai_company.engine.context import AgentContext # noqa: TC001 +from ai_company.engine.loop_protocol import ( + ExecutionResult, + TerminationReason, + TurnRecord, +) +from ai_company.engine.metrics import TaskCompletionMetrics +from ai_company.engine.prompt import SystemPrompt +from ai_company.engine.run_result import AgentRunResult +from ai_company.providers.enums import FinishReason +from ai_company.providers.models import TokenUsage + + +@pytest.mark.unit +class TestTaskCompletionMetricsConstruction: + """Basic construction and frozen enforcement.""" + + def test_valid_construction(self) -> None: + metrics = TaskCompletionMetrics( + task_id="task-001", + agent_id="agent-001", + turns_per_task=3, + tokens_per_task=1500, + cost_per_task=0.05, + duration_seconds=12.5, + ) + assert metrics.task_id == "task-001" + assert metrics.agent_id == "agent-001" + assert metrics.turns_per_task == 3 + assert metrics.tokens_per_task == 1500 + assert metrics.cost_per_task == 0.05 + assert metrics.duration_seconds == 12.5 + + def test_task_id_none(self) -> None: + metrics = TaskCompletionMetrics( + agent_id="agent-001", + turns_per_task=0, + tokens_per_task=0, + cost_per_task=0.0, + duration_seconds=0.0, + ) + assert metrics.task_id is None + + def test_frozen(self) -> None: + metrics = TaskCompletionMetrics( + agent_id="agent-001", + turns_per_task=1, + tokens_per_task=100, + cost_per_task=0.01, + duration_seconds=1.0, + ) + with pytest.raises(ValidationError): + metrics.turns_per_task = 5 # type: ignore[misc] + + def test_zero_values(self) -> None: + metrics = TaskCompletionMetrics( + agent_id="agent-001", + turns_per_task=0, + tokens_per_task=0, + cost_per_task=0.0, + duration_seconds=0.0, + ) + assert metrics.turns_per_task == 0 + assert metrics.tokens_per_task == 0 + + def test_negative_turns_rejected(self) -> None: + with pytest.raises(ValidationError, match="turns_per_task"): + TaskCompletionMetrics( + agent_id="agent-001", + turns_per_task=-1, + tokens_per_task=0, + cost_per_task=0.0, + duration_seconds=0.0, + ) + + def test_negative_tokens_rejected(self) -> None: + with pytest.raises(ValidationError, match="tokens_per_task"): + TaskCompletionMetrics( + agent_id="agent-001", + turns_per_task=0, + tokens_per_task=-1, + cost_per_task=0.0, + duration_seconds=0.0, + ) + + def test_blank_agent_id_rejected(self) -> None: + with pytest.raises(ValidationError, match="agent_id"): + TaskCompletionMetrics( + agent_id=" ", + turns_per_task=0, + tokens_per_task=0, + cost_per_task=0.0, + duration_seconds=0.0, + ) + + def test_negative_cost_rejected(self) -> None: + with pytest.raises(ValidationError, match="cost_per_task"): + TaskCompletionMetrics( + agent_id="agent-001", + turns_per_task=0, + tokens_per_task=0, + cost_per_task=-0.01, + duration_seconds=0.0, + ) + + def test_negative_duration_rejected(self) -> None: + with pytest.raises(ValidationError, match="duration_seconds"): + TaskCompletionMetrics( + agent_id="agent-001", + turns_per_task=0, + tokens_per_task=0, + cost_per_task=0.0, + duration_seconds=-1.0, + ) + + def test_blank_task_id_rejected(self) -> None: + with pytest.raises(ValidationError, match="task_id"): + TaskCompletionMetrics( + task_id=" ", + agent_id="agent-001", + turns_per_task=0, + tokens_per_task=0, + cost_per_task=0.0, + duration_seconds=0.0, + ) + + +@pytest.mark.unit +class TestTaskCompletionMetricsFromRunResult: + """Factory method extracts values from AgentRunResult.""" + + def _make_run_result( + self, + sample_agent_context: AgentContext, + *, + turns: tuple[TurnRecord, ...] = (), + cost_usd: float = 0.0, + input_tokens: int = 0, + output_tokens: int = 0, + ) -> AgentRunResult: + """Build a minimal AgentRunResult for testing.""" + ctx = sample_agent_context + if input_tokens or output_tokens: + ctx = ctx.model_copy( + update={ + "accumulated_cost": TokenUsage( + input_tokens=input_tokens, + output_tokens=output_tokens, + cost_usd=cost_usd, + ), + }, + ) + execution_result = ExecutionResult( + context=ctx, + termination_reason=TerminationReason.COMPLETED, + turns=turns, + ) + prompt = SystemPrompt( + content="test", + template_version="v1", + estimated_tokens=10, + sections=(), + metadata={}, + ) + return AgentRunResult( + execution_result=execution_result, + system_prompt=prompt, + duration_seconds=5.0, + agent_id=str(sample_agent_context.identity.id), + task_id="task-001", + ) + + def test_from_run_result_extracts_values( + self, + sample_agent_context: AgentContext, + ) -> None: + turns = ( + TurnRecord( + turn_number=1, + input_tokens=100, + output_tokens=50, + cost_usd=0.01, + finish_reason=FinishReason.STOP, + ), + TurnRecord( + turn_number=2, + input_tokens=200, + output_tokens=80, + cost_usd=0.02, + finish_reason=FinishReason.STOP, + ), + ) + result = self._make_run_result( + sample_agent_context, + turns=turns, + input_tokens=300, + output_tokens=130, + cost_usd=0.03, + ) + metrics = TaskCompletionMetrics.from_run_result(result) + + assert metrics.task_id == "task-001" + assert metrics.agent_id == str( + sample_agent_context.identity.id, + ) + assert metrics.turns_per_task == 2 + assert metrics.tokens_per_task == 430 # 300 + 130 + assert metrics.cost_per_task == 0.03 + assert metrics.duration_seconds == 5.0 + + def test_from_run_result_zero_turns( + self, + sample_agent_context: AgentContext, + ) -> None: + result = self._make_run_result(sample_agent_context) + metrics = TaskCompletionMetrics.from_run_result(result) + + assert metrics.turns_per_task == 0 + assert metrics.tokens_per_task == 0 + assert metrics.cost_per_task == 0.0 diff --git a/tests/unit/engine/test_run_result.py b/tests/unit/engine/test_run_result.py index 888d33d2fd..3ca70e2b9e 100644 --- a/tests/unit/engine/test_run_result.py +++ b/tests/unit/engine/test_run_result.py @@ -21,8 +21,8 @@ ) from ai_company.engine.prompt import SystemPrompt from ai_company.engine.run_result import AgentRunResult -from ai_company.providers.enums import FinishReason -from ai_company.providers.models import TokenUsage +from ai_company.providers.enums import FinishReason, MessageRole +from ai_company.providers.models import ChatMessage, TokenUsage, ToolCall def _test_identity() -> AgentIdentity: @@ -423,3 +423,73 @@ def test_checker_returns_true_over_budget(self) -> None: }, ) assert checker(ctx) is True + + +def _make_result_with_messages( + *messages: ChatMessage, +) -> AgentRunResult: + """Build an AgentRunResult with specific messages in conversation.""" + identity = _test_identity() + ctx = AgentContext.from_identity(identity) + for msg in messages: + ctx = ctx.with_message(msg) + execution = ExecutionResult( + context=ctx, + termination_reason=TerminationReason.COMPLETED, + ) + prompt = SystemPrompt( + content="", + template_version="1.0", + estimated_tokens=0, + sections=(), + metadata={}, + ) + return AgentRunResult( + execution_result=execution, + system_prompt=prompt, + duration_seconds=1.0, + agent_id="agent-001", + ) + + +@pytest.mark.unit +class TestCompletionSummary: + """completion_summary returns last assistant message content.""" + + def test_returns_last_assistant_content(self) -> None: + result = _make_result_with_messages( + ChatMessage(role=MessageRole.ASSISTANT, content="First"), + ChatMessage(role=MessageRole.USER, content="Follow up"), + ChatMessage(role=MessageRole.ASSISTANT, content="Final answer"), + ) + assert result.completion_summary == "Final answer" + + def test_returns_none_when_no_assistant_messages(self) -> None: + result = _make_result_with_messages( + ChatMessage(role=MessageRole.USER, content="Hello"), + ) + assert result.completion_summary is None + + def test_returns_none_for_empty_conversation(self) -> None: + result = _make_result_with_messages() + assert result.completion_summary is None + + def test_skips_tool_call_only_messages(self) -> None: + """Assistant message with tool_calls but no content is skipped.""" + result = _make_result_with_messages( + ChatMessage(role=MessageRole.ASSISTANT, content="Before tool"), + ChatMessage( + role=MessageRole.ASSISTANT, + content=None, + tool_calls=(ToolCall(id="call-1", name="test_tool", arguments={}),), + ), + ) + assert result.completion_summary == "Before tool" + + def test_skips_empty_string_content(self) -> None: + """Assistant message with empty string content is skipped.""" + result = _make_result_with_messages( + ChatMessage(role=MessageRole.ASSISTANT, content="Real content"), + ChatMessage(role=MessageRole.ASSISTANT, content=""), + ) + assert result.completion_summary == "Real content"