Aureliolo · Aureliolo · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 14, 2026
@@ -104,7 +104,7 @@ src/ai_company/
   communication/  # Message bus, dispatcher, messenger, channels, delegation, loop prevention, conflict resolution, meeting protocol
   config/         # YAML company config loading and validation
   core/           # Shared domain models, base classes, and resilience config (RetryConfig, RateLimiterConfig)
-  engine/         # Agent orchestration, execution loops, parallel execution, task decomposition, routing, task assignment, centralized single-writer task state engine (TaskEngine), task lifecycle, recovery, shutdown, workspace isolation, coordination (multi-agent pipeline: TopologyDispatcher protocol, 4 dispatchers — SAS/centralized/decentralized/context-dependent, wave execution, workspace lifecycle integration), coordination error classification, and prompt policy validation
+  engine/         # Agent orchestration, execution loops, parallel execution, task decomposition, routing, task assignment, centralized single-writer task state engine (TaskEngine), task lifecycle, recovery, shutdown, workspace isolation, coordination (multi-agent pipeline: TopologyDispatcher protocol, 4 dispatchers — SAS/centralized/decentralized/context-dependent, wave execution, workspace lifecycle integration), coordination error classification, prompt policy validation, and checkpoint recovery (checkpoint/, per-turn persistence, heartbeat detection, CheckpointRecoveryStrategy)
   hr/             # HR engine: hiring, firing, onboarding, offboarding, agent registry, performance tracking (task metrics, collaboration scoring, trend detection), promotion/demotion (criteria evaluation, approval strategies, model mapping)
   memory/         # Persistent agent memory (pluggable MemoryBackend protocol), backends/ (Mem0 adapter: backends/mem0/), retrieval pipeline (ranking, injection, context formatting, non-inferable filtering), shared org memory (org/), consolidation/archival (consolidation/)
   persistence/    # Operational data persistence — pluggable PersistenceBackend protocol, SQLite initial (see Memory & Persistence design page)
@@ -151,7 +151,7 @@ web/              # Vue 3 + PrimeVue + Tailwind CSS dashboard
 - **Every module** with business logic MUST have: `from ai_company.observability import get_logger` then `logger = get_logger(__name__)`
 - **Never** use `import logging` / `logging.getLogger()` / `print()` in application code
 - **Variable name**: always `logger` (not `_logger`, not `log`)
-- **Event names**: always use constants from the domain-specific module under `ai_company.observability.events` (e.g. `PROVIDER_CALL_START` from `events.provider`, `BUDGET_RECORD_ADDED` from `events.budget`, `CFO_ANOMALY_DETECTED` from `events.cfo`, `CONFLICT_DETECTED` from `events.conflict`, `MEETING_STARTED` from `events.meeting`, `CLASSIFICATION_START` from `events.classification`, `CONSOLIDATION_START` from `events.consolidation`, `ORG_MEMORY_QUERY_START` from `events.org_memory`, `API_REQUEST_STARTED` from `events.api`, `API_ROUTE_NOT_FOUND` from `events.api`, `CODE_RUNNER_EXECUTE_START` from `events.code_runner`, `DOCKER_EXECUTE_START` from `events.docker`, `MCP_INVOKE_START` from `events.mcp`, `SECURITY_EVALUATE_START` from `events.security`, `HR_HIRING_REQUEST_CREATED` from `events.hr`, `PERF_METRIC_RECORDED` from `events.performance`, `TRUST_EVALUATE_START` from `events.trust`, `PROMOTION_EVALUATE_START` from `events.promotion`, `PROMPT_BUILD_START` from `events.prompt`, `MEMORY_RETRIEVAL_START` from `events.memory`, `MEMORY_BACKEND_CONNECTED` from `events.memory`, `MEMORY_ENTRY_STORED` from `events.memory`, `MEMORY_BACKEND_SYSTEM_ERROR` from `events.memory`, `AUTONOMY_ACTION_AUTO_APPROVED` from `events.autonomy`, `TIMEOUT_POLICY_EVALUATED` from `events.timeout`, `PERSISTENCE_AUDIT_ENTRY_SAVED` from `events.persistence`, `TASK_ENGINE_STARTED` from `events.task_engine`, `COORDINATION_STARTED` from `events.coordination`, `COMMUNICATION_DISPATCH_START` from `events.communication`, `COMPANY_STARTED` from `events.company`, `CONFIG_LOADED` from `events.config`, `CORRELATION_ID_CREATED` from `events.correlation`, `DECOMPOSITION_STARTED` from `events.decomposition`, `DELEGATION_STARTED` from `events.delegation`, `EXECUTION_LOOP_STARTED` from `events.execution`, `GIT_OPERATION_START` from `events.git`, `PARALLEL_EXECUTION_STARTED` from `events.parallel`, `PERSONALITY_LOADED` from `events.personality`, `QUOTA_CHECKED` from `events.quota`, `ROLE_ASSIGNED` from 
`events.role`, `ROUTING_STARTED` from `events.routing`, `SANDBOX_EXECUTE_START` from `events.sandbox`, `TASK_CREATED` from `events.task`, `TASK_ASSIGNMENT_STARTED` from `events.task_assignment`, `TASK_ROUTING_STARTED` from `events.task_routing`, `TEMPLATE_LOADED` from `events.template`, `TOOL_INVOKE_START` from `events.tool`, `WORKSPACE_CREATED` from `events.workspace`). Import directly: `from ai_company.observability.events.<domain> import EVENT_CONSTANT`
+- **Event names**: always use constants from the domain-specific module under `ai_company.observability.events` (e.g. `PROVIDER_CALL_START` from `events.provider`, `BUDGET_RECORD_ADDED` from `events.budget`, `CFO_ANOMALY_DETECTED` from `events.cfo`, `CONFLICT_DETECTED` from `events.conflict`, `MEETING_STARTED` from `events.meeting`, `CLASSIFICATION_START` from `events.classification`, `CONSOLIDATION_START` from `events.consolidation`, `ORG_MEMORY_QUERY_START` from `events.org_memory`, `API_REQUEST_STARTED` from `events.api`, `API_ROUTE_NOT_FOUND` from `events.api`, `CODE_RUNNER_EXECUTE_START` from `events.code_runner`, `DOCKER_EXECUTE_START` from `events.docker`, `MCP_INVOKE_START` from `events.mcp`, `SECURITY_EVALUATE_START` from `events.security`, `HR_HIRING_REQUEST_CREATED` from `events.hr`, `PERF_METRIC_RECORDED` from `events.performance`, `TRUST_EVALUATE_START` from `events.trust`, `PROMOTION_EVALUATE_START` from `events.promotion`, `PROMPT_BUILD_START` from `events.prompt`, `MEMORY_RETRIEVAL_START` from `events.memory`, `MEMORY_BACKEND_CONNECTED` from `events.memory`, `MEMORY_ENTRY_STORED` from `events.memory`, `MEMORY_BACKEND_SYSTEM_ERROR` from `events.memory`, `AUTONOMY_ACTION_AUTO_APPROVED` from `events.autonomy`, `TIMEOUT_POLICY_EVALUATED` from `events.timeout`, `PERSISTENCE_AUDIT_ENTRY_SAVED` from `events.persistence`, `TASK_ENGINE_STARTED` from `events.task_engine`, `COORDINATION_STARTED` from `events.coordination`, `COMMUNICATION_DISPATCH_START` from `events.communication`, `COMPANY_STARTED` from `events.company`, `CONFIG_LOADED` from `events.config`, `CORRELATION_ID_CREATED` from `events.correlation`, `DECOMPOSITION_STARTED` from `events.decomposition`, `DELEGATION_STARTED` from `events.delegation`, `EXECUTION_LOOP_START` from `events.execution`, `CHECKPOINT_SAVED` from `events.checkpoint`, `PERSISTENCE_CHECKPOINT_SAVED` from `events.persistence`, `GIT_OPERATION_START` from `events.git`, `PARALLEL_GROUP_START` from `events.parallel`, 
`PERSONALITY_LOADED` from `events.personality`, `QUOTA_CHECKED` from `events.quota`, `ROLE_ASSIGNED` from `events.role`, `ROUTING_STARTED` from `events.routing`, `SANDBOX_EXECUTE_START` from `events.sandbox`, `TASK_CREATED` from `events.task`, `TASK_ASSIGNMENT_STARTED` from `events.task_assignment`, `TASK_ROUTING_STARTED` from `events.task_routing`, `TEMPLATE_LOADED` from `events.template`, `TOOL_INVOKE_START` from `events.tool`, `WORKSPACE_CREATED` from `events.workspace`). Import directly: `from ai_company.observability.events.<domain> import EVENT_CONSTANT`
 - **Structured kwargs**: always `logger.info(EVENT, key=value)` — never `logger.info("msg %s", val)`
 - **All error paths** must log at WARNING or ERROR with context before raising
 - **All state transitions** must log at INFO

@@ -31,7 +31,7 @@ The framework is provider-agnostic (any LLM via LiteLLM), configuration-driven (
 
 **Agent Orchestration**
 
-Define agents with roles, models, and tools. The engine handles task decomposition, routing, execution loops (ReAct, Plan-and-Execute), and multi-agent coordination.
+Define agents with roles, models, and tools. The engine handles task decomposition, routing, execution loops (ReAct, Plan-and-Execute), crash recovery (checkpoint resume), and multi-agent coordination.
 
 </td>
 <td width="33%">
@@ -111,6 +111,7 @@ graph TB
     Observability[Observability] -.-> Engine
     Persistence[Persistence] -.-> HR
     Persistence -.-> Security
+    Persistence -.-> Engine
 ```
 
 ## Documentation

@@ -508,6 +508,9 @@ implemented behind a `RecoveryStrategy` protocol, making the system pluggable.
 | `strategy_type` | `NotBlankStr` | Strategy identifier |
 | `context_snapshot` | `AgentContextSnapshot` | Redacted snapshot (turn count, accumulated cost, message count, max turns -- no message contents) |
 | `error_message` | `NotBlankStr` | Error that triggered recovery |
+| `checkpoint_context_json` | `str \| None` | Serialized `AgentContext` for resume (`None` for non-checkpoint strategies) |
+| `resume_attempt` | `int` (ge=0) | Current resume attempt number (0 when not resuming) |
+| `can_resume` | `bool` (computed) | `checkpoint_context_json is not None` |
 | `can_reassign` | `bool` (computed) | `retry_count < task.max_retries` |
 
 ### Recovery Strategies
@@ -547,9 +550,6 @@ implemented behind a `RecoveryStrategy` protocol, making the system pluggable.
 
 === "Strategy 2: Checkpoint Recovery"
 
-    !!! warning "Planned"
-        Checkpoint recovery is planned for a future release.
-
     The engine persists an `AgentContext` snapshot after each completed turn. On
     crash, the framework detects the failure (via heartbeat timeout or
     exception), loads the last checkpoint, and resumes execution from the exact
@@ -562,21 +562,21 @@ implemented behind a `RecoveryStrategy` protocol, making the system pluggable.
       strategy: "checkpoint"
       checkpoint:
         persist_every_n_turns: 1           # checkpoint frequency
-        storage: "sqlite"                  # sqlite, filesystem
+        # Storage backend determined by the injected CheckpointRepository
         heartbeat_interval_seconds: 30     # detect unresponsive agents
         max_resume_attempts: 2             # retry limit before falling back to fail_reassign
     ```
 
     - Preserves progress -- critical for long tasks (multi-step plans,
       epic-level work)
-    - Requires persistence layer and environment state reconciliation on resume
+    - Requires persistence layer and reconciliation message on resume
     - Natural fit with the existing immutable state model
 
-    When resuming from a checkpoint, the agent's tools and workspace may have
-    changed (other agents modified files, external state drifted). The
-    checkpoint strategy includes a reconciliation step: the resumed agent
-    receives a summary of changes since the checkpoint timestamp and can adapt
-    its plan accordingly.
+    When resuming from a checkpoint, the agent receives a system message
+    informing it of the resume point (turn number) and the error that triggered
+    recovery. This reconciliation message allows the agent to review its
+    progress and adapt. Richer reconciliation (e.g. workspace change
+    detection) is planned for a future iteration.
 
 ---
 

@@ -28,6 +28,14 @@
     TaskAssignmentStrategy,
     build_strategy_map,
 )
+from ai_company.engine.checkpoint import (
+    Checkpoint,
+    CheckpointCallback,
+    CheckpointConfig,
+    CheckpointRecoveryStrategy,
+    Heartbeat,
+    make_checkpoint_callback,
+)
 from ai_company.engine.classification import (
     ClassificationResult,
     ErrorFinding,
@@ -208,6 +216,10 @@
     "BudgetChecker",
     "CancelTaskMutation",
     "CentralizedDispatcher",
+    "Checkpoint",
+    "CheckpointCallback",
+    "CheckpointConfig",
+    "CheckpointRecoveryStrategy",
     "ClassificationResult",
     "CleanupCallback",
     "ContextDependentDispatcher",
@@ -243,6 +255,7 @@
     "ExecutionResult",
     "ExecutionStateError",
     "FailAndReassignStrategy",
+    "Heartbeat",
     "HierarchicalAssignmentStrategy",
     "InMemoryResourceLock",
     "LlmDecompositionConfig",
@@ -332,5 +345,6 @@
     "build_strategy_map",
     "build_system_prompt",
     "classify_execution_errors",
+    "make_checkpoint_callback",
     "select_dispatcher",
 ]