Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ See `web/CLAUDE.md` for the full component inventory, design token rules, and po
- **Every module** with business logic MUST have: `from synthorg.observability import get_logger` then `logger = get_logger(__name__)`
- **Never** use `import logging` / `logging.getLogger()` / `print()` in application code (exception: `observability/setup.py`, `observability/sinks.py`, `observability/syslog_handler.py`, `observability/http_handler.py`, and `observability/otlp_handler.py` may use stdlib `logging` and `print(..., file=sys.stderr)` for handler construction, bootstrap, and error reporting code that runs before or during logging system configuration)
- **Variable name**: always `logger` (not `_logger`, not `log`)
- **Event names**: always use constants from the domain-specific module under `synthorg.observability.events` (e.g., `API_REQUEST_STARTED` from `events.api`, `TOOL_INVOKE_START` from `events.tool`, `GIT_COMMAND_START` from `events.git`, `CONTEXT_BUDGET_FILL_UPDATED`, `CONTEXT_BUDGET_COMPACTION_STARTED`, `CONTEXT_BUDGET_COMPACTION_COMPLETED`, `CONTEXT_BUDGET_COMPACTION_FAILED`, `CONTEXT_BUDGET_COMPACTION_SKIPPED`, `CONTEXT_BUDGET_COMPACTION_FALLBACK`, `CONTEXT_BUDGET_INDICATOR_INJECTED`, `CONTEXT_BUDGET_AGENT_COMPACTION_REQUESTED`, `CONTEXT_BUDGET_EPISTEMIC_MARKERS_PRESERVED` from `events.context_budget`, `BACKUP_STARTED` from `events.backup`, `SETUP_COMPLETED` from `events.setup`, `ROUTING_CANDIDATE_SELECTED` from `events.routing`, `SHIPPING_HTTP_BATCH_SENT` from `events.shipping`, `EVAL_REPORT_COMPUTED` from `events.evaluation`, `PROMPT_PROFILE_SELECTED` from `events.prompt`, `PROCEDURAL_MEMORY_START` from `events.procedural_memory`, `PERF_LLM_JUDGE_STARTED` from `events.performance`, `TASK_ENGINE_OBSERVER_FAILED` from `events.task_engine`, `TASK_ASSIGNMENT_PROJECT_FILTERED` and `TASK_ASSIGNMENT_PROJECT_NO_ELIGIBLE` from `events.task_assignment`, `WORKFLOW_EXEC_COMPLETED` from `events.workflow_execution`, `BLUEPRINT_INSTANTIATE_START` from `events.blueprint`, `WORKFLOW_DEF_ROLLED_BACK` from `events.workflow_definition`, `WORKFLOW_VERSION_SAVED` from `events.workflow_version`, `MEMORY_FINE_TUNE_STARTED`, `MEMORY_SELF_EDIT_TOOL_EXECUTE`, `MEMORY_SELF_EDIT_CORE_READ`, `MEMORY_SELF_EDIT_CORE_WRITE`, `MEMORY_SELF_EDIT_CORE_WRITE_REJECTED`, `MEMORY_SELF_EDIT_ARCHIVAL_SEARCH`, `MEMORY_SELF_EDIT_ARCHIVAL_WRITE`, `MEMORY_SELF_EDIT_RECALL_READ`, `MEMORY_SELF_EDIT_RECALL_WRITE`, `MEMORY_SELF_EDIT_WRITE_FAILED` from `events.memory`, `REPORTING_GENERATION_STARTED` from `events.reporting`, `RISK_BUDGET_SCORE_COMPUTED` from `events.risk_budget`, `BUDGET_PROJECT_COST_QUERIED`, `BUDGET_PROJECT_RECORDS_QUERIED`, `BUDGET_PROJECT_BUDGET_EXCEEDED`, and 
`BUDGET_PROJECT_ENFORCEMENT_CHECK` from `events.budget`, `LLM_STRATEGY_SYNTHESIZED` and `DISTILLATION_CAPTURED` from `events.consolidation`, `MEMORY_DIVERSITY_RERANKED`, `MEMORY_DIVERSITY_RERANK_FAILED`, and `MEMORY_REFORMULATION_ROUND` from `events.memory`, `NOTIFICATION_DISPATCHED` and `NOTIFICATION_DISPATCH_FAILED` from `events.notification`, `QUALITY_STEP_CLASSIFIED` from `events.quality`, `HEALTH_TICKET_EMITTED` from `events.health`, `TRAJECTORY_SCORING_START` from `events.trajectory`, `COORD_METRICS_AMDAHL_COMPUTED` from `events.coordination_metrics`, `COORDINATION_STARTED`, `COORDINATION_COMPLETED`, `COORDINATION_FAILED`, `COORDINATION_PHASE_STARTED`, `COORDINATION_PHASE_COMPLETED`, `COORDINATION_PHASE_FAILED`, `COORDINATION_WAVE_STARTED`, `COORDINATION_WAVE_COMPLETED`, `COORDINATION_TOPOLOGY_RESOLVED`, `COORDINATION_CLEANUP_STARTED`, `COORDINATION_CLEANUP_COMPLETED`, `COORDINATION_CLEANUP_FAILED`, `COORDINATION_WAVE_BUILT`, `COORDINATION_FACTORY_BUILT`, and `COORDINATION_ATTRIBUTION_BUILT` from `events.coordination`, `WEB_REQUEST_START` and `WEB_SSRF_BLOCKED` from `events.web`, `DB_QUERY_START` and `DB_WRITE_BLOCKED` from `events.database`, `TERMINAL_COMMAND_START` and `TERMINAL_COMMAND_BLOCKED` from `events.terminal`, `SUB_CONSTRAINT_RESOLVED` and `SUB_CONSTRAINT_DENIED` from `events.sub_constraint`, `VERSION_SAVED` and `VERSION_SNAPSHOT_FAILED` from `events.versioning`, `ANALYTICS_AGGREGATION_COMPUTED` and `ANALYTICS_RETRY_RATE_ALERT` from `events.analytics`, `CALL_CLASSIFICATION_COMPUTED` from `events.call_classification`, `QUOTA_THRESHOLD_ALERT` and `QUOTA_POLL_FAILED` from `events.quota`, `CONFLICT_DEBATE_EVALUATOR_FAILED` from `events.conflict`, `DELEGATION_LOOP_CIRCUIT_BACKOFF` and `DELEGATION_LOOP_CIRCUIT_PERSIST_FAILED` from `events.delegation`, `MEETING_EVENT_COOLDOWN_SKIPPED` and `MEETING_TASKS_CAPPED` from `events.meeting`, `PERSISTENCE_CIRCUIT_BREAKER_SAVED`, `PERSISTENCE_CIRCUIT_BREAKER_SAVE_FAILED`, `PERSISTENCE_CIRCUIT_BREAKER_LOADED`, 
`PERSISTENCE_CIRCUIT_BREAKER_LOAD_FAILED`, `PERSISTENCE_CIRCUIT_BREAKER_DELETED`, and `PERSISTENCE_CIRCUIT_BREAKER_DELETE_FAILED` from `events.persistence`, `METRICS_SCRAPE_COMPLETED`, `METRICS_SCRAPE_FAILED`, `METRICS_COLLECTOR_INITIALIZED`, `METRICS_COORDINATION_RECORDED`, `METRICS_OTLP_EXPORT_COMPLETED` and `METRICS_OTLP_FLUSHER_STOPPED` from `events.metrics`, `EXECUTION_PROJECT_VALIDATION_FAILED` from `events.execution`, `ORG_MEMORY_QUERY_START`, `ORG_MEMORY_QUERY_COMPLETE`, `ORG_MEMORY_QUERY_FAILED`, `ORG_MEMORY_WRITE_START`, `ORG_MEMORY_WRITE_COMPLETE`, `ORG_MEMORY_WRITE_DENIED`, `ORG_MEMORY_WRITE_FAILED`, `ORG_MEMORY_POLICIES_LISTED`, `ORG_MEMORY_BACKEND_CREATED`, `ORG_MEMORY_CONNECT_FAILED`, `ORG_MEMORY_DISCONNECT_FAILED`, `ORG_MEMORY_NOT_CONNECTED`, `ORG_MEMORY_ROW_PARSE_FAILED`, `ORG_MEMORY_CONFIG_INVALID`, `ORG_MEMORY_MODEL_INVALID`, `ORG_MEMORY_MVCC_PUBLISH_APPENDED`, `ORG_MEMORY_MVCC_RETRACT_APPENDED`, `ORG_MEMORY_MVCC_SNAPSHOT_AT_QUERIED`, and `ORG_MEMORY_MVCC_LOG_QUERIED` from `events.org_memory`). Each domain has its own module -- see `src/synthorg/observability/events/` for the full inventory of constants. Import directly: `from synthorg.observability.events.<domain> import EVENT_CONSTANT`
- **Event names**: always use constants from the domain-specific module under `synthorg.observability.events` (e.g., `API_REQUEST_STARTED` from `events.api`, `TOOL_INVOKE_START` from `events.tool`, `GIT_COMMAND_START` from `events.git`, `CONTEXT_BUDGET_FILL_UPDATED`, `CONTEXT_BUDGET_COMPACTION_STARTED`, `CONTEXT_BUDGET_COMPACTION_COMPLETED`, `CONTEXT_BUDGET_COMPACTION_FAILED`, `CONTEXT_BUDGET_COMPACTION_SKIPPED`, `CONTEXT_BUDGET_COMPACTION_FALLBACK`, `CONTEXT_BUDGET_INDICATOR_INJECTED`, `CONTEXT_BUDGET_AGENT_COMPACTION_REQUESTED`, `CONTEXT_BUDGET_EPISTEMIC_MARKERS_PRESERVED` from `events.context_budget`, `BACKUP_STARTED` from `events.backup`, `SETUP_COMPLETED` from `events.setup`, `ROUTING_CANDIDATE_SELECTED` from `events.routing`, `SHIPPING_HTTP_BATCH_SENT` from `events.shipping`, `EVAL_REPORT_COMPUTED` from `events.evaluation`, `PROMPT_PROFILE_SELECTED` from `events.prompt`, `PROCEDURAL_MEMORY_START` from `events.procedural_memory`, `PERF_LLM_JUDGE_STARTED` from `events.performance`, `TASK_ENGINE_OBSERVER_FAILED` from `events.task_engine`, `TASK_ASSIGNMENT_PROJECT_FILTERED` and `TASK_ASSIGNMENT_PROJECT_NO_ELIGIBLE` from `events.task_assignment`, `EXECUTION_SHUTDOWN_IMMEDIATE_CANCEL`, `EXECUTION_SHUTDOWN_TOOL_WAIT`, `EXECUTION_SHUTDOWN_CHECKPOINT_SAVE`, `EXECUTION_SHUTDOWN_CHECKPOINT_FAILED`, and `EXECUTION_PROJECT_VALIDATION_FAILED` from `events.execution`, `WORKFLOW_EXEC_COMPLETED` from `events.workflow_execution`, `BLUEPRINT_INSTANTIATE_START` from `events.blueprint`, `WORKFLOW_DEF_ROLLED_BACK` from `events.workflow_definition`, `WORKFLOW_VERSION_SAVED` from `events.workflow_version`, `MEMORY_FINE_TUNE_STARTED`, `MEMORY_SELF_EDIT_TOOL_EXECUTE`, `MEMORY_SELF_EDIT_CORE_READ`, `MEMORY_SELF_EDIT_CORE_WRITE`, `MEMORY_SELF_EDIT_CORE_WRITE_REJECTED`, `MEMORY_SELF_EDIT_ARCHIVAL_SEARCH`, `MEMORY_SELF_EDIT_ARCHIVAL_WRITE`, `MEMORY_SELF_EDIT_RECALL_READ`, `MEMORY_SELF_EDIT_RECALL_WRITE`, `MEMORY_SELF_EDIT_WRITE_FAILED` from `events.memory`, `REPORTING_GENERATION_STARTED` 
from `events.reporting`, `RISK_BUDGET_SCORE_COMPUTED` from `events.risk_budget`, `BUDGET_PROJECT_COST_QUERIED`, `BUDGET_PROJECT_RECORDS_QUERIED`, `BUDGET_PROJECT_BUDGET_EXCEEDED`, and `BUDGET_PROJECT_ENFORCEMENT_CHECK` from `events.budget`, `LLM_STRATEGY_SYNTHESIZED` and `DISTILLATION_CAPTURED` from `events.consolidation`, `MEMORY_DIVERSITY_RERANKED`, `MEMORY_DIVERSITY_RERANK_FAILED`, and `MEMORY_REFORMULATION_ROUND` from `events.memory`, `NOTIFICATION_DISPATCHED` and `NOTIFICATION_DISPATCH_FAILED` from `events.notification`, `QUALITY_STEP_CLASSIFIED` from `events.quality`, `HEALTH_TICKET_EMITTED` from `events.health`, `TRAJECTORY_SCORING_START` from `events.trajectory`, `COORD_METRICS_AMDAHL_COMPUTED` from `events.coordination_metrics`, `COORDINATION_STARTED`, `COORDINATION_COMPLETED`, `COORDINATION_FAILED`, `COORDINATION_PHASE_STARTED`, `COORDINATION_PHASE_COMPLETED`, `COORDINATION_PHASE_FAILED`, `COORDINATION_WAVE_STARTED`, `COORDINATION_WAVE_COMPLETED`, `COORDINATION_TOPOLOGY_RESOLVED`, `COORDINATION_CLEANUP_STARTED`, `COORDINATION_CLEANUP_COMPLETED`, `COORDINATION_CLEANUP_FAILED`, `COORDINATION_WAVE_BUILT`, `COORDINATION_FACTORY_BUILT`, and `COORDINATION_ATTRIBUTION_BUILT` from `events.coordination`, `WEB_REQUEST_START` and `WEB_SSRF_BLOCKED` from `events.web`, `DB_QUERY_START` and `DB_WRITE_BLOCKED` from `events.database`, `TERMINAL_COMMAND_START` and `TERMINAL_COMMAND_BLOCKED` from `events.terminal`, `SUB_CONSTRAINT_RESOLVED` and `SUB_CONSTRAINT_DENIED` from `events.sub_constraint`, `VERSION_SAVED` and `VERSION_SNAPSHOT_FAILED` from `events.versioning`, `ANALYTICS_AGGREGATION_COMPUTED` and `ANALYTICS_RETRY_RATE_ALERT` from `events.analytics`, `CALL_CLASSIFICATION_COMPUTED` from `events.call_classification`, `QUOTA_THRESHOLD_ALERT` and `QUOTA_POLL_FAILED` from `events.quota`, `CONFLICT_DEBATE_EVALUATOR_FAILED` from `events.conflict`, `DELEGATION_LOOP_CIRCUIT_BACKOFF` and `DELEGATION_LOOP_CIRCUIT_PERSIST_FAILED` from `events.delegation`, 
`MEETING_EVENT_COOLDOWN_SKIPPED` and `MEETING_TASKS_CAPPED` from `events.meeting`, `PERSISTENCE_CIRCUIT_BREAKER_SAVED`, `PERSISTENCE_CIRCUIT_BREAKER_SAVE_FAILED`, `PERSISTENCE_CIRCUIT_BREAKER_LOADED`, `PERSISTENCE_CIRCUIT_BREAKER_LOAD_FAILED`, `PERSISTENCE_CIRCUIT_BREAKER_DELETED`, and `PERSISTENCE_CIRCUIT_BREAKER_DELETE_FAILED` from `events.persistence`, `METRICS_SCRAPE_COMPLETED`, `METRICS_SCRAPE_FAILED`, `METRICS_COLLECTOR_INITIALIZED`, `METRICS_COORDINATION_RECORDED`, `METRICS_OTLP_EXPORT_COMPLETED` and `METRICS_OTLP_FLUSHER_STOPPED` from `events.metrics`, `ORG_MEMORY_QUERY_START`, `ORG_MEMORY_QUERY_COMPLETE`, `ORG_MEMORY_QUERY_FAILED`, `ORG_MEMORY_WRITE_START`, `ORG_MEMORY_WRITE_COMPLETE`, `ORG_MEMORY_WRITE_DENIED`, `ORG_MEMORY_WRITE_FAILED`, `ORG_MEMORY_POLICIES_LISTED`, `ORG_MEMORY_BACKEND_CREATED`, `ORG_MEMORY_CONNECT_FAILED`, `ORG_MEMORY_DISCONNECT_FAILED`, `ORG_MEMORY_NOT_CONNECTED`, `ORG_MEMORY_ROW_PARSE_FAILED`, `ORG_MEMORY_CONFIG_INVALID`, `ORG_MEMORY_MODEL_INVALID`, `ORG_MEMORY_MVCC_PUBLISH_APPENDED`, `ORG_MEMORY_MVCC_RETRACT_APPENDED`, `ORG_MEMORY_MVCC_SNAPSHOT_AT_QUERIED`, and `ORG_MEMORY_MVCC_LOG_QUERIED` from `events.org_memory`). Each domain has its own module -- see `src/synthorg/observability/events/` for the full inventory of constants. Import directly: `from synthorg.observability.events.<domain> import EVENT_CONSTANT`
- **Structured kwargs**: always `logger.info(EVENT, key=value)` -- never `logger.info("msg %s", val)`
- **All error paths** must log at WARNING or ERROR with context before raising
- **All state transitions** must log at INFO
Expand Down
93 changes: 66 additions & 27 deletions docs/design/engine.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ stateDiagram-v2
ASSIGNED --> BLOCKED : blocked
ASSIGNED --> CANCELLED : cancelled
ASSIGNED --> INTERRUPTED : shutdown signal
ASSIGNED --> SUSPENDED : checkpoint shutdown

IN_PROGRESS --> IN_REVIEW : agent done
IN_PROGRESS --> FAILED : runtime crash
IN_PROGRESS --> CANCELLED : cancelled
IN_PROGRESS --> INTERRUPTED : shutdown signal
IN_PROGRESS --> SUSPENDED : checkpoint shutdown

IN_REVIEW --> COMPLETED : approved
IN_REVIEW --> IN_PROGRESS : rework
Expand All @@ -40,18 +42,22 @@ stateDiagram-v2

INTERRUPTED --> ASSIGNED : reassign on restart

SUSPENDED --> ASSIGNED : resume from checkpoint

COMPLETED --> [*]
CANCELLED --> [*]
```

!!! info "Non-terminal states"
`BLOCKED`, `FAILED`, and `INTERRUPTED` are non-terminal:
`BLOCKED`, `FAILED`, `INTERRUPTED`, and `SUSPENDED` are non-terminal:

- **BLOCKED** returns to `ASSIGNED` when unblocked.
- **FAILED** returns to `ASSIGNED` for retry when `retry_count < max_retries`
(see [Crash Recovery](#agent-crash-recovery)).
- **INTERRUPTED** returns to `ASSIGNED` on restart
(see [Graceful Shutdown](#graceful-shutdown-protocol)).
- **SUSPENDED** returns to `ASSIGNED` to resume from its checkpoint
(see [Graceful Shutdown](#graceful-shutdown-protocol), Strategy 4).
- **COMPLETED** and **CANCELLED** are the only terminal states with no
outgoing transitions.

Expand Down Expand Up @@ -140,7 +146,7 @@ Backlog | Ready | In Progress | Review | Done
The `KanbanColumn` enum defines five columns that map bidirectionally to
`TaskStatus` (Backlog=CREATED, Ready=ASSIGNED, In Progress=IN_PROGRESS,
Review=IN_REVIEW, Done=COMPLETED). Off-board statuses (BLOCKED, FAILED,
INTERRUPTED, CANCELLED) map to `None`. `KanbanConfig` provides per-column
INTERRUPTED, SUSPENDED, CANCELLED) map to `None`. `KanbanConfig` provides per-column
WIP limits with strict (hard-reject) or advisory (log-warning) enforcement.
Column transitions are validated independently and resolved to the underlying
task status transition path.
Expand Down Expand Up @@ -751,8 +757,9 @@ async run(
``DecisionRecord.metadata`` field (best-effort; lookup failure
is logged at WARNING and the decision record is still written).
See ``docs/design/agents.md`` for the full design.
- `SHUTDOWN` termination: current status -> INTERRUPTED
(see [Graceful Shutdown](#graceful-shutdown-protocol)).
- `SHUTDOWN` termination: current status -> INTERRUPTED (or SUSPENDED
if the checkpoint strategy successfully checkpointed the task;
see [Graceful Shutdown](#graceful-shutdown-protocol)).
- `ERROR` termination: recovery strategy is applied (default
`FailAndReassignStrategy` transitions to FAILED;
see [Crash Recovery](#agent-crash-recovery)).
Expand Down Expand Up @@ -1147,15 +1154,16 @@ The engine sets a shutdown event, stops accepting new tasks, and gives in-flight
agents a grace period to finish their current turn. Agents check the shutdown
event at turn boundaries (between LLM calls, before tool invocations) and exit
cooperatively. After the grace period, remaining agents are force-cancelled.
**All tasks terminated by shutdown -- whether they exited cooperatively or were
force-cancelled -- are marked `INTERRUPTED`** by the engine layer.
**All tasks terminated by this strategy -- whether they exited cooperatively or
were force-cancelled -- are marked `INTERRUPTED`** by the engine layer.
(Strategy 4 uses `SUSPENDED` for successfully checkpointed tasks instead;
see [Strategy 4](#strategy-4-checkpoint-and-stop).)

```yaml
graceful_shutdown:
strategy: "cooperative_timeout" # cooperative_timeout, immediate, finish_tool, checkpoint
cooperative_timeout:
grace_seconds: 30 # time for agents to finish cooperatively
cleanup_seconds: 5 # time for final cleanup (persist cost records, close connections)
grace_seconds: 30 # time for agents to finish cooperatively
cleanup_seconds: 5 # time for final cleanup (persist cost records, close connections)
```

On shutdown signal:
Expand Down Expand Up @@ -1188,25 +1196,56 @@ On shutdown signal:
minimum an input-cost audit record. Streaming calls are charged only for
tokens sent before disconnect.

### Future Strategies
### Strategy 2: Immediate Cancel

All agent tasks are cancelled immediately via `task.cancel()` with no grace
period. This is the fastest shutdown but carries the highest risk of data
loss: partial tool side effects and billed-but-lost LLM responses. Tasks are
marked `INTERRUPTED`.

```yaml
graceful_shutdown:
strategy: "immediate"
cleanup_seconds: 5
```

### Strategy 3: Finish Current Tool

Like cooperative timeout, but uses a per-tool timeout
(`tool_timeout_seconds`, default 60s) to allow the current tool invocation to
complete. The execution loop already finishes the current tool before checking
for shutdown at turn boundaries; this strategy gives that tool a longer window
in which to finish. Tasks that exceed the tool timeout are force-cancelled and
marked `INTERRUPTED`.

Strategy 2: Immediate Cancel
: All agent tasks are cancelled immediately via `task.cancel()`. Fastest
shutdown but highest data loss -- partial tool side effects, billed-but-lost
LLM responses.

Strategy 3: Finish Current Tool
: Like cooperative timeout, but waits for the current tool invocation to
complete even if it exceeds the grace period. Needs per-tool timeout as a
backstop for long-running sandboxed execution.

Strategy 4: Checkpoint and Stop
: On shutdown signal, each agent persists its full `AgentContext` snapshot and
transitions to `INTERRUPTED`. On restart, the engine loads checkpoints and
resumes execution. This naturally extends
[Checkpoint Recovery](#agent-crash-recovery) -- the only difference is
whether the checkpoint was written proactively (graceful shutdown) or loaded
from the last turn (crash recovery).
```yaml
graceful_shutdown:
strategy: "finish_tool"
tool_timeout_seconds: 60
cleanup_seconds: 5
```

### Strategy 4: Checkpoint and Stop

On shutdown signal, agents checkpoint cooperatively during the grace period.
Stragglers are checkpointed via a `checkpoint_saver` callback, then cancelled.
Successfully checkpointed tasks transition to `SUSPENDED` (not `INTERRUPTED`);
tasks whose checkpoint fails fall back to `INTERRUPTED`. On restart, the engine
loads the checkpoints and resumes execution from the exact point of
interruption. This naturally extends
[Checkpoint Recovery](#agent-crash-recovery) -- the only difference is whether
the checkpoint being loaded was written proactively at shutdown (graceful
shutdown) or is the one from the last turn (crash recovery).

!!! info "SUSPENDED vs INTERRUPTED"
`SUSPENDED` indicates the task was checkpointed before it was stopped and can
resume from the exact point of interruption. `INTERRUPTED` indicates the task
was stopped without a checkpoint and requires full reassignment. Both are
non-terminal: `SUSPENDED -> ASSIGNED`, `INTERRUPTED -> ASSIGNED`.
Comment thread
coderabbitai[bot] marked this conversation as resolved.

```yaml
graceful_shutdown:
strategy: "checkpoint"
grace_seconds: 30
cleanup_seconds: 5
```

---

Expand Down
17 changes: 14 additions & 3 deletions src/synthorg/config/schema.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Root configuration schema and config-level Pydantic models."""

from collections import Counter
from typing import Any, ClassVar, Self
from typing import Any, ClassVar, Literal, Self

from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, model_validator

Expand Down Expand Up @@ -441,16 +441,21 @@ class GracefulShutdownConfig(BaseModel):
"""Configuration for graceful shutdown behaviour.

Attributes:
strategy: Shutdown strategy name (e.g. ``"cooperative_timeout"``).
strategy: Shutdown strategy name (``"cooperative_timeout"``,
``"immediate"``, ``"finish_tool"``, or ``"checkpoint"``).
grace_seconds: Seconds to wait for cooperative agent exit
before force-cancelling.
cleanup_seconds: Seconds allowed for cleanup callbacks
(persist costs, close connections, flush logs).
tool_timeout_seconds: Per-tool timeout for the
``"finish_tool"`` strategy (seconds).
"""

model_config = ConfigDict(frozen=True, allow_inf_nan=False)

strategy: NotBlankStr = Field(
strategy: Literal[
"cooperative_timeout", "immediate", "finish_tool", "checkpoint"
] = Field(
default="cooperative_timeout",
description="Shutdown strategy name",
)
Expand All @@ -466,6 +471,12 @@ class GracefulShutdownConfig(BaseModel):
le=60,
description="Seconds allowed for cleanup callbacks",
)
tool_timeout_seconds: float = Field(
default=60.0,
gt=0,
le=300,
description="Per-tool timeout for finish_tool strategy",
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.


class TaskAssignmentConfig(BaseModel):
Expand Down
Loading
Loading