diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eaeaf9426b..7b93f8a070 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,12 +28,16 @@ ci: # check-no-modify-migration: # rebase / migration state checks meaningful only on the pushing # developer's clone, not in an ephemeral cloud runner. - # ``no-release-please-token``, ``workflow-shell-git-commits``, - # ``no-review-origin-in-code`` and ``no-migration-framing`` are - # deliberately NOT skipped: all four are pure-Python, zero-dependency + # ``no-release-please-token`` and ``workflow-shell-git-commits`` are + # deliberately NOT skipped: both are pure-Python, zero-dependency # scripts with no CI counterpart, and letting pre-commit.ci enforce - # them closes the gap where a PR from a contributor who skipped local - # hooks would otherwise introduce a regression unchecked. + # them at the pre-commit stage closes the gap where a PR from a + # contributor who skipped local hooks would otherwise introduce a + # regression unchecked. ``no-review-origin-in-code`` and + # ``no-migration-framing`` run at ``stages: [pre-push]`` (the + # local-only pre-push hook surface) so pre-commit.ci does not pick + # them up; their CI counterpart is the ``Lint`` job in ``ci.yml`` + # which runs the same scripts on every PR. 
skip: [commitizen, gitleaks, hadolint-docker, caddy-validate, zizmor, no-em-dashes, no-redundant-timeout, mypy, pytest-unit, golangci-lint, go-vet, go-test, eslint-web, check-push-rebased, check-single-migration-per-pr, check-no-modify-migration, forbidden-literals, persistence-boundary, persistence-protocol-uniformity, dependency-inversion, provider-complete-chokepoint, no-new-logger-exception-str-exc, otlp-span-redaction, orphan-fixtures, doc-drift-counts, boundary-typed, setting-to-startup-trace, long-running-loop-kill-switch, list-pagination, domain-error-hierarchy, dead-api-endpoints, dual-backend-test-parity, schema-drift, no-magic-numbers, convention-gate-inventory, mcp-admin-guardrail, runtime-stats-freshness, dto-types-ts-in-sync] default_install_hook_types: [pre-commit, commit-msg, pre-push] @@ -452,7 +456,12 @@ repos: # script / baseline) cannot bypass the check. files: ^(src/synthorg/.*\.py|scripts/check_domain_error_hierarchy\.py|scripts/domain_error_hierarchy_baseline\.txt|\.pre-commit-config\.yaml)$ pass_filenames: false - stages: [pre-commit, pre-push] + # Full-repo AST walk: ~2.7s on every commit becomes noticeable + # on focused-edit cycles where a developer commits multiple + # times in a row. Move to pre-push so each commit stays + # interactive while the gate still fires before code leaves + # the machine. + stages: [pre-push] - id: no-controller-response-for-domain-errors name: no-controller-response-for-domain-errors gate (raise typed errors, never build local Response envelopes) @@ -618,10 +627,11 @@ repos: # the script's internal allowlist would immediately skip. files: ^(src/synthorg/(?!persistence/(?:postgres|sqlite)/revisions/).*\.(py|sql)|tests/.*\.py|scripts/check_no_review_origin_in_code\.py|\.pre-commit-config\.yaml)$ pass_filenames: false - # pre-commit covers pre-commit.ci (catches GitHub-UI edits and - # contributors who skip local hooks); pre-push remains the - # primary developer-machine enforcement point. 
- stages: [pre-commit, pre-push] + # Full-repo scan is ~14s wall-clock; running it on every commit + # made focused-edit cycles painful. pre-push still catches + # everything before code leaves the machine, and pre-commit.ci + # is a separate hook surface (uses its own stage selection). + stages: [pre-push] - id: no-migration-framing name: no migration / origin / phase-N framing in code @@ -635,4 +645,6 @@ repos: # files the script's internal allowlist would immediately skip. files: ^(src/synthorg/(?!persistence/(?:postgres|sqlite)/revisions/).*\.(py|sql)|tests/.*\.py|scripts/check_no_migration_framing\.py|\.pre-commit-config\.yaml)$ pass_filenames: false - stages: [pre-commit, pre-push] + # ~13s wall-clock on the full repo; same cost trade-off as + # no-review-origin-in-code above. Moved to pre-push. + stages: [pre-push] diff --git a/CLAUDE.md b/CLAUDE.md index dd3955a6c3..437ef216d8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,6 +23,7 @@ Web: see `web/CLAUDE.md`. CLI: see `cli/CLAUDE.md` (use `go -C cli`, never `cd c ```bash uv sync # all deps uv sync --group docs # docs toolchain (zensical + D2) +bash scripts/install_cli_tools.sh # one-time per-machine: golangci-lint only (CI installs separately; install d2 via docs/getting_started.md) uv run ruff check src/ tests/ --fix # lint + auto-fix uv run ruff format src/ tests/ # format uv run mypy src/ tests/ # strict type-check diff --git a/cli/cmd/new.go b/cli/cmd/new.go index 918a76bcb2..3ba1b8b024 100644 --- a/cli/cmd/new.go +++ b/cli/cmd/new.go @@ -35,6 +35,15 @@ Available kinds: synthorg new controller ping`, GroupID: "core", Args: cobra.NoArgs, + // Render the help text when the user runs ``synthorg new`` with no + // subcommand, then exit with the usage error code so the parent + // shell can detect that a kind/domain was required. Without the + // explicit ExitUsage, the bare ``synthorg new`` would print help + // and exit 0, indistinguishable from a successful operation. 
+ RunE: func(cmd *cobra.Command, _ []string) error { + _ = cmd.Help() + return NewExitError(ExitUsage, nil) + }, } var newServiceCmd = newKindCmd(scaffold.KindService, "service") diff --git a/cli/cmd/start.go b/cli/cmd/start.go index 4542188ef6..d90ae7c224 100644 --- a/cli/cmd/start.go +++ b/cli/cmd/start.go @@ -83,7 +83,31 @@ func runStart(cmd *cobra.Command, _ []string) error { state, err := config.Load(opts.DataDir) if err != nil { - return fmt.Errorf("loading config: %w", err) + // config.Load(...) returns DefaultState silently when the file + // is absent, so a non-nil error here means the file exists but + // is unreadable, malformed, or fails schema validation. + // Distinguish each shape via typed sentinels so the operator + // knows whether to repair the file or check permissions + // instead of guessing from a generic ``loading config:`` + // wrapper. + switch { + case errors.Is(err, config.ErrParsing): + return fmt.Errorf( + "config file is malformed (invalid JSON); "+ + "edit it manually or remove it and re-run "+ + "'synthorg init': %w", err, + ) + case errors.Is(err, config.ErrReading): + return fmt.Errorf( + "config file is unreadable (check filesystem "+ + "permissions): %w", err, + ) + default: + // Anything else (validation / DataDir canonicalisation) is + // surfaced as-is with a ``config:`` prefix so the operator + // reads the wrapped detail directly. + return fmt.Errorf("config: %w", err) + } } safeDir, err := safeStateDir(state) if err != nil { diff --git a/cli/cmd/update.go b/cli/cmd/update.go index beabf35d19..0525a8d78e 100644 --- a/cli/cmd/update.go +++ b/cli/cmd/update.go @@ -105,9 +105,11 @@ func runUpdate(cmd *cobra.Command, _ []string) error { // CLI update (unless --images-only). 
if !updateImagesOnly { - if err := updateCLI(cmd, state.AutoUpdateCLI); errors.Is(err, errReexec) { + err := updateCLI(cmd, state.AutoUpdateCLI) + if errors.Is(err, errReexec) { return reexecUpdate(cmd) - } else if err != nil { + } + if err != nil { return fmt.Errorf("updating CLI binary: %w", err) } } @@ -235,6 +237,16 @@ func downloadAndApplyCLI(ctx context.Context, out *ui.UI, result selfupdate.Chec return nil } + // Surface a permission error in the install directory before the + // download starts; otherwise the user waits through a multi-MB + // transfer only to fail at the final ``Replace`` step. + if err := selfupdate.ProbeInstallDirWritable(); err != nil { + return fmt.Errorf( + "cannot update CLI in place; re-run as an administrator "+ + "or move the binary to a writable directory: %w", err, + ) + } + out.Step("Downloading...") binary, err := selfupdate.Download(ctx, result.AssetURL, result.ChecksumURL, result.SigstoreBundURL) if err != nil { diff --git a/cli/internal/config/state.go b/cli/internal/config/state.go index 95e6c7a8d7..0d5e2818d7 100644 --- a/cli/internal/config/state.go +++ b/cli/internal/config/state.go @@ -18,6 +18,19 @@ import ( const stateFileName = "config.json" +// Sentinel errors for Load failure modes, classified so callers can +// branch on shape (errors.Is) rather than on error.Error() prefix. The +// shapes are mutually exclusive: at most one wraps any given Load +// error. +var ( + // ErrReading is wrapped when the persisted config file exists but + // cannot be read (filesystem permissions, I/O error, etc.). + ErrReading = errors.New("reading config") + // ErrParsing is wrapped when the config file is present and + // readable but its bytes do not decode as valid JSON. + ErrParsing = errors.New("parsing config") +) + // Fine-tune variant identifiers persisted in State.FineTuningVariant and // used to construct image service names (e.g. "synthorg-fine-tune-gpu"). 
const ( @@ -267,12 +280,12 @@ func Load(dataDir string) (State, error) { defaults.Sandbox = false return defaults, nil } - return State{}, fmt.Errorf("reading config %s: %w", path, err) + return State{}, fmt.Errorf("%w %s: %w", ErrReading, path, err) } // Unmarshal onto defaults so missing fields retain default values. s := DefaultState() if err := json.Unmarshal(data, &s); err != nil { - return State{}, fmt.Errorf("parsing config %s: %w", path, err) + return State{}, fmt.Errorf("%w %s: %w", ErrParsing, path, err) } if err := s.validate(); err != nil { return State{}, fmt.Errorf("config %s: %w", path, err) diff --git a/cli/internal/selfupdate/updater.go b/cli/internal/selfupdate/updater.go index afee843ea9..f8bf6fa5f7 100644 --- a/cli/internal/selfupdate/updater.go +++ b/cli/internal/selfupdate/updater.go @@ -512,6 +512,43 @@ func Replace(binaryData []byte) error { return ReplaceAt(binaryData, execPath) } +// ProbeInstallDirWritable verifies the directory holding the current +// executable is writable BEFORE a download is started. The probe +// creates and removes a short-named tempfile so a permission error +// surfaces in microseconds instead of after the user has already +// waited through a multi-MB download. Returns nil when writable. +func ProbeInstallDirWritable() error { + execPath, err := os.Executable() + if err != nil { + return fmt.Errorf("finding executable path: %w", err) + } + return ProbeInstallDirWritableAt(execPath) +} + +// ProbeInstallDirWritableAt is the testable core of +// ProbeInstallDirWritable: it accepts the executable path explicitly +// so unit tests can target an arbitrary directory. 
+func ProbeInstallDirWritableAt(execPath string) error { + resolved, err := filepath.EvalSymlinks(execPath) + if err != nil { + return fmt.Errorf("resolving symlinks for write probe: %w", err) + } + dir := filepath.Dir(resolved) + f, err := os.CreateTemp(dir, ".synthorg-write-probe.*.tmp") + if err != nil { + return fmt.Errorf("install directory %s is not writable: %w", dir, err) + } + tmpPath := f.Name() + if cerr := f.Close(); cerr != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("closing write-probe file: %w", cerr) + } + if rerr := os.Remove(tmpPath); rerr != nil { + return fmt.Errorf("removing write-probe file %s: %w", tmpPath, rerr) + } + return nil +} + // ReplaceAt swaps the binary at the given path with new content. // This is the testable core of Replace. func ReplaceAt(binaryData []byte, execPath string) error { diff --git a/docs/architecture/decisions.md b/docs/architecture/decisions.md index de983d515c..09e00e0d73 100644 --- a/docs/architecture/decisions.md +++ b/docs/architecture/decisions.md @@ -129,6 +129,13 @@ All significant design and architecture decisions in force today, organized by d **Mitigation plan:** (1) File upstream PR against `nats-io/nats.py` with the one-line `inspect.iscoroutinefunction` fix; upstream PR status is tracked in the project issue queue (search `nats-py` label); the scoped `filterwarnings` entry in `pyproject.toml` remains the active workaround until a fixed upstream release is available. (2) If upstream is unresponsive by **2026-06-10** (60 days from the 2026-04-11 review), maintain a local monkey-patch in `bus/_nats_compat.py`. (3) Monitor `nats-core` for future JetStream support. +**Verification checkpoint (2026-06-10):** on this date run the checklist below and update this section with the outcome (mark each item Done / Not done / Outcome). + +1. Inspect `nats-io/nats.py` open PRs and recent releases on GitHub for the `inspect.iscoroutinefunction` fix. +2. 
If a fixed release is available: bump the `nats-py` pin in `pyproject.toml`, drop the matching `filterwarnings` entry, run `uv run python -m pytest tests/ -m integration -k nats` to confirm warnings are gone, and replace this checkpoint section with the resolution outcome. +3. If no fixed release exists: implement the local monkey-patch in `src/synthorg/communication/bus/_nats_compat.py` (one-line `nats.aio.client.iscoroutinefunction = inspect.iscoroutinefunction`), import it at bus initialisation, and extend this section with the patch landing date. +4. Re-evaluate `nats-core` JetStream support: a maintained alternative removes the entire mitigation requirement. + ## Overarching Pattern Nearly every decision follows the same architecture: a pluggable protocol interface with one initial implementation shipped, and alternative strategies documented for future extension. This is consistent with the project's protocol-driven design philosophy. diff --git a/docs/getting_started.md b/docs/getting_started.md index 95aa690246..34e2ff60a9 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -32,6 +32,26 @@ uv sync `uv sync` creates a virtual environment in `.venv/` and installs all development dependencies (linters, type checker, test runner, pre-commit, etc.). +## Install external CLI tools (one-time per machine) + +Some gates and the docs build rely on external binaries that are not Python packages: `golangci-lint` (Go linter, used by the CLI) and `d2` (architecture diagram renderer). + +Install `golangci-lint` once per machine: + +```bash +bash scripts/install_cli_tools.sh +``` + +The script downloads the pinned `golangci-lint` version that matches CI (`.github/workflows/cli.yml`). Re-run only after bumping the pinned version; subsequent `uv sync` invocations do NOT re-run the script. CI uses its own action-based install step, so this is strictly a local-developer convenience. + +Install `d2` separately (the docs job pins `v0.7.1`). 
The fastest path is the upstream installer: + +```bash +curl -fsSL https://d2lang.com/install.sh | sh -s -- --version v0.7.1 +``` + +On Windows, install via `winget install Terrastruct.d2` or download the release archive from `https://github.com/terrastruct/d2/releases`. Either way, ensure the resulting `d2` binary is on `PATH`; the docs build invokes it directly. + ## Verify Installation Run the smoke tests to confirm everything is working: diff --git a/docs/guides/a2a-federation.md b/docs/guides/a2a-federation.md new file mode 100644 index 0000000000..a835a17344 --- /dev/null +++ b/docs/guides/a2a-federation.md @@ -0,0 +1,112 @@ +--- +title: A2A Federation +description: Register a peer SynthOrg deployment, expose JSON-RPC methods, route tasks across the federation. +--- + +# A2A Federation + +The Agent-to-Agent (A2A) bridge lets one SynthOrg deployment delegate tasks to a peer over JSON-RPC. Each side authenticates with a shared JWT credential and the typed boundary at `synthorg.a2a.rpc_params.parse_rpc_params` validates every inbound `params` block. This guide walks through registering a peer, enabling specific RPC methods, and observing a federation round-trip. + +## Concepts + +- **Peer**: a SynthOrg deployment reachable at an HTTPS URL with a JSON-RPC endpoint mounted at `/a2a`. +- **Method**: a JSON-RPC operation the gateway exposes. The current method set is `message/send`, `tasks/get`, and `tasks/cancel`. +- **Envelope precedence**: the JSON-RPC `method` field on the envelope always wins; a `method` key smuggled inside `params` is rejected at `parse_rpc_params` time. + +## Configuration surface + +Settings live under the `a2a` namespace. Resolve them via `SettingsService` or set them in the company-template YAML. + +| Key | Type | Default | Purpose | +|---|---|---|---| +| `a2a.enabled` | bool | `false` | Master switch for the federation gateway. | +| `a2a.peer_url` | URL | (unset) | Outbound peer endpoint. 
| +| `a2a.peer_jwt_secret` | secret | (unset) | HMAC key for outbound JWT. | +| `a2a.methods_enabled` | list[str] | `[]` | Allowlist of inbound methods. | +| `a2a.timeout_seconds` | float | `30` | Per-request wall-clock budget. | + +## Worked example: two-node round-trip + +The example uses two local processes on ports `8000` (node A) and `8001` (node B); each side has the other registered as its peer. + +### Node B (callee) + +```bash +SYNTHORG_DATA_DIR=/tmp/synthorg-b \ + SYNTHORG_BACKEND_PORT=8001 \ + uv run python -m synthorg.api +``` + +```yaml +# /tmp/synthorg-b/config.yaml +a2a: + enabled: true + methods_enabled: + - tasks/get + peer_jwt_secret: "shared-secret-do-not-commit" +``` + +### Node A (caller) + +```yaml +# /tmp/synthorg-a/config.yaml +a2a: + enabled: true + peer_url: http://localhost:8001/a2a + peer_jwt_secret: "shared-secret-do-not-commit" + methods_enabled: [] +``` + +Call `tasks/get` from node A: + +```python +import httpx +import jwt +import uuid + +token = jwt.encode({"sub": "synthorg-a", "aud": "synthorg-b"}, "shared-secret-do-not-commit", algorithm="HS256") + +payload = { + "jsonrpc": "2.0", + "id": str(uuid.uuid4()), + "method": "tasks/get", + "params": {"task_id": "task-12345"}, +} +resp = httpx.post( + "http://localhost:8001/a2a", + json=payload, + headers={"Authorization": f"Bearer {token}"}, +) +print(resp.json()) +``` + +Expected outcomes: + +- `200` with a `result` block when the task exists. +- `404` (mapped to JSON-RPC error `-32602` with `data.code: "task_not_found"`) when the task is unknown. +- `403` when the bearer JWT does not validate (peer secret mismatch or `aud` claim incorrect). + +## Observability + +Every inbound JSON-RPC call emits these events: + +- `a2a.jsonrpc.received`: at envelope decode; carries `peer`, `method`, `id`. +- `api.boundary.validation_failed`: when `parse_rpc_params` rejects a malformed `params` block. +- `a2a.jsonrpc.dispatched`: at successful method dispatch. 
+- `a2a.jsonrpc.error`: at error path (with `code` and `message`). + +The `a2a.dispatch_latency_seconds` histogram has a `method` label so per-RPC latency is easy to chart. + +## Threat model + extension + +The boundary check is the only validation gate; downstream handlers MUST treat their typed `params` as already-validated. + +To add a new method: + +1. Define an `A2A<Method>Params` Pydantic model under `src/synthorg/a2a/rpc_params.py`. +2. Add it to the `A2ARpcParams` discriminated union. +3. Register the handler in the gateway registry. +4. Add the method name to the per-peer `a2a.methods_enabled` allowlist. +5. Cover the wire shape in `tests/unit/a2a/test_<method>.py`. + +See [docs/reference/typed-boundaries.md](../reference/typed-boundaries.md) for the boundary contract and [docs/design/a2a.md](../design/a2a.md) for the full protocol design. diff --git a/docs/guides/approval-workflow.md b/docs/guides/approval-workflow.md new file mode 100644 index 0000000000..ef1e749a20 --- /dev/null +++ b/docs/guides/approval-workflow.md @@ -0,0 +1,101 @@ +--- +title: Approval Workflow +description: Configure the approval gate, route requests to the right reviewer, observe the audit chain. +--- + +# Approval Workflow + +The approval gate is SynthOrg's human-in-the-loop control surface: certain actions (deploy to production, rotate a secret, kill a runaway agent) pause until an authorised operator approves. The gate lives at `synthorg.engine.approval_gate` and integrates with the audit chain so every decision is signed and chained. + +## Concepts + +- **Escalation**: a structured request that an action requires approval. Carries `approval_id`, `action_type`, `agent_id`, `task_id`, and rationale. +- **Parked context**: the agent state frozen while an escalation is pending (`ParkedContext` in `synthorg.security.timeout.parked_context`). +- **Approval verdict**: `approve` / `reject` / `request_changes` with an actor identity and timestamp. 
+- **Park service**: stores parked contexts, supplies their `id` and `approval_id`. + +## Configuration + +| Key | Type | Default | Purpose | +|---|---|---|---| +| `approval.enabled` | bool | `true` | Master switch. | +| `approval.timeout_seconds` | int | `86400` | Auto-reject pending requests after this window. | +| `approval.reviewer_groups` | list[str] | `[]` | Identity-aware roles allowed to decide. | +| `approval.notification_target` | str | (unset) | Where to push pending-approval alerts. | +| `approval.audit_chain_signing` | bool | `true` | Sign verdicts into the audit chain. | + +## Worked example: a manual approval round-trip + +The agent emits a pre-tool escalation: + +```python +from synthorg.engine.approval_gate import ApprovalGate + +gate: ApprovalGate = app_state.approval_gate +parked = await gate.park_context( + escalation=escalation, + context=task_context, + agent_id="agent-007", + task_id="task-12345", +) +print(parked.id, parked.approval_id) +``` + +The dashboard at `/dashboard/approvals` lists pending requests. Reviewer clicks `Approve`; the API persists a verdict: + +```bash +curl -X POST http://localhost:8000/api/v1/approvals/approval-1/decide \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + --data '{"verdict": "approve", "rationale": "Looks good; canary signal is clean."}' +``` + +The gate unparks the context and resumes the agent loop. The audit chain records: + +1. `api.approval.created` at park time (one row per pending approval). +2. `security.approval.approved` / `security.approval.rejected` at verdict time, with the reviewer identity. +3. `approval.status_transitioned` AFTER persistence write. + +## Operator surface + +The approvals page surfaces pending requests with: + +- Action type, agent, task, requested change. +- Time-since-raised badge (counts toward the auto-reject deadline). +- One-click `Approve` / `Reject` / `Request changes` actions; rejection requires a rationale. 
+- Filters on `action_type`, `agent`, `actor` (last reviewer). + +For terminal automation, the MCP tool `approvals.decide` accepts the same verdict payload. + +## Observability + +- `api.approval.created` (info): one per pending approval row written to the store. +- `security.approval.approved` / `security.approval.rejected` (info): one per verdict; carries `actor_id` and either the rationale or the approval payload. +- `approval.status_transitioned` (info): AFTER persistence write, with `from_status` and `to_status`. +- `api.approval.expired` (warning): emitted on timeout-driven auto-rejection alongside the persistence write. + +The `synthorg_approval_decisions_total` counter has bounded label `outcome` in `VALID_APPROVAL_OUTCOMES` (`approve` / `reject` / `request_changes` / `auto_rejected`). + +## Audit chain + +Every verdict is appended to the audit chain via the typed-boundary `audit_chain` (see [docs/reference/typed-boundaries.md](../reference/typed-boundaries.md)). The chain is hash-linked so a tampered verdict breaks downstream verification. + +Operator verification: + +```bash +curl -s http://localhost:8000/api/v1/audit-chain/verify \ + -H "Authorization: Bearer $TOKEN" | jq +# {"status": "valid", "appends_total": 4271, "depth": 4271} +``` + +A `broken` status surfaces the first divergent index plus the diverging append; either repair via the documented `audit_chain.repair` workflow or restore from a signed backup. + +## Threat model + +The approval gate's reliance on identity-aware reviewers means the surrounding auth surface MUST be tight: + +- JWT validation at the controller (`parse_typed("jwt", ...)`). +- Reviewer-group membership checked AT decide time, not just at session start. +- Audit chain enabled in production (an unsigned audit log silently loses tamper evidence). 
+ +See [docs/design/approval.md](../design/approval.md) for the broader design and [docs/reference/sec-prompt-safety.md](../reference/sec-prompt-safety.md) for the redaction rules around the rationale payload. diff --git a/docs/guides/ceremony-scheduling-tuning.md b/docs/guides/ceremony-scheduling-tuning.md new file mode 100644 index 0000000000..c15f00d8d0 --- /dev/null +++ b/docs/guides/ceremony-scheduling-tuning.md @@ -0,0 +1,105 @@ +--- +title: Ceremony Scheduling Tuning +description: Configure ceremony triggers and budgets, swap scheduling strategies, observe ceremony firings. +--- + +# Ceremony Scheduling Tuning + +Ceremonies are scheduled meetings the platform fires during a sprint: standups, planning, retros, and one-off triggers. The `CeremonyScheduler` (`src/synthorg/engine/workflow/ceremony_scheduler.py`) owns trigger state and delegates the "should this fire now?" decision to the active `CeremonySchedulingStrategy`. This guide walks through tuning trigger thresholds, picking a strategy, and observing a ceremony round-trip. + +## Concepts + +- **Ceremony**: a scheduled meeting with a name, type, agenda, and a trigger. +- **Trigger**: a predicate that fires the ceremony (`task_completion_count`, `sprint_progress`, `budget_threshold`, `event_count`, `milestone`, `external_event`). +- **Strategy**: a plug-in that evaluates triggers and decides auto-transitions (`count_driven`, `event_driven`, `budget_driven`, `milestone_driven`, `throughput_adaptive`, `external_trigger`). + +## Configuration surface + +The `SprintCeremonyConfig` carries the per-ceremony tuning. Operators set it via the company template or runtime config service. + +| Key | Type | Default | Purpose | +|---|---|---|---| +| `ceremonies[].name` | str | (required) | Human-readable identifier. | +| `ceremonies[].trigger.kind` | enum | `count_driven` | Trigger family. | +| `ceremonies[].trigger.threshold` | int / float | (required) | Numeric gate. 
| +| `ceremony_policy.strategy` | enum | `count_driven` | Active strategy. | +| `ceremony_policy.auto_transition` | bool | `false` | Strategy may auto-transition the sprint when budget exhausted / milestone reached. | +| `ceremony_policy.notification_target` | str | (unset) | Operator notification channel on strategy migration. | + +## Worked example: tune retro frequency + +Default config fires the retro after every 20 task completions. Tighten it to fire after every 10 by setting: + +```yaml +sprints: + template: + ceremonies: + - name: retro + type: retrospective + trigger: + kind: count_driven + threshold: 10 + ceremony_policy: + strategy: count_driven + auto_transition: false +``` + +The next sprint's scheduler picks up the override at construction. To migrate a running sprint, call: + +```python +from synthorg.engine.workflow.ceremony_scheduler import CeremonyScheduler + +scheduler: CeremonyScheduler = app_state.ceremony_scheduler +await scheduler.reload_for_active_sprint() # reads the updated config +``` + +## Swap to a different strategy + +Strategies live in `src/synthorg/engine/workflow/ceremony_strategy.py` and are selected via the discriminator in `ceremony_policy.strategy`. Switching mid-sprint emits `workflow.sprint.ceremony_strategy_changed` with `from`, `to`, and the strategy-specific config delta. + +Budget-driven example: fire planning when the sprint has burned 40% of its budget. + +```yaml +ceremony_policy: + strategy: budget_driven +ceremonies: + - name: planning + type: planning + trigger: + kind: budget_threshold + threshold: 0.4 +``` + +## Auto-transition + +When `auto_transition: true`, the strategy may transition the sprint from `ACTIVE` to `RECONCILING` once its terminal condition is met (budget exhausted, milestone reached). The scheduler emits two events around the transition: + +1. `workflow.sprint.auto_transition` BEFORE applying the new status. +2. 
`workflow.sprint.status_transitioned` AFTER the in-memory sprint object reflects the new status. + +Both events carry `sprint_id`, `from_status`, and `to_status` for downstream dashboards. + +## Observability + +Per-ceremony events: + +- `workflow.sprint.ceremony_triggered`: trigger fired, ceremony about to run. +- `workflow.sprint.ceremony_skipped`: strategy evaluated false; ceremony not fired this cycle. +- `workflow.sprint.ceremony_trigger_failed`: dispatching the meeting raised (event swallowed). +- `workflow.sprint.event_counter_incremented`: per-event counters for event-driven strategies. + +Counters and gauges (with bounded labels): + +- `synthorg_ceremony_triggered_total` counter, `ceremony` (registry-bound), `strategy`. +- `synthorg_ceremony_skipped_total` counter, same labels. + +## Diagnostic checklist + +| Symptom | Likely cause | Mitigation | +|---|---|---| +| Retro never fires | `count_driven` threshold too high vs. observed completions | Lower threshold or switch strategy to `event_driven`. | +| Strategy migration loops | Two strategies enabled simultaneously by mistake | Confirm `ceremony_policy.strategy` is exactly one value; remove stray YAML keys. | +| Auto-transition does not fire | `auto_transition: false` or terminal condition unmet | Set true and verify the trigger threshold via the dashboard. | +| Notification missing on strategy change | `notification_target` unset | Set the notification target or check the dispatcher's logs for `SPRINT_CEREMONY_NOTIFICATION_FAILED`. | + +See [docs/design/ceremony-scheduling.md](../design/ceremony-scheduling.md) for the full strategy catalogue and design rationale. 
diff --git a/docs/guides/cost-attribution.md b/docs/guides/cost-attribution.md new file mode 100644 index 0000000000..fbaef0bbd2 --- /dev/null +++ b/docs/guides/cost-attribution.md @@ -0,0 +1,101 @@ +--- +title: Cost Attribution +description: Slice spend by provider, model, agent, and project; query the rollup; route alerts at the right granularity. +--- + +# Cost Attribution + +SynthOrg records every LLM call with a `CostRecord` (`src/synthorg/budget/cost_record.py`) that carries enough dimensions to slice spend four ways: provider, model, agent, and project. This guide walks through reading the rollup, choosing the right query, and wiring alerts at each granularity. + +## Dimensions + +| Dimension | Source | Cardinality | +|---|---|---| +| Provider | Provider driver name | ~10 | +| Model | Provider model identifier | ~50 | +| Agent | `agent_id` from the executing context | Registry-bound (~100s) | +| Project | `project_id` from the task context | Hundreds to thousands | + +All dimensions are bounded label values when surfaced as Prometheus metrics; see [docs/guides/monitoring.md](monitoring.md) for the registry-bound enforcement rule. + +## Querying the rollup + +The cost API lives at `/api/v1/costs`. Three core endpoints: + +- `GET /api/v1/costs/summary` returns the current period totals across dimensions. +- `GET /api/v1/costs/by/{dim}` returns rollups for a single dimension (e.g. `by/agent`). +- `GET /api/v1/costs/records` returns the raw record stream (paginated). + +```bash +# Summary for the current billing month. +curl -s http://localhost:8000/api/v1/costs/summary \ + -H "Authorization: Bearer $TOKEN" | jq + +# Per-agent breakdown. +curl -s "http://localhost:8000/api/v1/costs/by/agent?since=2026-05-01" \ + -H "Authorization: Bearer $TOKEN" | jq + +# Per-project breakdown filtered to one project. 
+curl -s "http://localhost:8000/api/v1/costs/by/project?project_id=proj-acme" \ + -H "Authorization: Bearer $TOKEN" | jq +``` + +## Worked example: route a Slack alert at 80% project budget + +Set the project budget in the company template: + +```yaml +budget: + projects: + proj-acme: + monthly: 250.00 + currency: GBP + alerts: + warning_at: 50 + critical_at: 80 + hard_stop_at: 95 +``` + +Configure the notification dispatcher to route critical alerts to Slack: + +```yaml +notifications: + channels: + slack: + enabled: true + webhook_url: https://hooks.slack.com/services/.../... + routing: + - event: budget.project.warning + channels: [slack] + severity: warning + - event: budget.project.critical + channels: [slack] + severity: critical +``` + +The enforcer fires `BUDGET_PROJECT_BUDGET_EXCEEDED` and the dispatcher routes the notification through the configured channels. On hard-stop (95% in the example), the project's tasks are auto-cancelled and a `notifications.budget_exhausted.send` event lands on the notification feed. + +## Aggregation under concurrency + +`CostTracker.record(...)` is async and lock-guarded; concurrent writes from many agents collapse to a single durable append. The per-currency invariant (`assert_currencies_match`) protects against accidental cross-currency rollups; mixed-currency calls raise at record time rather than silently producing a wrong total. + +## Limitations + +- The summary endpoint reports the **billing period** total. Daily totals come from `/api/v1/costs/by/agent?period=day` with the appropriate filter. +- Per-tool cost is NOT a first-class dimension. Tools are observed via `synthorg_tool_invocations_total`; cost attribution stops at the model + provider level. +- Project assignment relies on `task.project_id` being set; unassigned tasks aggregate under the implicit `unassigned` project bucket. + +## Observability + +- `synthorg_cost_total` (gauge): total accumulated spend. 
+- `synthorg_agent_cost_total` (gauge, `agent_id` registry-bound): per-agent cumulative spend. +- `synthorg_budget_used_percent` (gauge): monthly utilisation. +- `synthorg_budget_daily_used_percent` (gauge): daily utilisation (pro-rated). + +Events emitted on every record: + +- `budget.cost.recorded`: at successful persistence. +- `budget.cost.record_rejected`: at currency mismatch. +- `budget.enforcement.check`: pre-flight budget check (allow / downgrade / deny). + +See [docs/design/cost-control.md](../design/cost-control.md) for the full design. diff --git a/docs/guides/custom-mcp-server-dev.md b/docs/guides/custom-mcp-server-dev.md new file mode 100644 index 0000000000..a5d4c1cd01 --- /dev/null +++ b/docs/guides/custom-mcp-server-dev.md @@ -0,0 +1,141 @@ +--- +title: Custom MCP Server Development +description: Register a new MCP tool, define its typed args model, wire admin guardrails. +--- + +# Custom MCP Server Development + +SynthOrg's MCP surface exposes 200+ tools across 15 domain modules under `src/synthorg/meta/mcp/domains/`. Each tool is a `ToolHandler` with an optional `args_model` that drives the typed boundary. This guide shows how to register a hello-world tool, validate its arguments, and surface it to operators. + +## Anatomy of a tool + +A tool consists of: + +1. A `MCPToolDef` with `name`, `description`, optional `args_model`. +2. A handler coroutine `(app_state, arguments, **kwargs) -> dict`. +3. Registration in the relevant domain module. + +The invoker (`src/synthorg/meta/mcp/invoker.py`) routes args validation through `parse_typed("mcp.tool", ...)` when `args_model` is set; otherwise the handler's `common_args` helpers do field-level validation. Both paths converge on the `ArgumentValidationError` envelope with `domain_code=invalid_argument` on failure. See [docs/reference/typed-boundaries.md](../reference/typed-boundaries.md) for the dual-path contract. 
+ +## Worked example: a `hello.greet` tool + +Define the args model under a new domain file: + +```python +# src/synthorg/meta/mcp/domains/hello/handler.py +from pydantic import BaseModel, ConfigDict, Field + +from synthorg.core.types import NotBlankStr +from synthorg.meta.mcp.registry import MCPToolDef, register_tool + + +class GreetArgs(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + + name: NotBlankStr = Field(description="Subject of the greeting") + times: int = Field(default=1, ge=1, le=5, description="Repeat count") + + +async def greet( + *, + app_state, + arguments: dict, + actor_id: str, + **_: object, +) -> dict: + args = GreetArgs.model_validate(arguments) + greeting = ", ".join([f"Hello {args.name}"] * args.times) + return {"status": "ok", "greeting": greeting} + + +register_tool( + MCPToolDef( + name="hello.greet", + description="Return a greeting; primarily for smoke tests.", + args_model=GreetArgs, + handler=greet, + operation_type="read", + ) +) +``` + +Register the domain module in the MCP boot path (see existing domains under `src/synthorg/meta/mcp/domains/` for the pattern). + +Invoke the tool through the MCP client: + +```python +from synthorg.meta.mcp.invoker import invoke + +result = await invoke( + app_state=app_state, + tool_name="hello.greet", + arguments={"name": "world", "times": 2}, + actor_id="agent-007", +) +print(result.content) # {"status": "ok", "greeting": "Hello world, Hello world"} +``` + +## Admin guardrails + +Tools that mutate global state (delete agents, rotate secrets, etc.) MUST guard against unprivileged callers. Call `require_admin_guardrails(...)` at the start of the handler: + +```python +from synthorg.security.guardrails import require_admin_guardrails + +async def delete_agent(*, app_state, arguments, actor_id, **_): + await require_admin_guardrails( + actor_id=actor_id, + action="agent.delete", + app_state=app_state, + ) + # ... mutation logic ... 
+``` + +The guardrail emits `mcp.admin.denied` on rejection (with `actor_id`, `action`, `reason`) and the invoker returns the `forbidden` envelope. See [docs/reference/mcp-handler-contract.md](../reference/mcp-handler-contract.md) for the full contract. + +## Observability + +Every dispatch emits: + +- `mcp.server.invoke.start`: at boundary entry (after auth). +- `mcp.server.invoke.success`: on a returned dict. +- `mcp.server.invoke.failed`: on validation, exception, or guardrail rejection. + +The `synthorg_mcp_handler_outcomes_total` counter and `synthorg_mcp_handler_duration_seconds` histogram both carry `tool` and `outcome` labels with bounded values from `VALID_MCP_HANDLER_OUTCOMES`. + +## Testing + +Add `tests/unit/mcp/domains/test_hello.py`: + +```python +import pytest + +from synthorg.meta.mcp.invoker import invoke + + +@pytest.mark.unit +async def test_greet_returns_repeated_greeting(app_state) -> None: + result = await invoke( + app_state=app_state, + tool_name="hello.greet", + arguments={"name": "tester", "times": 3}, + actor_id="agent-test", + ) + assert not result.is_error + body = result.json_body() + assert body["greeting"].count("Hello tester") == 3 + + +@pytest.mark.unit +async def test_greet_rejects_extra_keys(app_state) -> None: + result = await invoke( + app_state=app_state, + tool_name="hello.greet", + arguments={"name": "x", "color": "blue"}, + actor_id="agent-test", + ) + assert result.is_error + assert "extra" in result.error_message.lower() +``` + +The `app_state` fixture is shared across MCP tests; see `tests/unit/mcp/conftest.py`. diff --git a/docs/guides/custom-rules-and-meta-loop.md b/docs/guides/custom-rules-and-meta-loop.md new file mode 100644 index 0000000000..2e32bafaef --- /dev/null +++ b/docs/guides/custom-rules-and-meta-loop.md @@ -0,0 +1,118 @@ +--- +title: Custom Rules & Meta-Loop +description: Plug a rule into the meta-loop, observe its evaluation, gate auto-evolution on rule outcomes. 
+--- + +# Custom Rules & Meta-Loop + +The meta-loop (`src/synthorg/meta/loop/`) is SynthOrg's reflective layer: it observes agent behaviour, evaluates rules over the observed evidence, and proposes adaptations through the evolution pipeline. Custom rules let operators encode local invariants (e.g. "no agent retries the same failing tool more than five times in a sprint") without forking the core. + +## Concepts + +- **Rule**: a callable that accepts a `MetaLoopContext` and returns a `RuleVerdict` (pass / warn / fail / skip). +- **Verdict**: carries severity, a structured `details` dict, and an optional `proposal` suggesting an adaptation. +- **Meta-loop step**: one observation -> rule fan-out -> verdict aggregation -> optional evolution proposal. + +## Rule contract + +```python +from synthorg.meta.rules.protocol import Rule, RuleVerdict +from synthorg.meta.loop.context import MetaLoopContext + + +class HighFailRateRule: + name = "high_fail_rate" + + async def evaluate(self, context: MetaLoopContext) -> RuleVerdict: + snapshot = await context.recent_task_outcomes(minutes=60) + fails = sum(1 for s in snapshot if s.outcome == "failed") + total = len(snapshot) + if total == 0: + return RuleVerdict.skip(reason="no_observations") + ratio = fails / total + if ratio > 0.3: + return RuleVerdict.fail( + rule=self.name, + details={"ratio": ratio, "total": total}, + ) + return RuleVerdict.pass_(rule=self.name, details={"ratio": ratio}) +``` + +## Registering the rule + +Rules live in `src/synthorg/meta/rules/` and are registered with the rule registry: + +```python +# src/synthorg/meta/rules/__init__.py +from synthorg.core.registry.strategy import StrategyRegistry +from synthorg.meta.rules.high_fail_rate import HighFailRateRule +from synthorg.meta.rules.protocol import Rule + +RULE_REGISTRY: StrategyRegistry[Rule] = StrategyRegistry( + { + HighFailRateRule.name: HighFailRateRule, + }, + kind="meta_loop_rule", +) +``` + +## Configuration + +Rules opt in per-deployment under 
the `meta_loop.rules` namespace: + +```yaml +meta_loop: + enabled: true + rules: + - name: high_fail_rate + severity: warn + - name: budget_drift + severity: fail + params: + threshold: 0.15 +``` + +A failing verdict with `severity: fail` blocks the meta-loop step from emitting an evolution proposal; the violation surfaces on the operator dashboard at `/dashboard/meta-loop`. + +## Worked example: observe a meta-loop step + +Add a unit test under `tests/unit/meta/rules/test_high_fail_rate.py`: + +```python +import pytest + +from synthorg.meta.rules.high_fail_rate import HighFailRateRule + + +@pytest.mark.unit +async def test_fail_ratio_above_threshold_fails(meta_loop_context_with_outcomes) -> None: + context = meta_loop_context_with_outcomes( + outcomes=["succeeded"] * 4 + ["failed"] * 6, + ) + verdict = await HighFailRateRule().evaluate(context) + assert verdict.severity == "fail" + assert verdict.details["ratio"] == pytest.approx(0.6) + + +@pytest.mark.unit +async def test_no_observations_skipped(meta_loop_context_with_outcomes) -> None: + context = meta_loop_context_with_outcomes(outcomes=[]) + verdict = await HighFailRateRule().evaluate(context) + assert verdict.severity == "skip" +``` + +The `meta_loop_context_with_outcomes` fixture lives in `tests/unit/meta/conftest.py`. + +## Observability + +Every rule evaluation emits: + +- `meta.rule.evaluated`: with `name`, `severity`, `details`. +- `meta.rule.proposal_emitted`: when the verdict carries an adaptation proposal. +- `meta.rule.skipped`: when prerequisites are absent (no observations, missing context). + +The `synthorg_meta_rule_evaluations_total` counter has bounded labels `rule` (registry-bound) and `severity`. + +## Where this fits + +A failing rule does NOT itself mutate the system: it returns a verdict that the meta-loop coordinator aggregates. Adaptation lands through the evolution pipeline (see [docs/design/evolution.md](../design/evolution.md)). 
For the broader meta-loop architecture, see [docs/design/meta-loop.md](../design/meta-loop.md). diff --git a/docs/guides/dynamic-scoring.md b/docs/guides/dynamic-scoring.md new file mode 100644 index 0000000000..d73521ea9f --- /dev/null +++ b/docs/guides/dynamic-scoring.md @@ -0,0 +1,130 @@ +--- +title: Dynamic Scoring +description: Register a custom scoring strategy, surface hyperparameters in settings, observe score drift. +--- + +# Dynamic Scoring + +Scoring drives every "what to do next" decision in SynthOrg: which task to assign, which agent to pick, which strategy to apply. The scoring layer at `synthorg.engine.assignment.scoring` is pluggable: each strategy is a `ScoringStrategy` implementation registered in the strategy registry. This guide shows how to add a custom strategy, expose its hyperparameters, and observe its outputs. + +## Strategy contract + +```python +from synthorg.engine.assignment.scoring.protocol import ( + ScoringStrategy, + ScoringContext, + ScoreResult, +) + + +class FreshnessBoostedScorer: + name = "freshness_boosted" + + def __init__(self, *, boost_factor: float = 0.2) -> None: + self._boost = boost_factor + + async def score(self, context: ScoringContext) -> ScoreResult: + base = await context.base_score() + recency_bonus = self._recency_term(context) + return ScoreResult( + value=base + self._boost * recency_bonus, + details={"base": base, "recency_bonus": recency_bonus}, + ) + + def _recency_term(self, context: ScoringContext) -> float: + elapsed = context.now - context.candidate.last_active + return max(0.0, 1.0 - (elapsed.total_seconds() / 86400)) +``` + +## Registering the strategy + +Add to the registry: + +```python +# src/synthorg/engine/assignment/scoring/__init__.py +from synthorg.core.registry.strategy import StrategyRegistry +from synthorg.engine.assignment.scoring.freshness_boosted import ( + FreshnessBoostedScorer, +) +from synthorg.engine.assignment.scoring.protocol import ScoringStrategy + +SCORING_STRATEGY_REGISTRY: 
StrategyRegistry[ScoringStrategy] = StrategyRegistry( + { + FreshnessBoostedScorer.name: FreshnessBoostedScorer, + }, + kind="scoring_strategy", +) +``` + +## Hyperparameter surface + +Strategies that carry tunable hyperparameters expose them through the settings system so operators can adjust without redeploying: + +```python +# src/synthorg/settings/definitions/scoring.py +from synthorg.settings import SettingDefinition + +SCORING_DEFINITIONS = ( + SettingDefinition( + namespace="scoring", + key="freshness_boost_factor", + default=0.2, + validator=lambda v: 0.0 <= v <= 1.0, + description="Multiplier on the freshness bonus term.", + ), +) +``` + +The factory reads the resolved value at strategy construction time: + +```python +async def build_freshness_boosted( + settings: SettingsService, +) -> FreshnessBoostedScorer: + factor = await settings.get_float( + "scoring", "freshness_boost_factor" + ) + return FreshnessBoostedScorer(boost_factor=factor) +``` + +## Configuration + +Pick the active strategy via the `scoring.strategy` setting: + +```yaml +scoring: + strategy: freshness_boosted + freshness_boost_factor: 0.3 +``` + +The setting is hot-reloadable: a change via `synthorg config set scoring.strategy <name>` swaps the active strategy on the next assignment decision. 
+ +## Worked example: end-to-end test + +```python +# tests/unit/engine/scoring/test_freshness_boosted.py +import pytest + +from synthorg.engine.assignment.scoring.freshness_boosted import ( + FreshnessBoostedScorer, +) + + +@pytest.mark.unit +async def test_recent_candidate_outscores_stale( + scoring_context_factory, +) -> None: + scorer = FreshnessBoostedScorer(boost_factor=0.5) + recent = await scorer.score(scoring_context_factory(hours_idle=0)) + stale = await scorer.score(scoring_context_factory(hours_idle=72)) + assert recent.value > stale.value + assert recent.details["recency_bonus"] > stale.details["recency_bonus"] +``` + +The `scoring_context_factory` fixture lives in `tests/unit/engine/scoring/conftest.py`. + +## Observability + +Every score emission fires `scoring.score.computed` with `strategy`, `score`, and the `details` payload. The dashboard `Scoring` panel charts the rolling p50/p95/p99 score per strategy so operators can detect drift. + +For the broader scoring architecture and the existing strategies (composite, weighted, ranked, multi-objective), see [docs/design/scoring.md](../design/scoring.md) and [docs/reference/scoring-hyperparameters.md](../reference/scoring-hyperparameters.md). diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md index c5b321934b..048ecf9632 100644 --- a/docs/guides/monitoring.md +++ b/docs/guides/monitoring.md @@ -101,6 +101,7 @@ Bounded-label values are enforced at record time in `src/synthorg/observability/ | `synthorg_audit_chain_appends_total` | Counter | `status` | Audit chain append operations (`status` bounded to `signed` / `fallback` / `error`). | `Audit & Security` | | `synthorg_audit_chain_depth` | Gauge | - | Current hash chain length. | `Audit & Security` | | `synthorg_audit_chain_last_append_timestamp_seconds` | Gauge | - | Unix timestamp of the most recent append. 
| `Audit & Security` | +| `synthorg_security_audit_log_fill_ratio` | Gauge | - | Security audit log occupancy as a fraction of `max_entries` (0.0 empty, 1.0 full). Alert at 0.9: raise `max_entries` or archive older entries before the ring buffer wraps and overwrites unread evidence. | `Audit & Security` | ### OTLP export health @@ -222,6 +223,61 @@ rate(synthorg_audit_chain_appends_total{status="error"}[5m]) # Seconds since last append (flat line for > 5m is suspicious) time() - synthorg_audit_chain_last_append_timestamp_seconds + +# Audit log fill ratio (alert when the ring buffer is near capacity). +# At >0.9 the next burst of activity overwrites the oldest entries +# before an operator can read them; raise max_entries or archive. +synthorg_security_audit_log_fill_ratio +``` + +### Audit log fill ratio + +The `synthorg_security_audit_log_fill_ratio` gauge reports the +occupancy of the in-memory security audit log as a fraction of its +configured `max_entries` capacity. The log is a ring buffer: once +full, the oldest entries are overwritten as new audit events land. +A sustained value above 0.9 means the buffer is about to wrap; any +unread evidence beyond that point is permanently lost. 
+ +Recommended alert rule: + +```yaml +- alert: SynthorgSecurityAuditLogNearCapacity + expr: synthorg_security_audit_log_fill_ratio > 0.9 + for: 10m + labels: {severity: warning} + annotations: + summary: "Security audit log is {{ $value | humanizePercentage }} full" + runbook: "increase max_entries, archive entries to long-term storage, or shorten retention" +``` + +Grafana panel definition (drop into the `Audit & Security` row of +`monitoring/grafana/synthorg-overview.json`): + +```json +{ + "title": "Security audit log fill ratio", + "type": "gauge", + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "min": 0, + "max": 1, + "unit": "percentunit", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.75}, + {"color": "red", "value": 0.9} + ] + } + } + }, + "targets": [ + {"expr": "synthorg_security_audit_log_fill_ratio", "refId": "A"} + ] +} ``` ### OTLP export health @@ -258,7 +314,7 @@ The dashboard organises 30+ panels into seven collapsible rows. 
Only `Health & S | `Workflows` | collapsed | Workflow duration p50/p95, workflow execution rate by status, top-N workflow definitions | | `Tools & Providers` | collapsed | Tool invocation rate, tool duration p95 by `tool_name`, provider tokens, provider cost, provider errors by class | | `Cost & Budget` | collapsed | `synthorg_cost_total`, monthly cost, daily used %, top-25 per-agent cost, agent budget used % | -| `Audit & Security` | collapsed | Audit chain append rate, depth, last-append age, security verdicts, agent identity version changes, API error categories | +| `Audit & Security` | collapsed | Audit chain append rate, depth, last-append age, audit-log fill-ratio gauge, security verdicts, agent identity version changes, API error categories | | `Client Health` | collapsed | Client disconnects by transport+reason, API request rate by status class, OTLP export batches, OTLP dropped records, cache hit rate, app info | To install via the Grafana UI: `Dashboards → New → Import → Upload JSON file`. Via the provisioning API: `POST /api/dashboards/db` with `{"dashboard": , "overwrite": true, "inputs": [...]}`. diff --git a/docs/guides/ontology-extension.md b/docs/guides/ontology-extension.md new file mode 100644 index 0000000000..52915bfa6e --- /dev/null +++ b/docs/guides/ontology-extension.md @@ -0,0 +1,110 @@ +--- +title: Ontology Extension +description: Register a new domain term, attach a description, observe how it threads into agent prompts. +--- + +# Ontology Extension + +SynthOrg's ontology lives at `synthorg.ontology`. Each domain term carries a name, a description, and optional examples; the injection profile pipes terms into agent prompts so the LLM has consistent vocabulary. This guide walks through registering a new term and verifying it surfaces in a prompt. + +## Concepts + +- **Domain term**: a noun-phrase concept the platform knows about (e.g. `task_status`, `sprint`, `cost_record`). 
+- **Injection profile**: a named bundle of terms applied to a prompt class via `synthorg.ontology.injection.profile`. +- **Resolution order**: explicit definition > company-template override > built-in default. + +## Registering a new term + +Built-in terms live in `src/synthorg/ontology/terms/`. Add a new file or extend an existing module: + +```python +# src/synthorg/ontology/terms/finance.py +from synthorg.ontology.term import DomainTerm + +cost_centre = DomainTerm( + name="cost_centre", + description=( + "An accounting bucket used to attribute spend to a " + "department or project." + ), + examples=( + "engineering", + "rd-platform", + "client-services", + ), +) +``` + +Register the term with the ontology registry: + +```python +# src/synthorg/ontology/registry.py +from synthorg.ontology.terms.finance import cost_centre + +ONTOLOGY_REGISTRY.register(cost_centre) +``` + +## Worked example: thread the term into a prompt + +Add the term to the relevant injection profile: + +```python +# src/synthorg/ontology/injection/profile.py +FINANCE_PROFILE = InjectionProfile( + name="finance", + terms=("cost_record", "cost_centre", "budget"), +) +``` + +Reference the profile in a prompt class: + +```python +class CostExplainer: + PROFILE = FINANCE_PROFILE + + def render(self, cost: CostRecord) -> str: + ontology_block = self.PROFILE.render_for_prompt() + return f"{ontology_block}\n\nExplain: {cost.model_dump_json()}" +``` + +Verify the term appears in the rendered prompt: + +```python +from synthorg.ontology.registry import ONTOLOGY_REGISTRY +from synthorg.ontology.injection.profile import FINANCE_PROFILE + +rendered = FINANCE_PROFILE.render_for_prompt() +assert "cost_centre" in rendered +assert "An accounting bucket" in rendered +``` + +## Company-template overrides + +A company template (YAML) can shadow built-in terms: + +```yaml +ontology: + terms: + - name: cost_centre + description: | + Cost centres in this company map to budget owners rather + than accounting 
buckets; treat them as the canonical + spend-attribution unit. + examples: + - acme.platform + - acme.research +``` + +The company-template override flows through `synthorg.templates.loader.apply_ontology_overrides` at startup and is the canonical source once applied. Operators changing the override do not need to redeploy; reload the template via `synthorg config reload-template`. + +## Observability + +Term resolution emits `ontology.term.resolved` with `name` and `source` (`built_in` / `template`) at debug level. Missing terms (referenced by a profile but not registered) emit `ontology.term.missing` at warning level. + +## Adding a profile + +1. Define the `InjectionProfile` in `src/synthorg/ontology/injection/profile.py`. +2. Reference the profile from the relevant prompt class via `PROFILE = ...`. +3. Add `tests/unit/ontology/test_<profile>.py` asserting the rendered block contains every expected term. + +See [docs/design/ontology.md](../design/ontology.md) for the broader design and resolution rules. diff --git a/docs/guides/rest-api-examples.md b/docs/guides/rest-api-examples.md new file mode 100644 index 0000000000..7a4fcf5ab9 --- /dev/null +++ b/docs/guides/rest-api-examples.md @@ -0,0 +1,247 @@ +--- +title: REST API Examples +description: Authenticate and call the 10 most common SynthOrg REST endpoints via curl, Python (httpx), and JavaScript (fetch). +--- + +# REST API Examples + +The SynthOrg REST API is mounted at `/api/v1` on the backend service (default port `3001`). Every endpoint requires authentication; the JWT is delivered as an HttpOnly cookie in the `Set-Cookie` header of the `/auth/login` response, so subsequent calls authenticate by carrying the cookie back, not by attaching an `Authorization: Bearer` header. The response envelope is a typed `ApiResponse` or `PaginatedResponse`. This guide shows the 10 most common operations. + +The base URL placeholder `$BASE` defaults to `http://localhost:3001`. Examples assume `jq` is installed for response inspection. 
+ +## Authenticate + +### curl + +```bash +# Login. -c writes the session cookie to a jar; -b on every subsequent +# call reads it back. The response body carries only metadata +# (expires_in, must_change_password); the JWT is in Set-Cookie. +curl -s -c cookies.txt -X POST $BASE/api/v1/auth/login \ + -H "Content-Type: application/json" \ + --data '{"username":"admin","password":"admin"}' | jq +``` + +### Python (httpx) + +```python +import httpx + +# httpx.Client persists cookies on its ``.cookies`` jar between calls. +client = httpx.Client(base_url="http://localhost:3001") +resp = client.post("/api/v1/auth/login", json={"username": "admin", "password": "admin"}) +resp.raise_for_status() +# Token is in client.cookies now; every subsequent client.get/post +# carries it back automatically. +``` + +### JavaScript (fetch) + +```javascript +// credentials: 'include' both sends and accepts cookies. In a browser +// this works against same-origin or CORS-allowed targets; in Node 18+, +// fetch needs undici's cookie jar via dispatchers (see the Node docs). +const resp = await fetch('http://localhost:3001/api/v1/auth/login', { + method: 'POST', + credentials: 'include', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ username: 'admin', password: 'admin' }), +}) +const { data: session } = await resp.json() +console.log('session expires in', session.expires_in, 'seconds') +``` + +## 1. List agents + +```bash +curl -s -b cookies.txt "$BASE/api/v1/agents" | jq +``` + +```python +agents = client.get("/api/v1/agents").json()["data"] +``` + +```javascript +const r = await fetch(`${base}/api/v1/agents`, { credentials: 'include' }) +const { data: agents } = await r.json() +``` + +Returns a paginated envelope; the `meta.next_cursor` field drives the next page. + +## 2. 
Create a task + +```bash +curl -s -b cookies.txt -X POST "$BASE/api/v1/tasks" \ + -H "Content-Type: application/json" \ + --data '{"title":"Build a sample","description":"Smoke test","acceptance_criteria":["Compiles","Runs"]}' +``` + +```python +resp = client.post( + "/api/v1/tasks", + json={ + "title": "Build a sample", + "description": "Smoke test", + "acceptance_criteria": ["Compiles", "Runs"], + }, +) +task = resp.json()["data"] +``` + +```javascript +const r = await fetch(`${base}/api/v1/tasks`, { + method: 'POST', + credentials: 'include', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ title: 'Build a sample', description: 'Smoke test', acceptance_criteria: ['Compiles', 'Runs'] }), +}) +const { data: task } = await r.json() +``` + +## 3. Get a task + +```bash +curl -s -b cookies.txt "$BASE/api/v1/tasks/$TASK_ID" | jq +``` + +```python +task = client.get(f"/api/v1/tasks/{task_id}").json()["data"] +``` + +## 4. List artifacts for a task + +```bash +curl -s -b cookies.txt "$BASE/api/v1/artifacts?task_id=$TASK_ID" | jq +``` + +```python +resp = client.get("/api/v1/artifacts", params={"task_id": task_id}) +artifacts = resp.json()["data"] +``` + +## 5. Submit a client request + +```bash +curl -s -b cookies.txt -X POST "$BASE/api/v1/requests" \ + -H "Content-Type: application/json" \ + --data '{"client_id":"c-1","requirement":{"title":"Ship the thing","description":"Make it work","acceptance_criteria":["Tests pass"]}}' +``` + +```python +resp = client.post( + "/api/v1/requests", + json={ + "client_id": "c-1", + "requirement": { + "title": "Ship the thing", + "description": "Make it work", + "acceptance_criteria": ["Tests pass"], + }, + }, +) +``` + +## 6. Approve a client request + +```bash +curl -s -b cookies.txt -X POST "$BASE/api/v1/requests/$REQUEST_ID/approve" +``` + +The approve endpoint walks the request through the intake engine (when in `SUBMITTED` status) or finalises a previously-scoped request. + +## 7. 
Fetch budget utilisation + +```bash +curl -s -b cookies.txt "$BASE/api/v1/budget/utilization" | jq +``` + +```python +util = client.get("/api/v1/budget/utilization").json()["data"] +print(f"Monthly: {util['monthly_used_percent']:.1f}% Daily: {util['daily_used_percent']:.1f}%") +``` + +## 8. Decide on a pending approval + +```bash +curl -s -b cookies.txt -X POST "$BASE/api/v1/approvals/$APPROVAL_ID/decide" \ + -H "Content-Type: application/json" \ + --data '{"verdict":"approve","rationale":"Canary signal clean."}' +``` + +```python +resp = client.post( + f"/api/v1/approvals/{approval_id}/decide", + json={"verdict": "approve", "rationale": "Canary signal clean."}, +) +``` + +## 9. Invoke an MCP tool + +```bash +curl -s -b cookies.txt -X POST "$BASE/api/v1/mcp/invoke" \ + -H "Content-Type: application/json" \ + --data '{"tool":"hello.greet","arguments":{"name":"world","times":2}}' +``` + +```javascript +const r = await fetch(`${base}/api/v1/mcp/invoke`, { + method: 'POST', + credentials: 'include', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ tool: 'hello.greet', arguments: { name: 'world', times: 2 } }), +}) +const result = await r.json() +``` + +## 10. Subscribe to the live event WebSocket + +```javascript +// The session cookie is sent automatically because the WebSocket +// upgrade runs against the same origin; no Authorization header is +// involved. The cookie is HttpOnly, so document.cookie cannot show it; +// just make sure the session has not expired at upgrade time. +const ws = new WebSocket(`ws://localhost:3001/api/v1/ws`) +ws.onmessage = (e) => { + const evt = JSON.parse(e.data) + console.log('[event]', evt.event_type, evt.payload) +} +ws.onopen = () => { + ws.send(JSON.stringify({ action: 'subscribe', channels: ['tasks', 'approvals'] })) +} +``` + +The first frame the server sends is `{"event_type":"auth_ok"}`; once it arrives, the channels you subscribed to deliver events in real time. 
See [docs/reference/websocket-protocol.md](../reference/websocket-protocol.md) for the full handshake and event-type catalogue. + +## Pagination + +List endpoints return `PaginatedResponse`: + +```json +{ + "data": [...], + "meta": { + "limit": 50, + "next_cursor": "eyJsYXN0X2lkIjoidGFzay0xMjMifQ==", + "has_more": true + } +} +``` + +To fetch the next page, pass `?cursor=<next_cursor>` (the value of `meta.next_cursor`) to the same endpoint. Stop when `has_more` is false. + +## Error envelopes + +Errors follow RFC 9457: + +```json +{ + "type": "synthorg/not-found", + "title": "Task not found", + "status": 404, + "detail": "Task 'task-X' not found", + "code": "RESOURCE_NOT_FOUND", + "category": "client_error" +} +``` + +The `code` field is the typed `ErrorCode` enum (see [docs/reference/errors.md](../reference/errors.md)). Clients can switch on the enum without parsing prose. diff --git a/docs/guides/webhook-management.md b/docs/guides/webhook-management.md new file mode 100644 index 0000000000..20c4d357ce --- /dev/null +++ b/docs/guides/webhook-management.md @@ -0,0 +1,89 @@ +--- +title: Webhook Management +description: Register inbound webhook receivers, choose the right envelope shape, observe retries and idempotency. +--- + +# Webhook Management + +SynthOrg accepts inbound webhooks from external providers (GitHub, Stripe, Linear, etc.) at `/webhooks/{connection_type}/{connection_name}`. Each receiver registers a connection record (transport, signing secret, replay window) and is handled by an integration-specific dispatcher that validates the typed envelope and routes the payload. + +## Envelope contract + +Wire shape: any JSON object. Inbound bodies route through `parse_typed("webhook.payload", body, WebhookEventPayload)` which enforces: + +- Object root (arrays, scalars, non-JSON bodies are rejected with HTTP 400). +- Arbitrary keys via `ConfigDict(extra="allow")` so provider-specific schemas flow through unchanged. 
+ +Details: [docs/reference/typed-boundaries.md](../reference/typed-boundaries.md) (webhook payload envelope section). + +## Configuration surface + +| Key | Type | Default | Purpose | +|---|---|---|---| +| `integrations.webhooks.enabled` | bool | `false` | Master switch. | +| `integrations.webhooks.replay_window_seconds` | int | `300` | Reject nonces older than this. | +| `integrations.webhooks.max_payload_bytes` | int | `1048576` | Bound inbound body size. | +| `integrations.webhooks.idempotency_ttl_seconds` | int | `86400` | Idempotency-key cache lifetime. | + +Connection records are stored per integration via `WebhookConnectionRepository`. Each connection carries `signing_secret`, `nonce_header`, `signature_header`, and a per-integration retry policy. + +## Worked example: register and POST + +Register a receiver for a `github` connection: + +```python +from synthorg.integrations.webhooks.activity_service import WebhookActivityService + +service = WebhookActivityService(...) +await service.register( + connection_type="github", + connection_name="primary", + signing_secret="whsec_PROVIDED_BY_GITHUB", + nonce_header="X-GitHub-Delivery", + signature_header="X-Hub-Signature-256", +) +``` + +POST a sample payload from the command line: + +```bash +NONCE=$(uuidgen) +TS=$(date +%s) +BODY='{"action":"opened","number":7,"pull_request":{"id":42}}' +SIG=$(printf '%s' "$BODY" | openssl dgst -sha256 -hmac whsec_PROVIDED_BY_GITHUB | sed 's/^.* //') + +curl -i http://localhost:8000/webhooks/github/primary \ + -H "Content-Type: application/json" \ + -H "X-GitHub-Delivery: $NONCE" \ + -H "X-Hub-Signature-256: sha256=$SIG" \ + --data "$BODY" +``` + +Expected: + +- `204 No Content` on the first delivery (handler accepted). +- `204` again on a retry within the replay window (idempotency-key short-circuit). +- `400` on a malformed JSON body (envelope rejection at `parse_typed` time). +- `401` on a signature mismatch. +- `409` on a replay older than the replay window. 
+ +## Retry semantics + +Providers retry on non-2xx responses; SynthOrg accepts duplicate deliveries up to `idempotency_ttl_seconds`. The composed idempotency key is `connection_name:event_type:nonce`, length-clamped to `255` chars (DB schema cap), then folded to a SHA-256 digest if oversized so the cache lookup never fails on length. + +Per-delivery state transitions land on the `WebhookReceipt`: + +- `received` -> `dispatched` (handler returned) -> `acknowledged` (downstream completed). +- `received` -> `duplicate` when the idempotency key has been observed. +- `received` -> `rejected` on signature/nonce failure. + +The `WEBHOOK_RECEIPT_STATUS_TRANSITIONED` event fires AFTER each persistence write so dashboards can chart delivery health. + +## Adding a new provider + +1. Add a row to `WebhookConnectionRepository` schema (already covers the common columns). +2. Implement a handler in `src/synthorg/integrations/webhooks/handlers/<provider>.py` that consumes a typed `WebhookEventPayload` and dispatches to the right service. +3. Register the handler in the dispatcher's strategy registry. +4. Add a per-provider test under `tests/unit/integrations/webhooks/` covering accept, replay, signature mismatch, and oversized payload. + +See [docs/design/integrations.md](../design/integrations.md) for the broader integrations architecture. diff --git a/docs/guides/workers-and-background-tasks.md b/docs/guides/workers-and-background-tasks.md new file mode 100644 index 0000000000..a88ad83823 --- /dev/null +++ b/docs/guides/workers-and-background-tasks.md @@ -0,0 +1,134 @@ +--- +title: Workers and Background Tasks +description: Configure the JetStream task queue, scale the worker pool, observe dispatch and retry. +--- + +# Workers and Background Tasks + +SynthOrg's distributed task queue runs over NATS JetStream.
The dispatcher (`synthorg.workers.dispatcher`) enqueues `TaskClaim` envelopes; workers (`synthorg.workers.worker`) pull from a shared durable consumer and execute the task via an injected executor. This guide walks through configuration, running a worker pool against a local NATS, and observing the dispatch path. + +## Concepts + +- **Task claim**: a small JSON envelope (`TaskClaim`) carrying `task_id`, `project_id`, `previous_status`, `new_status`, and an `idempotency_key` for redelivery dedup. +- **JetStream stream**: `SYNTHORG_TASKS` with `WorkQueuePolicy` and exclusive subjects (`tasks.ready.>` and `tasks.dead.>`). +- **Durable consumer**: shared `synthorg_workers` consumer; every worker pulls from the same name so JetStream handles load distribution. +- **Executor**: a pluggable async callable that consumes a claim and returns `TaskClaimStatus.SUCCESS` / `FAILED` / `RETRY`. + +## Configuration + +`QueueConfig` carries the queue settings: + +| Key | Type | Default | Purpose | +|---|---|---|---| +| `workers.enabled` | bool | `false` | Master switch. | +| `workers.count` | int | `1` | Concurrency per process. | +| `workers.ack_wait_seconds` | int | `30` | JetStream ack timeout before redelivery. | +| `workers.max_deliver` | int | `5` | Max delivery attempts before dead-letter. | +| `workers.stream_name` | str | `SYNTHORG_TASKS` | Stream identifier. | +| `workers.ready_subject_prefix` | str | `tasks.ready` | Subject prefix for ready claims. | +| `workers.dead_subject_prefix` | str | `tasks.dead` | Dead-letter subject prefix. | + +NATS-side settings live under `communication.nats_*` (URL, credentials, reconnect timing). + +## Worker authentication + +The worker pool's HTTP executor authenticates back to the backend with a per-deployment bearer token. The token source is the `SYNTHORG_WORKER_AUTH_TOKEN` environment variable, read once at worker construction time (see `src/synthorg/workers/__main__.py`). 
+ +Operational guidance: + +- Treat the token like any other long-lived credential: store it in your secrets manager (Vault, AWS Secrets Manager, etc.) and inject it into the worker container via `env` rather than baking it into the image. +- Rotate the token on a schedule that matches the rest of your service-account hygiene (typically every 90 days). Rotation is a rolling restart of the worker pool with the new value; nothing else changes. +- Always set `SYNTHORG_API_BASE_URL` to an HTTPS endpoint in production. Sending the bearer token over plain HTTP discloses it on the wire. +- The token is NEVER logged. Worker observability events redact the `Authorization` header; if you see a token in a worker log it is a regression. File an issue. +- Workers refuse to start with an empty token (`SYNTHORG_WORKER_AUTH_TOKEN=""`); failing fast at boot is intentional so an unauthenticated executor never reaches a real backend. + +## Worked example: run the worker pool + +Start a local NATS: + +```bash +docker run -d --name synthorg-nats -p 4222:4222 nats:2.10-alpine -js +``` + +Run the worker module: + +```bash +SYNTHORG_NATS_URL=nats://localhost:4222 \ + uv run python -m synthorg.workers --workers 2 +``` + +The worker: + +1. Calls `JetStreamTaskQueue.start()` which creates the stream and durable consumer. +2. Polls the consumer for claims with a per-fetch timeout. +3. Runs the configured executor on each claim. +4. ACKs on `SUCCESS`/`FAILED`; NAKs with backoff on `RETRY`. 
+ +Dispatch a claim from the API side: + +```python +from synthorg.workers.claim import TaskClaim +from synthorg.workers.dispatcher import JetStreamDispatcher + +dispatcher: JetStreamDispatcher = app_state.task_dispatcher +await dispatcher.dispatch( + TaskClaim(task_id="task-A", new_status="ready"), +) +``` + +The worker logs: + +```text +workers.worker.claim_received task_id=task-A +workers.worker.executor.invoked task_id=task-A +workers.worker.executor.completed task_id=task-A outcome=success +``` + +## Retry and dead-letter + +JetStream redelivers a claim on ack-wait timeout (worker crash, slow execution) or `RETRY` outcome. After `max_deliver` attempts the claim moves to the dead-letter subject (`tasks.dead.{task_id}`). Operators monitor the dead-letter subject via `synthorg status --check workers` or directly with `nats consumer report SYNTHORG_TASKS synthorg_workers`. + +Idempotency: every claim carries a UUID `idempotency_key`. Workers `mark_seen` the key via `SeenClaimsRepository`; a duplicate redelivery short-circuits to `ack-and-skip` without re-running the executor. + +## Observability + +Per-claim events: + +- `workers.worker.claim_received` (info): claim pulled from JetStream. +- `workers.worker.duplicate_claim_suppressed` (info): redelivery deduped by idempotency key. +- `workers.worker.executor_failed` (warning): executor raised. +- `workers.worker.finalize_failed` (warning): ACK/NAK failed. +- `workers.task_queue.connect_failed` (error): NATS unreachable at start. +- `workers.task_queue.claim_parse_failed` (warning): malformed claim (terminal ACK, never redelivered). + +Metrics: + +- `synthorg_worker_pool_size` (gauge): configured worker count. +- `synthorg_worker_invocations_total` (counter, `outcome`): per-claim outcomes. +- `synthorg_worker_invocation_duration_seconds` (histogram): per-claim wall time. + +Use `synthorg status --check workers` for a quick view; the Grafana `Tasks` dashboard row charts the same metrics over time. 
+ +## Pluggable executor + +The default executor in `src/synthorg/workers/__main__.py` is a thin wrapper that fetches the task and dispatches to the agent runtime. Operators can supply a custom executor by replacing `run_worker_pool`'s `executor` argument; the contract is: + +```python +async def my_executor(claim: TaskClaim) -> TaskClaimStatus: + # ... custom dispatch logic ... + return TaskClaimStatus.SUCCESS +``` + +The executor is responsible for the HTTP-callback that transitions the task status; see the existing `synthorg.communication.async_tasks.callbacks` module for the expected shape. + +## Diagnostic checklist + +| Symptom | Likely cause | Mitigation | +|---|---|---| +| Workers never start | NATS unreachable | Check `synthorg.workers.task_queue.connect_failed`; verify `SYNTHORG_NATS_URL`. | +| Tasks never run | Dispatcher disabled | Verify `workers.enabled: true`. | +| Duplicate execution | Idempotency disabled | Verify `SeenClaimsRepository` is wired and durable. | +| Claims hit dead-letter | Executor systematically failing | Inspect dead-letter subject for repeating `task_id`; consult task logs. | +| Slow drain on shutdown | `_drain_partial` waiting on NATS | Check `WORKERS_TASK_QUEUE_DRAIN_FAILED`; force-kill after the timeout. | + +See [docs/design/distributed-runtime.md](../design/distributed-runtime.md) for the full design. diff --git a/docs/reference/typed-boundaries.md b/docs/reference/typed-boundaries.md index f2d740c17b..eac8723909 100644 --- a/docs/reference/typed-boundaries.md +++ b/docs/reference/typed-boundaries.md @@ -1,8 +1,15 @@ # Typed Boundaries -The six security-sensitive API entry points listed below validate -inbound payloads through a single helper, -`synthorg.api.boundary.parse_typed`. The helper replaces the legacy +The security-sensitive API entry points listed below split into two +groups. 
The **parse_typed-enforced** boundaries (`jwt`, +`settings.security`, `ws.control`, `audit_chain`, `a2a.jsonrpc`, +`mcp.tool`: the original six) validate inbound payloads through a +single helper, `synthorg.api.boundary.parse_typed`. The +**informational/lenient** entries (`provider.tool_call`, +`webhook.payload`, `mcp.tool.dual_path`) are documented in the same +table for discoverability but are NOT fully gated by `parse_typed` (`provider.tool_call` bypasses it, `webhook.payload` uses it for envelope shape only, and `mcp.tool.dual_path` applies it only when the tool declares an `args_model`); the +table's `Model` column marks them explicitly (`no Pydantic`, +`extra="allow"`, dual-path helpers). The helper replaces the legacy `dict[str, Any]` contract that let a typo or rename slip silently through dict access at the auth, agent tool plane, audit trail, RPC, and settings surfaces. @@ -54,6 +61,9 @@ Behaviour: | `audit_chain` | `src/synthorg/observability/audit_chain/sink.py` | `emit` | `synthorg.observability.audit_chain.payloads.AuditChainEventPayload` | | `a2a.jsonrpc` | `src/synthorg/a2a/rpc_params.py` | `parse_rpc_params` | `synthorg.a2a.rpc_params.A2ARpcParams` (TypeAdapter) | | `mcp.tool` | `src/synthorg/meta/mcp/invoker.py` | `invoke` | Per-tool `MCPToolDef.args_model` | +| `provider.tool_call` | `src/synthorg/providers/drivers/mappers.py` | `extract_tool_calls` | (no Pydantic; lenient dict/object extraction) | +| `webhook.payload` | `src/synthorg/api/controllers/_webhooks_wiring.py`| `parse_payload` | `WebhookEventPayload` (extra="allow") | +| `mcp.tool.dual_path` | `src/synthorg/meta/mcp/invoker.py` | `invoke` | Per-tool `args_model` OR `common_args` handler helpers | ## Per-boundary notes @@ -148,6 +158,120 @@ on the wire. Tools without an `args_model` fall through to the deepcopy path and continue to validate via `common_args` helpers in the handler; this gate fires whenever a tool opts into typed args.
+### Provider tool-call extraction (`provider.tool_call`) + +LiteLLM provider drivers return tool-call payloads in heterogeneous +shapes: some completions parse to plain dicts, others surface objects +with attribute access (`item.function.arguments`). The provider layer +has no control over the upstream payload shape, so this boundary is +deliberately **lenient**: it does NOT run `parse_typed`. Instead, +`extract_tool_calls` (`src/synthorg/providers/drivers/mappers.py:131`) +walks the raw list and rescues whatever it can. + +- Wire shape: `list[dict] | list[object]` (or `None` for completions + that emitted no tool calls). +- Field access: `_get(item, "id"/"function", ...)` uses + `dict.get` for mappings and `getattr` for objects; the helper + centralises the lenient access so each call site does not branch. +- Failure modes: + - Missing `function` block: skip the entry; emit + `provider.tool_call.missing_function` warning with + `item_type=type(item).__name__`. + - Empty `id` or `name`: skip the entry; emit + `provider.tool_call.incomplete` warning carrying whatever fields + were recoverable. + - Malformed `arguments` JSON: fall back to `{}` so the handler can + apply its own validation (the alternative, rejecting the entire + completion, would discard one good tool call because a sibling + arrived malformed). +- Why lenient: provider variability dominates. A strict + `parse_typed` here would surface a hard 5xx on every novel + upstream shape, blocking the whole completion path. The warning + logs preserve observability without coupling the wire contract to + any single provider. + +Each skipped entry is logged so a provider regression surfaces in +the event stream rather than disappearing silently. The handler that +consumes the returned `tuple[ToolCall, ...]` re-validates field +shape via the typed Pydantic `ToolCall` model. 
+ +### Webhook payload envelope (`webhook.payload`) + +External webhook providers send arbitrary JSON keys (each integration +has its own schema). The boundary uses a Pydantic model with +`ConfigDict(extra="allow")` to enforce envelope shape only: + +```python +class WebhookEventPayload(BaseModel): + model_config = ConfigDict(frozen=True, extra="allow") +``` + +(`src/synthorg/api/controllers/_webhooks_wiring.py:39`). + +- Wire shape: any JSON object. Arrays, scalars, and non-JSON bodies + are rejected at `parse_typed("webhook.payload", ...)` time and + surface as HTTP 400. +- Provider keys flow through unchanged via `extra="allow"`. The + controller routes the typed envelope to the integration-specific + handler, which validates the inner payload against its own schema. +- Why `extra="allow"` and NOT `extra="forbid"`: flipping the config + would break every integration the moment a provider added a new + optional field. `frozen=True` still prevents mutation; the only + relaxation is on unknown-key rejection. + +The envelope-only validation closes the silent `{"raw": ...}` +fallback the controller carried before typed boundaries existed: a +non-object payload now fails fast instead of routing as a +single-key dict. + +### MCP tool-execution dual paths (`mcp.tool.dual_path`) + +The MCP invoker (`src/synthorg/meta/mcp/invoker.py:149`) routes tool +arguments through one of two validation paths depending on whether +the tool declares an `args_model`: + +```python +if tool_def.args_model is not None: + # Typed-args path. + validated = parse_typed("mcp.tool", arguments, tool_def.args_model) + handler_arguments = deepcopy(validated.model_dump(mode="python")) +else: + # Per-field path. + handler_arguments = deepcopy(arguments) +``` + +- **Typed-args path** (`args_model` declared): the raw dict goes + through `parse_typed`, which emits + `api.boundary.validation_failed` on rejection and re-raises. 
The + invoker catches the `PydanticValidationError`, records + `record_mcp_handler_outcome(outcome="validation_error", ...)`, + and returns an `ArgumentValidationError` envelope with + `domain_code=invalid_argument` to the client. The validated + `model_dump(mode="python")` is deep-copied so handlers receive a + fresh mutable dict. +- **Per-field path** (no `args_model`): the raw dict is deep-copied + unchanged and handed to the handler. The handler validates each + field through `common_args` helpers (`require_arg`, + `require_non_blank`, `require_dict`, etc. in + `src/synthorg/meta/mcp/handlers/common_args.py`), which raise + `ArgumentValidationError` directly on missing or malformed + inputs. + +Both paths converge on the same wire envelope +(`domain_code=invalid_argument` on validation failure) and the same +observability surface (`MCP_SERVER_INVOKE_FAILED` warning, +`record_mcp_handler_outcome` with the `validation_error` outcome). +The typed-args path provides typed validation at construction time; +the per-field path provides field-level validation at call time. +Validation surface area is equivalent; opting a tool into +`args_model` is a code-quality refactor, not a security upgrade. + +Pre-mapping shape check: the typed-args path rejects non-mapping +payloads (`isinstance(raw_arguments, dict)`) before invoking +`parse_typed`. A JSON array would otherwise survive `dict(...)` +coercion and reach Pydantic, broadening the contract beyond what +MCP expects. + ## Lint guard (Phase 3) `scripts/check_boundary_typed.py` walks the six registered functions diff --git a/docs/reference/yaml-schema.md b/docs/reference/yaml-schema.md new file mode 100644 index 0000000000..85c7139102 --- /dev/null +++ b/docs/reference/yaml-schema.md @@ -0,0 +1,209 @@ +--- +title: YAML Schema Reference +description: Field-by-field reference for the SynthOrg company-template YAML format. 
+--- + +# YAML Schema Reference + +The company template YAML describes a synthetic organisation: its agents, departments, budget, integrations, sprints, and operational policies. The schema is enforced by Pydantic models under `src/synthorg/config/schema.py`; this document captures the field set as a fixed-state reference. For the broader configuration precedence story (DB > env > code default) see [docs/reference/configuration-precedence.md](configuration-precedence.md). + +## Top-level shape + +```yaml +version: 1 +company: + name: Acme Robotics + description: ... +agents: + - id: ... +departments: + - name: ... +sprints: + template: ... +budget: + ... +integrations: + ... +notifications: + ... +security: + ... +meta_loop: + ... +ontology: + ... +scoring: + ... +``` + +Every top-level key is optional except `version` and `company`. Missing sections fall back to the Pydantic-defined defaults. + +## `version` + +| Field | Type | Default | Description | +|---|---|---|---| +| `version` | int | (required) | Schema version. Currently `1`. | + +A future incompatible change bumps the integer; the loader rejects unknown versions at startup with a typed error. + +## `company` + +| Field | Type | Default | Description | +|---|---|---|---| +| `company.name` | str | (required) | Display name. | +| `company.description` | str | `""` | Free-form description. | +| `company.timezone` | str | `"UTC"` | IANA timezone identifier; used by ceremony scheduling. | +| `company.locale` | str | `"en-GB"` | BCP 47 locale; drives number/date rendering on the dashboard. | + +The locale defaults to `en-GB` per the regional-defaults rule; see [docs/reference/regional-defaults.md](regional-defaults.md). + +## `agents` + +A list of agent definitions; each must declare at least `id`, `role`, and `provider`. + +| Field | Type | Default | Description | +|---|---|---|---| +| `id` | str | (required) | Stable agent identifier (kebab-case). 
| +| `role` | str | (required) | Role name; matches a department's `roles` list. | +| `provider` | str | (required) | Provider driver identifier. | +| `model` | str | provider default | Model identifier (e.g. `example-medium-001`). | +| `system_prompt` | str | derived | Override the role-default system prompt. | +| `tools` | list[str] | `[]` | Tool names this agent may invoke. | +| `trust_level` | enum | `standard` | One of `restricted`, `standard`, `elevated`. | +| `cost_centre` | str | `unassigned` | Cost-attribution bucket. | + +## `departments` + +| Field | Type | Default | Description | +|---|---|---|---| +| `name` | str | (required) | Department name. | +| `roles` | list[str] | (required) | Roles available in this department. | +| `head` | str | (optional) | `id` of the head agent. | + +## `sprints` + +| Field | Type | Default | Description | +|---|---|---|---| +| `sprints.template.duration_days` | int | `14` | Default sprint length. | +| `sprints.template.start_day` | str | `monday` | Day of week to start. | +| `sprints.template.ceremonies` | list | `[]` | Ceremony definitions; see [ceremony-scheduling-tuning](../guides/ceremony-scheduling-tuning.md). | +| `sprints.template.ceremony_policy.strategy` | enum | `count_driven` | Scheduling strategy. | +| `sprints.template.ceremony_policy.auto_transition` | bool | `false` | Strategy may auto-transition the sprint. | + +## `budget` + +| Field | Type | Default | Description | +|---|---|---|---| +| `budget.total_monthly` | float | `0` | Monthly cap in `currency`. | +| `budget.currency` | str | `GBP` | ISO 4217 code. | +| `budget.reset_day` | int (1..28) | `1` | Day-of-month for the monthly reset. | +| `budget.alerts.warning_at` | int (0..100) | `50` | Warning threshold percentage. | +| `budget.alerts.critical_at` | int (0..100) | `80` | Critical threshold percentage. | +| `budget.alerts.hard_stop_at` | int (0..100) | `95` | Hard-stop threshold percentage. 
| +| `budget.daily_limit` | float | unset | Optional daily cap. | +| `budget.projects.<project>.monthly` | float | unset | Per-project monthly cap. | +| `budget.risk_budget.enabled` | bool | `false` | Enable risk-weighted budget enforcement. | + +See [docs/guides/budget.md](../guides/budget.md) for the broader operations guide. + +## `integrations` + +Each integration is a typed sub-block keyed by name: + +```yaml +integrations: + github: + enabled: true + token: ${GITHUB_TOKEN} + slack: + enabled: true + webhook_url: ${SLACK_WEBHOOK} + webhooks: + enabled: true + replay_window_seconds: 300 +``` + +| Field | Type | Default | Description | +|---|---|---|---| +| `integrations.<name>.enabled` | bool | `false` | Activate the integration. | +| `integrations.<name>.token` / `webhook_url` / `api_key` | str | (provider) | Secret material; `${ENV_VAR}` substitution supported. | +| `integrations.webhooks.replay_window_seconds` | int | `300` | Webhook replay protection. | + +## `notifications` + +```yaml +notifications: + channels: + slack: + enabled: true + webhook_url: ${SLACK_WEBHOOK} + email: + enabled: false + routing: + - event: budget.project.critical + channels: [slack] + severity: critical +``` + +| Field | Type | Default | Description | +|---|---|---|---| +| `notifications.channels.<name>.enabled` | bool | `false` | Activate the channel. | +| `notifications.routing[].event` | str | (required) | Event-name prefix to match. | +| `notifications.routing[].channels` | list[str] | (required) | Channel names to route to. | +| `notifications.routing[].severity` | enum | `warning` | One of `info`, `warning`, `critical`. | + +## `security` + +| Field | Type | Default | Description | +|---|---|---|---| +| `security.audit_chain.enabled` | bool | `true` | Sign every audit event into the hash chain. | +| `security.audit_chain.max_entries` | int | `100000` | Ring-buffer capacity; alert at 90%. | +| `security.approval.timeout_seconds` | int | `86400` | Auto-reject pending approvals after this window.
| +| `security.approval.reviewer_groups` | list[str] | `[]` | Roles allowed to decide. | +| `security.prompt_safety.enabled` | bool | `true` | Wrap untrusted content via `wrap_untrusted`. | + +## `meta_loop` + +| Field | Type | Default | Description | +|---|---|---|---| +| `meta_loop.enabled` | bool | `false` | Activate the meta-loop. | +| `meta_loop.rules` | list | `[]` | Rule activations (see [custom-rules-and-meta-loop](../guides/custom-rules-and-meta-loop.md)). | +| `meta_loop.rules[].name` | str | (required) | Rule identifier. | +| `meta_loop.rules[].severity` | enum | `warn` | One of `info`, `warn`, `fail`. | + +## `ontology` + +```yaml +ontology: + terms: + - name: cost_centre + description: ... + examples: [...] +``` + +See [docs/guides/ontology-extension.md](../guides/ontology-extension.md) for the term-extension workflow. + +## `scoring` + +| Field | Type | Default | Description | +|---|---|---|---| +| `scoring.strategy` | str | `composite` | Active scoring strategy. | +| `scoring.<parameter>` | float / int | per strategy | Strategy-specific tuning (e.g. `freshness_boost_factor`). | + +## Validation + +The loader applies the Pydantic schema, then runs validation hooks: + +- Currency consistency (`budget.currency` must match per-project currencies). +- Cross-section references (`agents[].role` must appear in some `departments[].roles`). +- Allowlist enforcement (channels in `notifications.routing[].channels` must exist). + +Failures surface at startup with a typed `ConfigValidationError` and a line/column pointer into the YAML. + +## Reload semantics + +Most fields are hot-reloadable via `synthorg config reload-template`. Exceptions: + +- `version` change requires a process restart. +- New agents require a worker pool restart (the runtime caches per-agent prompts). +- `budget.currency` change requires draining the cost tracker first; the runtime refuses the reload until the tracker is empty.
diff --git a/scripts/check_no_migration_framing.py b/scripts/check_no_migration_framing.py index ad3840fba3..f41c231733 100644 --- a/scripts/check_no_migration_framing.py +++ b/scripts/check_no_migration_framing.py @@ -11,6 +11,24 @@ ``we used to`` -- forensic prose that names a past state of the code. ``moved`` alone is fine; only the ``moved here in`` shape signals migration narrative ("once upon a time..."). +* ``previously `` where ```` is one of the migration + shapes (``lived``, ``inlined``, ``extracted``, ``duplicated``, + ``scattered``, ``routed``, ``emitted``, ``wrapped``, ``owned``). + Bare ``previously`` is left alone so legitimate runtime prose + ("previously stored ciphertext", "previously compacted + conversation") is not flagged. +* ``were previously inlined`` / ``was previously inlined`` and the + same shape with ``duplicated`` / ``extracted`` / ``scattered`` / + ``owned`` / ``emitted`` / ``wrapped`` -- copular variants of the + ``previously `` rule above. +* ``used to be`` -- variant of ``we used to`` that names a past + shape of the code as a fact ("used to be scattered across N + handlers"). +* ``originally `` (and ``originally-``) where ```` + is one of ``generated``, ``promised``, ``claimed``, ``owned``, + ``wrapped``, ``emitted``, ``inlined``, ``extracted``, ``routed``, + ``handled`` -- "as originally promised" framing that pins prose to + a long-gone earlier version of the code. * ``Phase \\d+`` and ``phase \\d+`` -- ordinal pipeline numbering couples to a specific shape; semantic names (``decompose``, ``route``, ``dispatch``) survive insertions / reorders. @@ -68,6 +86,51 @@ "Round-N fix/review", re.compile(r"\bround-\d+\s+(?:fix|review|fix:|review:)", re.IGNORECASE), ), + # Code-migration "previously " shapes. The verb list is + # deliberately code-migration specific: ``lived``, ``inlined``, + # ``extracted``, ``duplicated``, ``scattered`` describe where the + # code USED TO LIVE. 
Bare ``previously`` matches plenty of + # legitimate runtime prose (``previously compacted conversation``, + # ``previously stored ciphertext``) and is not enforced. + ( + "previously ", + re.compile( + r"\bpreviously\s+(?:lived|inlined|extracted|duplicated|" + r"scattered|routed|emitted|wrapped|owned)\b", + re.IGNORECASE, + ), + ), + ( + "were previously inlined", + re.compile( + r"\bwere\s+previously\s+(?:inlined|duplicated|scattered|" + r"extracted|owned|emitted|wrapped)\b", + re.IGNORECASE, + ), + ), + ( + "was previously inlined", + re.compile( + r"\bwas\s+previously\s+(?:inlined|duplicated|extracted|" + r"scattered|owned|emitted|wrapped)\b", + re.IGNORECASE, + ), + ), + ( + "used to be", + re.compile(r"\bused\s+to\s+be\b", re.IGNORECASE), + ), + # ``originally ``: targeted code-migration verb list. + # Compound form ``originally-claimed`` is the same narrative shape + # and matches via the ``[-\s]`` alternation. + ( + "originally ", + re.compile( + r"\boriginally[-\s](?:generated|promised|claimed|owned|" + r"wrapped|emitted|inlined|extracted|routed|handled)\b", + re.IGNORECASE, + ), + ), ) _SUPPRESSION_MARKER: Final[str] = "lint-allow: migration-framing" diff --git a/src/synthorg/api/controllers/__init__.py b/src/synthorg/api/controllers/__init__.py index ecb3a40fdf..990ec75090 100644 --- a/src/synthorg/api/controllers/__init__.py +++ b/src/synthorg/api/controllers/__init__.py @@ -43,6 +43,7 @@ EventStreamController, InterruptController, ) +from synthorg.api.controllers.experiments import ExperimentsController from synthorg.api.controllers.health import ( LivenessController, ReadinessController, @@ -111,6 +112,7 @@ DepartmentController, ProjectController, TaskController, + ExperimentsController, MessageController, MeetingController, ArtifactController, @@ -222,6 +224,7 @@ "EscalationsController", "EvaluationConfigVersionController", "EventStreamController", + "ExperimentsController", "IntegrationHealthController", "InterruptController", "LivenessController", 
diff --git a/src/synthorg/api/controllers/_department_health.py b/src/synthorg/api/controllers/_department_health.py index df4d83e691..e57d118a29 100644 --- a/src/synthorg/api/controllers/_department_health.py +++ b/src/synthorg/api/controllers/_department_health.py @@ -162,8 +162,9 @@ async def _resolve_agent_ids( ``assemble_department_health`` handler can surface registry outage as a degraded response instead of silently reporting zero agents. Other exceptions (e.g. unexpected registry bugs) are logged and - swallowed because per-name lookup failures used to be tolerated in - the previous per-agent fan-out. + swallowed so a single bad agent identity does not collapse the + department-wide health snapshot; the missing agent simply omits + from the result. """ if not app_state.has_agent_registry: return () diff --git a/src/synthorg/api/controllers/experiments.py b/src/synthorg/api/controllers/experiments.py new file mode 100644 index 0000000000..5d76045181 --- /dev/null +++ b/src/synthorg/api/controllers/experiments.py @@ -0,0 +1,145 @@ +"""A/B experiment registry endpoints. + +Mounts the variant CRUD plus deterministic assignment lookup under +``/api/v1/experiments``. Variant lifecycle is operator-facing; the +assignment endpoint is the runtime path agents call when they want to +discover which experiment branch they belong to. 
+""" + +from typing import Final + +from litestar import Controller, get, post +from litestar.datastructures import State # noqa: TC002 + +from synthorg.api.cursor import decode_cursor, encode_cursor +from synthorg.api.dto import ( + ApiResponse, + AssignExperimentRequest, + PaginatedResponse, + PaginationMeta, + RegisterExperimentVariantRequest, +) +from synthorg.api.guards import require_read_access, require_write_access +from synthorg.api.pagination import ( + CursorLimit, # noqa: TC001 -- runtime parameter annotation + CursorParam, # noqa: TC001 -- runtime parameter annotation +) +from synthorg.api.path_params import PathId # noqa: TC001 +from synthorg.api.rate_limits import per_op_rate_limit_from_policy +from synthorg.api.state import AppState # noqa: TC001 +from synthorg.core.types import NotBlankStr +from synthorg.experiments.models import ( # noqa: TC001 -- runtime return-type annotations + ExperimentAssignment, + ExperimentVariant, +) +from synthorg.observability import get_logger + +logger = get_logger(__name__) + +_DEFAULT_LIMIT: Final[int] = 50 + + +class ExperimentsController(Controller): + """REST surface for the experiment registry.""" + + path = "/experiments" + tags = ("experiments",) + guards = [require_read_access] # noqa: RUF012 + + @get("/{experiment:str}/variants") + async def list_variants( + self, + state: State, + experiment: PathId, + ) -> ApiResponse[tuple[ExperimentVariant, ...]]: + """List every registered variant for an experiment.""" + app_state: AppState = state.app_state + variants = await app_state.experiment_service.list_variants( + NotBlankStr(experiment), + ) + return ApiResponse(data=variants) + + @post( + "/{experiment:str}/variants", + guards=[ + require_write_access, + per_op_rate_limit_from_policy("experiments.register", key="user"), + ], + status_code=201, + ) + async def register_variant( + self, + state: State, + experiment: PathId, + data: RegisterExperimentVariantRequest, + ) -> ApiResponse[ExperimentVariant]: + 
"""Register or replace a variant on an experiment.""" + app_state: AppState = state.app_state + record = await app_state.experiment_service.register_variant( + experiment=NotBlankStr(experiment), + variant=data.variant, + weight=data.weight, + description=data.description, + ) + return ApiResponse(data=record) + + @post( + "/{experiment:str}/assign", + guards=[ + require_write_access, + per_op_rate_limit_from_policy("experiments.assign", key="user"), + ], + ) + async def assign( + self, + state: State, + experiment: PathId, + data: AssignExperimentRequest, + ) -> ApiResponse[ExperimentAssignment]: + """Return the deterministic variant assignment for a subject. + + On first call for ``(experiment, subject_id)`` the service + computes the assignment and persists it; subsequent calls + return the recorded assignment unchanged. + """ + app_state: AppState = state.app_state + assignment = await app_state.experiment_service.assign( + experiment=NotBlankStr(experiment), + subject_id=data.subject_id, + ) + return ApiResponse(data=assignment) + + @get("/{experiment:str}/assignments") + async def list_assignments( + self, + state: State, + experiment: PathId, + limit: CursorLimit = _DEFAULT_LIMIT, + cursor: CursorParam = None, + ) -> PaginatedResponse[ExperimentAssignment]: + """List recorded assignments for an experiment (newest first). + + Pagination uses the standard opaque HMAC-signed cursor (see + :mod:`synthorg.api.cursor`); the cursor decodes to an internal + offset so callers cannot forge a token that skips to an + arbitrary page. 
+ """ + app_state: AppState = state.app_state + offset = decode_cursor(cursor, secret=app_state.cursor_secret) if cursor else 0 + page, total = await app_state.experiment_service.list_assignments( + NotBlankStr(experiment), + limit=limit, + offset=offset, + ) + next_offset = offset + len(page) + has_more = next_offset < total + meta = PaginationMeta( + limit=limit, + next_cursor=( + encode_cursor(next_offset, secret=app_state.cursor_secret) + if has_more + else None + ), + has_more=has_more, + ) + return PaginatedResponse(data=page, pagination=meta) diff --git a/src/synthorg/api/controllers/requests.py b/src/synthorg/api/controllers/requests.py index 01f85e4cfc..371396fb02 100644 --- a/src/synthorg/api/controllers/requests.py +++ b/src/synthorg/api/controllers/requests.py @@ -22,6 +22,7 @@ from synthorg.core.domain_errors import ConflictError, NotFoundError from synthorg.core.types import NotBlankStr # noqa: TC001 from synthorg.observability import get_logger +from synthorg.observability.events.client import CLIENT_REQUEST_STATUS_TRANSITIONED logger = get_logger(__name__) _DEFAULT_LIMIT: Final[int] = 50 @@ -232,6 +233,13 @@ async def scope_request( **overrides, ) await sim_state.request_store.save(scoped) + logger.info( + CLIENT_REQUEST_STATUS_TRANSITIONED, + request_id=scoped.request_id, + client_id=scoped.client_id, + from_status=stored.status.value, + to_status=scoped.status.value, + ) # Publish inside the lock so the save + WS event are # ordered atomically: a concurrent approve that takes the # lock after us cannot emit its own event before ours @@ -290,6 +298,13 @@ async def approve_request( else: final, _ = await sim_state.intake_engine.finalize_scoped(stored) await sim_state.request_store.save(final) + logger.info( + CLIENT_REQUEST_STATUS_TRANSITIONED, + request_id=final.request_id, + client_id=final.client_id, + from_status=stored.status.value, + to_status=final.status.value, + ) _publish(request, WsEventType.REQUEST_APPROVED, final) # Approve walks to 
``TASK_CREATED`` (terminal) -- drop the lock # so the registry does not accumulate. @@ -335,6 +350,13 @@ async def reject_request( metadata=metadata, ) await sim_state.request_store.save(cancelled) + logger.info( + CLIENT_REQUEST_STATUS_TRANSITIONED, + request_id=cancelled.request_id, + client_id=cancelled.client_id, + from_status=stored.status.value, + to_status=cancelled.status.value, + ) _publish(request, WsEventType.REQUEST_REJECTED, cancelled) # Reject walks to ``CANCELLED`` (terminal) -- drop the lock. app_state.release_request_lock_if_idle(request_id) diff --git a/src/synthorg/api/controllers/simulations.py b/src/synthorg/api/controllers/simulations.py index 351b2c39fa..bb10b71f29 100644 --- a/src/synthorg/api/controllers/simulations.py +++ b/src/synthorg/api/controllers/simulations.py @@ -155,7 +155,7 @@ async def _rollback_register_if_absent( request handler, not a coroutine guarding against external cancellation. - Passes the originally-claimed ``record`` to ``unregister`` so the + Passes the claimed ``record`` to ``unregister`` so the compare-and-delete semantics protect a fresh retry that might have won the slot between the failure and this rollback running. 
""" diff --git a/src/synthorg/api/controllers/tasks.py b/src/synthorg/api/controllers/tasks.py index 220872629b..51ff5a1980 100644 --- a/src/synthorg/api/controllers/tasks.py +++ b/src/synthorg/api/controllers/tasks.py @@ -5,12 +5,13 @@ from litestar import Controller, delete, get, patch, post from litestar.datastructures import State # noqa: TC002 from litestar.params import Parameter -from litestar.status_codes import HTTP_204_NO_CONTENT +from litestar.status_codes import HTTP_200_OK, HTTP_204_NO_CONTENT from synthorg.api.dto import ( ApiResponse, CancelTaskRequest, CreateTaskRequest, + ExecuteTaskRequest, PaginatedResponse, TransitionTaskRequest, UpdateTaskRequest, @@ -321,6 +322,54 @@ async def delete_task( ) logger.info(API_TASK_DELETED, task_id=task_id) + @post( + "/{task_id:str}/execute", + # The endpoint mutates the existing task, not creates a new + # resource. Override Litestar's default 201 with 200 so the + # worker's ACK/NACK contract reads the success class directly + # instead of treating "Created" as a special case. + status_code=HTTP_200_OK, + guards=[ + require_write_access, + per_op_rate_limit_from_policy("tasks.execute", key="user"), + ], + ) + async def execute_task( + self, + state: State, + task_id: PathId, + data: ExecuteTaskRequest, + ) -> ApiResponse[Task]: + """Execute one step of a task on behalf of a worker. + + Called by the distributed worker (``synthorg.workers.executor``) + when a JetStream claim arrives. The endpoint delegates to + ``WorkerExecutionService.execute_once`` so the agent-runtime + invocation is configurable per deployment; the controller + itself only routes auth + HTTP envelope. + + The response carries the task at its post-execution status so + the worker can map the outcome (terminal status -> ACK, + non-terminal -> NACK / RETRY). 
+ """ + app_state: AppState = state.app_state + requester = _extract_requester(state) + task = await app_state.worker_execution_service.execute_once( + task_id=task_id, + previous_status=data.previous_status, + new_status=data.new_status, + idempotency_key=data.idempotency_key, + requested_by=requester, + ) + logger.info( + TASK_STATUS_CHANGED, + task_id=task_id, + from_status=data.previous_status, + to_status=task.status.value, + triggered_by="worker_executor", + ) + return ApiResponse(data=task) + @post( "/{task_id:str}/cancel", guards=[ diff --git a/src/synthorg/api/controllers/ws.py b/src/synthorg/api/controllers/ws.py index c9b2c10872..b911342514 100644 --- a/src/synthorg/api/controllers/ws.py +++ b/src/synthorg/api/controllers/ws.py @@ -680,6 +680,39 @@ async def _setup_connection( return channels_plugin, subscriber +def _record_ws_connection_opened(socket: WebSocket[Any, Any, Any]) -> None: + """Increment the WS active-connection gauge. + + The Prometheus collector lives on ``app_state``; the helper checks + presence defensively so a controller running without the + observability stack (rare; mostly tests) does not blow up at + setup. ``Gauge.inc()`` is internally thread-safe, so no explicit + lock is needed between the WS handler coroutine and the metrics + scrape thread. 
+ """ + app_state = socket.app.state["app_state"] + if not app_state.has_prometheus_collector: + return + app_state.prometheus_collector.inc_ws_active_connections() + + +def _record_ws_connection_closed( + socket: WebSocket[Any, Any, Any], + *, + duration_sec: float, +) -> None: + """Observe the WS lifetime histogram and decrement the active gauge.""" + app_state = socket.app.state["app_state"] + if not app_state.has_prometheus_collector: + return + collector = app_state.prometheus_collector + collector.record_ws_connection_lifetime( + transport="websocket", + duration_sec=duration_sec, + ) + collector.dec_ws_active_connections() + + async def _teardown_connection( socket: WebSocket[Any, Any, Any], user: AuthenticatedUser, @@ -771,6 +804,13 @@ async def ws_handler( return channels_plugin, subscriber = setup + # Wall-clock start so the teardown path can observe connection + # lifetime into the ``synthorg_ws_connection_lifetime_seconds`` + # histogram. ``time.monotonic`` so a wall-clock NTP step does not + # push the bucket bound. + connection_started_at = time.monotonic() + _record_ws_connection_opened(socket) + # Auto-subscribe to the user's private channel. 
user_ch = user_channel(user.user_id) subscribed: set[str] = {user_ch} @@ -849,6 +889,10 @@ async def _event_callback(event_data: bytes) -> None: ) finally: if consumer_task is not None: + _record_ws_connection_closed( + socket, + duration_sec=time.monotonic() - connection_started_at, + ) await _teardown_connection( socket, user, diff --git a/src/synthorg/api/dto.py b/src/synthorg/api/dto.py index 123862629d..43a5864ce7 100644 --- a/src/synthorg/api/dto.py +++ b/src/synthorg/api/dto.py @@ -35,6 +35,7 @@ _MAX_METADATA_KEYS: int = 20 _MAX_METADATA_STR_LEN: int = 256 +_MAX_SELECTION_WEIGHT: int = 1000 # ── Structured error detail (RFC 9457) ───────────────────────── @@ -432,6 +433,56 @@ class TransitionTaskRequest(BaseModel): ) +class RegisterExperimentVariantRequest(BaseModel): + """Payload for registering an A/B experiment variant.""" + + model_config = ConfigDict(frozen=True, allow_inf_nan=False, extra="forbid") + + variant: NotBlankStr = Field(description="Variant name within the experiment") + weight: int = Field( + ge=1, + le=_MAX_SELECTION_WEIGHT, + description="Relative selection weight", + ) + description: str = Field(default="", description="Operator notes") + + +class AssignExperimentRequest(BaseModel): + """Payload for requesting a deterministic variant assignment.""" + + model_config = ConfigDict(frozen=True, allow_inf_nan=False, extra="forbid") + + subject_id: NotBlankStr = Field( + description="Subject identifier (agent id, user id, project id, ...)", + ) + + +class ExecuteTaskRequest(BaseModel): + """Payload for the worker-callable ``POST /tasks/{id}/execute`` endpoint. + + Mirrors the ``TaskClaim`` envelope fields the worker carries so the + backend's ``WorkerExecutionService`` has the same provenance the + dispatcher captured when it built the claim. The endpoint only + needs the status pair and the dedup key; the task body is read + server-side via the task repository. 
+ """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False, extra="forbid") + + previous_status: NotBlankStr | None = Field( + default=None, + description="Task status before the triggering transition", + ) + new_status: NotBlankStr = Field( + description=( + "Task status that triggered the dispatch (typically 'assigned' or 'ready')" + ), + ) + idempotency_key: NotBlankStr = Field( + description="Per-dispatch idempotency key; backend dedups duplicate executions", + ) + + class CancelTaskRequest(BaseModel): """Payload for cancelling a task. @@ -706,6 +757,7 @@ class RollbackAgentIdentityRequest(BaseModel): __all__ = [ "ApiResponse", "ApproveRequest", + "AssignExperimentRequest", "CancelTaskRequest", "CoordinateTaskRequest", "CoordinationPhaseResponse", @@ -718,12 +770,14 @@ class RollbackAgentIdentityRequest(BaseModel): "CreateTaskRequest", "DiscoverModelsResponse", "ErrorDetail", + "ExecuteTaskRequest", "PaginatedResponse", "PaginationMeta", "ProbeLocalResponse", "ProbePresetResponse", "ProblemDetail", "ProviderResponse", + "RegisterExperimentVariantRequest", "RejectRequest", "RollbackAgentIdentityRequest", "TestConnectionRequest", diff --git a/src/synthorg/api/etag.py b/src/synthorg/api/etag.py index 4a2523d82d..420cc9c90a 100644 --- a/src/synthorg/api/etag.py +++ b/src/synthorg/api/etag.py @@ -350,9 +350,9 @@ async def _emit_passthrough( The captured ``Content-Length`` (if any) is replaced with ``len(body)`` because the truncation-fallback path may have - captured fewer bytes than the inner app originally promised; an - unmatched length would produce an invalid response on the very - cleanup path this helper exists to make safe. + captured fewer bytes than the inner app declared; an unmatched + length would produce an invalid response on the very cleanup path + this helper exists to make safe. 
""" headers_value = captured_start.get("headers", []) headers: list[tuple[bytes, bytes]] = ( diff --git a/src/synthorg/api/lifecycle_builder.py b/src/synthorg/api/lifecycle_builder.py index c8560604bb..b793eb20c4 100644 --- a/src/synthorg/api/lifecycle_builder.py +++ b/src/synthorg/api/lifecycle_builder.py @@ -1,7 +1,9 @@ """Startup/shutdown lifecycle builder for the Litestar application. -Contains the two-phase (construct + on_startup) wiring helpers that -were previously inlined in ``api/app.py``. +The two-phase wiring helpers live here so ``api/app.py`` stays a +thin entry point: construct-phase wires synchronous services +(registries, factories) while ``on_startup`` wires services that +require a connected persistence backend. """ import asyncio diff --git a/src/synthorg/api/rate_limits/policies.py b/src/synthorg/api/rate_limits/policies.py index 5813575285..a6efbdb1d9 100644 --- a/src/synthorg/api/rate_limits/policies.py +++ b/src/synthorg/api/rate_limits/policies.py @@ -194,11 +194,15 @@ # simulations "simulations.cancel": (30, 60), "simulations.create": (30, 3600), + # experiments (A/B test registry) + "experiments.register": (50, 60), + "experiments.assign": (500, 60), # tasks "tasks.cancel": (50, 60), "tasks.coordinate": (10, 60), "tasks.create": (50, 60), "tasks.delete": (20, 60), + "tasks.execute": (200, 60), "tasks.transition": (100, 60), "tasks.update": (100, 60), # training diff --git a/src/synthorg/api/state.py b/src/synthorg/api/state.py index a48e21b574..32e52465ba 100644 --- a/src/synthorg/api/state.py +++ b/src/synthorg/api/state.py @@ -64,6 +64,8 @@ from synthorg.engine.review_gate import ReviewGateService # noqa: TC001 from synthorg.engine.task_engine import TaskEngine # noqa: TC001 from synthorg.engine.workflow.ceremony_scheduler import CeremonyScheduler # noqa: TC001 +from synthorg.experiments import ExperimentService +from synthorg.experiments.in_memory_repository import InMemoryExperimentRepository from synthorg.hr.performance.tracker 
import PerformanceTracker # noqa: TC001 from synthorg.hr.registry import AgentRegistryService # noqa: TC001 from synthorg.hr.scaling.service import ScalingService # noqa: TC001 @@ -113,6 +115,10 @@ from synthorg.settings.service import SettingsService # noqa: TC001 from synthorg.telemetry.collector import TelemetryCollector # noqa: TC001 from synthorg.tools.invocation_tracker import ToolInvocationTracker # noqa: TC001 +from synthorg.workers.execution_service import ( + LifecycleAdvancingExecutionService, + WorkerExecutionService, +) if TYPE_CHECKING: from synthorg.a2a.agent_card import AgentCardBuilder @@ -195,11 +201,13 @@ class AppState(AppStateServicesMixin): "_evaluation_version_service", "_event_stream_hub", "_events_read_service", + "_experiment_service", "_fine_tune_orchestrator", "_health_prober_service", "_idempotency_service", "_integration_health_facade_service", "_interrupt_store", + "_lazy_service_lock", "_lockout_store", "_mcp_catalog_facade_service", "_mcp_catalog_service", @@ -272,6 +280,7 @@ class AppState(AppStateServicesMixin): "_webhook_event_bridge", "_webhook_replay_protector", "_webhook_service", + "_worker_execution_service", "_workflow_execution_service", "_workflow_rollback_service", "_workflow_service", @@ -394,6 +403,21 @@ def __init__( # noqa: PLR0913, PLR0915 self._tunnel_provider = tunnel_provider self._webhook_event_bridge = webhook_event_bridge self._webhook_replay_protector: object | None = None + # Defaults to a lifecycle-advancing implementation wired + # against the task engine in lifecycle_builder; production + # deployments may swap the implementation to invoke the full + # AgentEngine instead of the baseline lifecycle walk. + self._worker_execution_service: WorkerExecutionService | None = None + # Guards the double-checked locking on first-access lazy wiring + # of worker_execution_service / experiment_service. 
Both + # properties may be invoked from concurrent request handlers + # before any explicit ``set_*`` call, so the bare None check + # without a lock could construct two instances and lose state. + self._lazy_service_lock: threading.Lock = threading.Lock() + # Lazily constructed against an in-memory repository so the + # ``/experiments`` controller works out of the box; deployments + # swap in a durable repository via ``set_experiment_service``. + self._experiment_service: ExperimentService | None = None # Lazily constructed when first accessed via the property; the # service wraps ``persistence.idempotency_keys`` and lives only # if a persistence backend is configured. @@ -780,6 +804,66 @@ def set_task_engine(self, engine: TaskEngine) -> None: """Attach the task engine (once-only).""" self._set_once("_task_engine", engine, "Task engine") + @property + def worker_execution_service(self) -> WorkerExecutionService: + """Return the worker-callable execution service or auto-wire the default. + + Lazily constructs the baseline lifecycle-advancing service + the first time the worker-callable execute endpoint fires. + Deployments that want the full agent-runtime invocation call + :meth:`set_worker_execution_service` at startup to swap the + implementation before any HTTP traffic arrives. + """ + if self._worker_execution_service is None: + with self._lazy_service_lock: + if self._worker_execution_service is None: + self._worker_execution_service = LifecycleAdvancingExecutionService( + task_engine=self.task_engine, + ) + return self._worker_execution_service + + def set_worker_execution_service( + self, + service: WorkerExecutionService, + ) -> None: + """Attach a worker execution service implementation (once-only). + + Wired before any HTTP traffic so the property's lazy default + does not race the explicit assignment. 
+ """ + self._set_once( + "_worker_execution_service", + service, + "Worker execution service", + ) + + @property + def experiment_service(self) -> ExperimentService: + """Return the A/B experiment service, auto-wiring the default. + + Lazy construction uses the in-memory repository so the + ``/experiments`` controller works in dev / smoke-test runs + without a persistence backend. Production deployments call + :meth:`set_experiment_service` at startup with a durable + repository before any HTTP traffic arrives. + """ + if self._experiment_service is None: + with self._lazy_service_lock: + if self._experiment_service is None: + self._experiment_service = ExperimentService( + repository=InMemoryExperimentRepository(), + clock=self.clock, + ) + return self._experiment_service + + def set_experiment_service(self, service: ExperimentService) -> None: + """Attach the experiment service (once-only).""" + self._set_once( + "_experiment_service", + service, + "Experiment service", + ) + @property def distributed_task_queue(self) -> JetStreamTaskQueue | None: """Return the distributed task queue, or ``None`` when not wired.""" diff --git a/src/synthorg/budget/_aggregation.py b/src/synthorg/budget/_aggregation.py index f6f25eb415..0006e8c2a6 100644 --- a/src/synthorg/budget/_aggregation.py +++ b/src/synthorg/budget/_aggregation.py @@ -1,8 +1,8 @@ """Shared aggregation helpers for cost-record analyses. -Consolidates the ``defaultdict(list)`` + ``math.fsum`` + cost-per-1k -idiom that previously lived in both ``_tracker_helpers`` and -``_optimizer_helpers``. Pure functions, no I/O. +The ``defaultdict(list)`` plus ``math.fsum`` plus cost-per-1k +aggregation idiom used across ``_tracker_helpers`` and +``_optimizer_helpers`` lives here as pure functions with no I/O. 
Same-currency enforcement: :func:`sum_cost` calls ``assert_currencies_match`` itself so it is safe by construction; diff --git a/src/synthorg/budget/enforcer.py b/src/synthorg/budget/enforcer.py index 7c63ca212c..4800e4531c 100644 --- a/src/synthorg/budget/enforcer.py +++ b/src/synthorg/budget/enforcer.py @@ -86,6 +86,29 @@ _DEFAULT_TIMEOUT_SEC: Final[float] = 5.0 +def _current_trace_ids() -> tuple[str | None, str | None]: + """Return ``(trace_id, span_id)`` from the active OTel span, or ``(None, None)``. + + The OpenTelemetry API import is local because the project keeps + ``opentelemetry`` as an optional surface and the enforcer must + still emit the warning even when tracing is unconfigured. Both + ids are returned as hex strings so structured log sinks can route + them through the existing trace-correlation pipeline. + """ + try: + from opentelemetry import trace as _otel_trace # noqa: PLC0415 + except ImportError: + return (None, None) + span = _otel_trace.get_current_span() + span_ctx = span.get_span_context() + if not span_ctx.is_valid: + return (None, None) + return ( + f"{span_ctx.trace_id:032x}", + f"{span_ctx.span_id:016x}", + ) + + class BudgetEnforcer(BudgetEnforcerRiskMixin): """Budget enforcement: pre-flight, in-flight, and auto-downgrade. 
@@ -459,12 +482,15 @@ async def _check_monthly_hard_stop( ) if monthly_cost >= hard_stop_limit: + trace_id, span_id = _current_trace_ids() logger.warning( BUDGET_HARD_STOP_EXCEEDED, agent_id=agent_id, total_cost=monthly_cost, monthly_budget=cfg.total_monthly, hard_stop_limit=hard_stop_limit, + trace_id=trace_id, + span_id=span_id, ) _fmt = format_cost _cur = cfg.currency diff --git a/src/synthorg/client/pool.py b/src/synthorg/client/pool.py index c3310d0c0a..1ba6dd094a 100644 --- a/src/synthorg/client/pool.py +++ b/src/synthorg/client/pool.py @@ -9,6 +9,7 @@ ) from synthorg.client.protocols import ClientInterface # noqa: TC001 from synthorg.observability import get_logger +from synthorg.observability.events.client import CLIENT_NOT_FOUND logger = get_logger(__name__) @@ -69,6 +70,11 @@ async def remove(self, client_id: str) -> ClientProfile: """ async with self._lock: if client_id not in self._profiles: + logger.warning( + CLIENT_NOT_FOUND, + client_id=client_id, + operation="remove", + ) msg = f"Client {client_id!r} not found" raise KeyError(msg) profile = self._profiles.pop(client_id) @@ -89,6 +95,11 @@ async def deactivate(self, client_id: str) -> ClientProfile: """ async with self._lock: if client_id not in self._profiles: + logger.warning( + CLIENT_NOT_FOUND, + client_id=client_id, + operation="deactivate", + ) msg = f"Client {client_id!r} not found" raise KeyError(msg) self._active[client_id] = False @@ -105,6 +116,11 @@ async def reactivate(self, client_id: str) -> ClientProfile: """ async with self._lock: if client_id not in self._profiles: + logger.warning( + CLIENT_NOT_FOUND, + client_id=client_id, + operation="reactivate", + ) msg = f"Client {client_id!r} not found" raise KeyError(msg) self._active[client_id] = True @@ -118,6 +134,11 @@ async def is_active(self, client_id: str) -> bool: """ async with self._lock: if client_id not in self._profiles: + logger.warning( + CLIENT_NOT_FOUND, + client_id=client_id, + operation="is_active", + ) msg = f"Client 
{client_id!r} not found" raise KeyError(msg) return self._active.get(client_id, True) @@ -130,6 +151,11 @@ async def get_profile(self, client_id: str) -> ClientProfile: """ async with self._lock: if client_id not in self._profiles: + logger.warning( + CLIENT_NOT_FOUND, + client_id=client_id, + operation="get_profile", + ) msg = f"Client {client_id!r} not found" raise KeyError(msg) return self._profiles[client_id] diff --git a/src/synthorg/client/store.py b/src/synthorg/client/store.py index e242527578..925b364645 100644 --- a/src/synthorg/client/store.py +++ b/src/synthorg/client/store.py @@ -18,10 +18,12 @@ from synthorg.observability import get_logger from synthorg.observability.events.client import ( CLIENT_FEEDBACK_RECORDED, + CLIENT_REQUEST_NOT_FOUND, CLIENT_REQUEST_SUBMITTED, SIMULATION_RUN_CANCELLED, SIMULATION_RUN_COMPLETED, SIMULATION_RUN_FAILED, + SIMULATION_RUN_NOT_FOUND, SIMULATION_RUN_STARTED, SIMULATION_RUN_UPDATE_REJECTED, ) @@ -144,6 +146,11 @@ async def get(self, request_id: str) -> ClientRequest: """Return the request by id or raise ``KeyError``.""" async with self._lock: if request_id not in self._requests: + logger.warning( + CLIENT_REQUEST_NOT_FOUND, + request_id=request_id, + operation="get", + ) msg = f"Request {request_id!r} not found" raise KeyError(msg) return self._requests[request_id] @@ -267,6 +274,11 @@ async def get(self, simulation_id: str) -> SimulationRecord: """Return the record by id or raise ``KeyError``.""" async with self._lock: if simulation_id not in self._runs: + logger.warning( + SIMULATION_RUN_NOT_FOUND, + simulation_id=simulation_id, + operation="get", + ) msg = f"Simulation {simulation_id!r} not found" raise KeyError(msg) return self._runs[simulation_id] diff --git a/src/synthorg/core/concurrency/cas_retry.py b/src/synthorg/core/concurrency/cas_retry.py index 6c18c53ed4..3c794850cf 100644 --- a/src/synthorg/core/concurrency/cas_retry.py +++ b/src/synthorg/core/concurrency/cas_retry.py @@ -1,16 +1,19 @@ 
"""Compare-and-set retry loop for optimistic-concurrency mutations. -Centralizes the read-modify-write cycle that mutation services run -under optimistic concurrency. Callers provide a ``read`` closure that -performs the read + validation + new-value construction (returning a -``(new_value, version)`` pair) and a ``write`` callable that persists -the new value guarded by the version. The handler retries up to -``max_attempts`` on :class:`VersionConflictError`, emitting structured -``API_CONCURRENCY_CONFLICT`` logs at DEBUG on each retry and at -WARNING on the final exhausted attempt before re-raising. - -Replaces the inline ``for attempt in range(_MAX_CAS_ATTEMPTS)`` loops -that previously lived in every mutation method. +Centralises the read-modify-write cycle that mutation services run +under optimistic concurrency. Callers provide a ``read`` closure that +performs the read plus validation plus new-value construction +(returning a ``(new_value, version)`` pair) and a ``write`` callable +that persists the new value guarded by the version. The handler +retries up to ``max_attempts`` on :class:`VersionConflictError`, +emitting structured ``API_CONCURRENCY_CONFLICT`` logs at DEBUG on +each retry and at WARNING on the final exhausted attempt before +re-raising. + +Centralising the loop here keeps the retry policy (attempt count, +log severity transition, exception surface) consistent across every +mutation method without each one carrying its own +``for attempt in range(_MAX_CAS_ATTEMPTS)`` block. 
""" from typing import TYPE_CHECKING, Final, TypeVar diff --git a/src/synthorg/core/state_machine.py b/src/synthorg/core/state_machine.py index 17943bb294..63236559fb 100644 --- a/src/synthorg/core/state_machine.py +++ b/src/synthorg/core/state_machine.py @@ -47,8 +47,9 @@ class StateMachine[S: _HasValue]: Pre-validates the transition table at construction: every enum member referenced by the enumerated state type must appear as a - key, mirroring the ``_missing`` checks previously duplicated - across four modules. + key. Centralising the ``_missing``-style completeness check here + means callers cannot accidentally ship a partial table that + silently routes an unknown state to the default branch. The transition table is deep-copied and wrapped in a ``MappingProxyType`` at construction so mutations of the caller's diff --git a/src/synthorg/engine/quality/decomposers/llm.py b/src/synthorg/engine/quality/decomposers/llm.py index c768ff95e9..cee13d457f 100644 --- a/src/synthorg/engine/quality/decomposers/llm.py +++ b/src/synthorg/engine/quality/decomposers/llm.py @@ -329,7 +329,7 @@ async def _invoke_provider( messages=messages, model=self._model_id, tools=[tool], - config=CompletionConfig(temperature=0.0, max_tokens=2048), + config=CompletionConfig(temperature=0.0, top_p=1.0, max_tokens=2048), ) def _extract_raw_probes( diff --git a/src/synthorg/engine/task_engine.py b/src/synthorg/engine/task_engine.py index 50ed3f88e3..f55faf9d1b 100644 --- a/src/synthorg/engine/task_engine.py +++ b/src/synthorg/engine/task_engine.py @@ -387,25 +387,30 @@ async def stop(self, *, timeout: float | None = None) -> None: # noqa: ASYNC109 async def _drain_all(self, effective_timeout: float) -> None: """Drain the mutation queue + observer queue within the given budget. - Extracted from :meth:`stop` so the outer ``asyncio.wait_for`` - hard-deadline guard has a single awaitable to bound. 
+ Splits ``effective_timeout`` evenly between the processing-drain + stage and the observer-drain stage so each stage is guaranteed + at least ``effective_timeout / 2``. A slow processing + cancellation (cancellation handshake latency under contention, + for example) cannot starve the observer drain into a + zero-budget call, and the outer + ``hard_deadline = 2 * effective_timeout`` guard set by + :meth:`stop` never fires on a normal drain. Wrapped in a single + awaitable so the outer ``asyncio.wait_for`` has exactly one + thing to bound. """ - loop = asyncio.get_running_loop() - deadline = loop.time() + effective_timeout + stage_budget = effective_timeout / 2.0 - await self._drain_processing(effective_timeout) + await self._drain_processing(stage_budget) # Signal the observer loop that no more events will arrive. - # Bounded by remaining budget -- if the queue is full and the - # dispatcher is stuck, we skip the sentinel and let - # _drain_observer cancel the observer task on timeout. - remaining = max(0.0, deadline - loop.time()) + # Bounded by the observer-stage budget -- if the queue is full + # and the dispatcher is stuck, the suppressed TimeoutError lets + # _drain_observer cancel the observer task on its own timeout. 
with contextlib.suppress(TimeoutError): await asyncio.wait_for( self._observer_queue.put(None), - timeout=remaining, + timeout=stage_budget, ) - observer_budget = max(0.0, deadline - loop.time()) - await self._drain_observer(observer_budget) + await self._drain_observer(stage_budget) @property def is_running(self) -> bool: diff --git a/src/synthorg/engine/workflow/ceremony_scheduler.py b/src/synthorg/engine/workflow/ceremony_scheduler.py index 3814de091a..e55b59521e 100644 --- a/src/synthorg/engine/workflow/ceremony_scheduler.py +++ b/src/synthorg/engine/workflow/ceremony_scheduler.py @@ -9,6 +9,7 @@ """ import asyncio +from collections.abc import Callable # noqa: TC003 -- runtime annotation on __init__ from typing import TYPE_CHECKING, Any from synthorg.core.clock import Clock, SystemClock @@ -30,6 +31,7 @@ from synthorg.observability import get_logger from synthorg.observability.events.workflow import ( SPRINT_AUTO_TRANSITION, + SPRINT_CEREMONY_BUDGET_SNAPSHOT_FAILED, SPRINT_CEREMONY_DEACTIVATION_HOOK_FAILED, SPRINT_CEREMONY_SCHEDULER_START_FAILED, SPRINT_CEREMONY_SCHEDULER_STARTED, @@ -39,6 +41,7 @@ SPRINT_CEREMONY_STRATEGY_HOOK_FAILED, SPRINT_CEREMONY_TRIGGER_FAILED, SPRINT_CEREMONY_TRIGGERED, + SPRINT_STATUS_TRANSITIONED, ) if TYPE_CHECKING: @@ -86,6 +89,7 @@ class CeremonyScheduler: "_activation_time", "_active_sprint", "_active_strategy", + "_budget_snapshot", "_clock", "_completion_counters", "_fired_once_triggers", @@ -102,9 +106,27 @@ def __init__( *, meeting_scheduler: MeetingScheduler, clock: Clock | None = None, + budget_snapshot: Callable[[], tuple[float, float]] | None = None, ) -> None: + """Wire the scheduler against the meeting subsystem. + + Args: + meeting_scheduler: The MeetingScheduler that dispatches + ceremony meetings. + clock: Optional Clock for the activation timestamp seam. + budget_snapshot: Optional sync callable returning the + current ``(consumed_fraction, remaining)`` pair. 
When + provided, the scheduler threads the values into every + CeremonyEvalContext so budget-driven strategies can + evaluate against live spend. When ``None`` the context + fields fall back to ``(0.0, 0.0)`` and the runtime + logs a single ``SPRINT_CEREMONY_BUDGET_BRIDGE_OFF`` + event at scheduler activation so operators know the + strategy is running blind. + """ self._meeting_scheduler = meeting_scheduler self._clock = clock or SystemClock() + self._budget_snapshot = budget_snapshot self._active_strategy: CeremonySchedulingStrategy | None = None self._active_sprint: Sprint | None = None self._sprint_config: SprintConfig | None = None @@ -116,6 +138,27 @@ def __init__( self._velocity_history: tuple[VelocityRecord, ...] = () self._lock = asyncio.Lock() + def _resolve_budget_snapshot(self) -> tuple[float, float]: + """Return the live budget snapshot, or zeros when none is wired. + + Errors from the snapshot callable are swallowed (with a + warning) so a transient budget-service failure cannot break + ceremony evaluation. The strategy then evaluates against + ``(0.0, 0.0)`` for that one tick. 
+ """ + if self._budget_snapshot is None: + return (0.0, 0.0) + try: + return self._budget_snapshot() + except MemoryError, RecursionError: + raise + except Exception as exc: + logger.warning( + SPRINT_CEREMONY_BUDGET_SNAPSHOT_FAILED, + error_type=type(exc).__name__, + ) + return (0.0, 0.0) + @property def running(self) -> bool: """Whether the scheduler has an active sprint.""" @@ -412,15 +455,27 @@ def _check_auto_transition( context, ) if target is not None and sprint.status is SprintStatus.ACTIVE: + previous_status = sprint.status.value logger.info( SPRINT_AUTO_TRANSITION, sprint_id=sprint.id, - from_status=sprint.status.value, + from_status=previous_status, to_status=target.value, strategy=self._active_strategy.strategy_type.value, ) sprint = sprint.with_transition(target) self._active_sprint = sprint + # Sprint state is in-memory on the scheduler (no sprint + # repository exists today), so this is a transition of the + # cached object, not a persistence write. Logged at DEBUG + # so it does not get treated as an audit-grade transition + # event alongside the persisted ``client.request`` family. + logger.debug( + SPRINT_STATUS_TRANSITIONED, + sprint_id=sprint.id, + from_status=previous_status, + to_status=sprint.status.value, + ) return sprint # -- One-shot ceremonies ------------------------------------------- @@ -513,15 +568,15 @@ def _build_context(self, sprint: Sprint) -> CeremonyEvalContext: Per-ceremony contexts use ``_build_ceremony_context`` instead. """ total_tasks, _, pct = self._compute_sprint_progress(sprint) + consumed_fraction, remaining = self._resolve_budget_snapshot() return CeremonyEvalContext( completions_since_last_trigger=0, total_completions_this_sprint=self._total_completions, total_tasks_in_sprint=total_tasks, elapsed_seconds=self._clock.monotonic() - self._activation_time, - # Budget integration is a follow-up. 
- budget_consumed_fraction=0.0, - budget_remaining=0.0, + budget_consumed_fraction=consumed_fraction, + budget_remaining=remaining, velocity_history=self._velocity_history, external_events=(), sprint_percentage_complete=pct, @@ -536,6 +591,7 @@ def _build_ceremony_context( ) -> CeremonyEvalContext: """Build context for a specific ceremony (per-ceremony counter).""" total_tasks, _, pct = self._compute_sprint_progress(sprint) + consumed_fraction, remaining = self._resolve_budget_snapshot() return CeremonyEvalContext( completions_since_last_trigger=self._completion_counters.get( @@ -545,9 +601,8 @@ def _build_ceremony_context( total_completions_this_sprint=self._total_completions, total_tasks_in_sprint=total_tasks, elapsed_seconds=self._clock.monotonic() - self._activation_time, - # Budget integration is a follow-up. - budget_consumed_fraction=0.0, - budget_remaining=0.0, + budget_consumed_fraction=consumed_fraction, + budget_remaining=remaining, velocity_history=self._velocity_history, external_events=(), sprint_percentage_complete=pct, diff --git a/src/synthorg/engine/workspace/config.py b/src/synthorg/engine/workspace/config.py index 367d2be456..1fc3c7502f 100644 --- a/src/synthorg/engine/workspace/config.py +++ b/src/synthorg/engine/workspace/config.py @@ -49,6 +49,12 @@ class SemanticAnalysisConfig(BaseModel): le=2.0, description="Temperature for LLM analysis", ) + llm_top_p: float = Field( + default=1.0, + ge=0.0, + le=1.0, + description="Nucleus-sampling top-p for LLM analysis", + ) llm_max_tokens: int = Field( default=4096, gt=0, diff --git a/src/synthorg/engine/workspace/semantic_llm.py b/src/synthorg/engine/workspace/semantic_llm.py index 85a0a1e670..813dbb41f6 100644 --- a/src/synthorg/engine/workspace/semantic_llm.py +++ b/src/synthorg/engine/workspace/semantic_llm.py @@ -169,6 +169,7 @@ def _prepare_review_context( tool_def = build_semantic_review_tool() comp_config = CompletionConfig( temperature=self._config.llm_temperature, + 
top_p=self._config.llm_top_p, max_tokens=self._config.llm_max_tokens, ) return messages, tool_def, comp_config diff --git a/src/synthorg/experiments/__init__.py b/src/synthorg/experiments/__init__.py new file mode 100644 index 0000000000..e18e4bdb7c --- /dev/null +++ b/src/synthorg/experiments/__init__.py @@ -0,0 +1,28 @@ +"""A/B test variant registry. + +Operators register experiment variants (the alternatives under test) +and ask the service to assign a subject (agent, user, project) to one +variant deterministically. The assignment is hashed by ``(experiment, +subject_id)`` so the same subject always lands on the same variant; +the persistence layer records every assignment for audit and rollout +analysis. + +The service is intentionally thin: it owns variant CRUD plus the +assignment computation. Higher-level orchestration (rollout +percentages, ramp-up curves, kill-switches) layers on top via the +existing settings registry. +""" + +from synthorg.experiments.models import ( + ExperimentAssignment, + ExperimentVariant, +) +from synthorg.experiments.protocol import ExperimentRepository +from synthorg.experiments.service import ExperimentService + +__all__ = ( + "ExperimentAssignment", + "ExperimentRepository", + "ExperimentService", + "ExperimentVariant", +) diff --git a/src/synthorg/experiments/in_memory_repository.py b/src/synthorg/experiments/in_memory_repository.py new file mode 100644 index 0000000000..b08891d02d --- /dev/null +++ b/src/synthorg/experiments/in_memory_repository.py @@ -0,0 +1,139 @@ +"""In-memory :class:`ExperimentRepository` implementation. + +Used for tests and for deployments that do not require durability on +the experiment registry. The repository is async-safe via an internal +:class:`asyncio.Lock` so concurrent writes do not race; it does not +persist anything across process restarts. +""" + +# ruff: noqa: D102 -- protocol-method overrides; docstrings live on the protocol. 
+ +import asyncio +from dataclasses import dataclass + +from pydantic import AwareDatetime # noqa: TC002 -- runtime annotation + +from synthorg.core.types import NotBlankStr # noqa: TC001 -- runtime annotation +from synthorg.experiments.models import ( # noqa: TC001 -- runtime annotation + ExperimentAssignment, + ExperimentVariant, +) + + +@dataclass(frozen=True, slots=True) +class _VariantKey: + experiment: str + variant: str + + +@dataclass(frozen=True, slots=True) +class _AssignmentKey: + experiment: str + subject_id: str + + +class InMemoryExperimentRepository: + """Backing-store-free implementation of :class:`ExperimentRepository`.""" + + def __init__(self) -> None: + self._variants: dict[_VariantKey, ExperimentVariant] = {} + self._variant_order: list[_VariantKey] = [] + self._assignments: dict[_AssignmentKey, ExperimentAssignment] = {} + self._lock = asyncio.Lock() + + async def save(self, variant: ExperimentVariant) -> None: + key = _VariantKey( + experiment=str(variant.experiment), + variant=str(variant.variant), + ) + async with self._lock: + if key not in self._variants: + self._variant_order.append(key) + self._variants[key] = variant + + async def list_for_experiment( + self, + experiment: NotBlankStr, + ) -> tuple[ExperimentVariant, ...]: + async with self._lock: + matches = [ + self._variants[k] + for k in self._variant_order + if k.experiment == str(experiment) + ] + return tuple(matches) + + async def delete( + self, + *, + experiment: NotBlankStr, + variant: NotBlankStr, + ) -> bool: + key = _VariantKey(experiment=str(experiment), variant=str(variant)) + async with self._lock: + if key not in self._variants: + return False + del self._variants[key] + self._variant_order = [k for k in self._variant_order if k != key] + return True + + async def record_assignment( + self, + assignment: ExperimentAssignment, + ) -> None: + key = _AssignmentKey( + experiment=str(assignment.experiment), + subject_id=str(assignment.subject_id), + ) + async with 
self._lock: + self._assignments[key] = assignment + + async def get_assignment( + self, + *, + experiment: NotBlankStr, + subject_id: NotBlankStr, + ) -> ExperimentAssignment | None: + key = _AssignmentKey( + experiment=str(experiment), + subject_id=str(subject_id), + ) + async with self._lock: + return self._assignments.get(key) + + async def list_assignments( + self, + experiment: NotBlankStr, + *, + limit: int, + offset: int, + ) -> tuple[tuple[ExperimentAssignment, ...], int]: + async with self._lock: + matches = sorted( + ( + a + for k, a in self._assignments.items() + if k.experiment == str(experiment) + ), + key=lambda a: a.assigned_at, + reverse=True, + ) + total = len(matches) + offset = max(0, offset) + end = offset + max(0, limit) + return tuple(matches[offset:end]), total + + async def assigned_at(self, *, now: AwareDatetime) -> AwareDatetime: + return now + + async def clear(self) -> None: + """Drop every variant and assignment. + + Used by tests between scenarios; not part of the + ``ExperimentRepository`` protocol because production + repositories should not expose a clear-all surface. + """ + async with self._lock: + self._variants.clear() + self._variant_order.clear() + self._assignments.clear() diff --git a/src/synthorg/experiments/models.py b/src/synthorg/experiments/models.py new file mode 100644 index 0000000000..08b8f46367 --- /dev/null +++ b/src/synthorg/experiments/models.py @@ -0,0 +1,57 @@ +"""Pydantic models for the A/B experiment registry.""" + +from typing import Final + +from pydantic import AwareDatetime, BaseModel, ConfigDict, Field + +from synthorg.core.types import NotBlankStr # noqa: TC001 -- Pydantic field annotation + +_MAX_VARIANT_WEIGHT: Final[int] = 1000 + + +class ExperimentVariant(BaseModel): + """A single variant registered against an experiment key. + + Attributes: + experiment: Unique identifier for the experiment (the "what is + under test"; e.g. ``"intake_prompt_v2"``). 
+ variant: Variant name within the experiment (e.g. ``"control"``, + ``"treatment"``). + weight: Relative weight used during deterministic assignment. + Must be a positive integer; the assignment computes + ``hash(subject) modulo total_weight`` and walks the variant + list in registration order to pick the variant whose + cumulative weight bracket contains the hash. + description: Operator-facing description. + created_at: When the variant was registered. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False, extra="forbid") + + experiment: NotBlankStr = Field(description="Experiment key (kebab-case)") + variant: NotBlankStr = Field(description="Variant name within the experiment") + weight: int = Field( + ge=1, + le=_MAX_VARIANT_WEIGHT, + description="Relative selection weight", + ) + description: str = Field(default="", description="Operator notes") + created_at: AwareDatetime = Field(description="Registration timestamp (UTC)") + + +class ExperimentAssignment(BaseModel): + """A recorded assignment of a subject to a variant. + + Attributes: + experiment: Experiment key. + subject_id: Hashed subject identifier (agent id, user id, etc.). + variant: Variant the subject was assigned. + assigned_at: When the assignment was first computed. 
+ """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False, extra="forbid") + + experiment: NotBlankStr = Field(description="Experiment key") + subject_id: NotBlankStr = Field(description="Subject identifier") + variant: NotBlankStr = Field(description="Variant the subject was assigned to") + assigned_at: AwareDatetime = Field(description="Assignment timestamp (UTC)") diff --git a/src/synthorg/experiments/protocol.py b/src/synthorg/experiments/protocol.py new file mode 100644 index 0000000000..a4446cdce9 --- /dev/null +++ b/src/synthorg/experiments/protocol.py @@ -0,0 +1,89 @@ +"""Repository protocol for the A/B experiment registry.""" + +from typing import Protocol, runtime_checkable + +from pydantic import AwareDatetime # noqa: TC002 -- runtime Protocol annotation + +from synthorg.core.types import ( + NotBlankStr, # noqa: TC001 -- runtime Protocol annotation +) +from synthorg.experiments.models import ( # noqa: TC001 -- runtime Protocol annotation + ExperimentAssignment, + ExperimentVariant, +) + + +@runtime_checkable +class ExperimentRepository(Protocol): + """Persistence boundary for variant registration and assignment.""" + + async def save(self, variant: ExperimentVariant) -> None: + """Insert or replace ``variant`` keyed on ``(experiment, variant)``.""" + ... + + async def list_for_experiment( + self, + experiment: NotBlankStr, + ) -> tuple[ExperimentVariant, ...]: + """Return every registered variant for ``experiment``. + + Ordering is by registration timestamp (oldest first) so the + assignment hash walk is deterministic across processes. + """ + ... + + async def delete( + self, + *, + experiment: NotBlankStr, + variant: NotBlankStr, + ) -> bool: + """Remove a variant. Returns ``True`` when a row was deleted.""" + ... + + async def record_assignment( + self, + assignment: ExperimentAssignment, + ) -> None: + """Insert or update the row keyed on ``(experiment, subject_id)``. 
+ + Durable backends may enforce a unique constraint on the + composite key and raise + :class:`synthorg.core.domain_errors.ConflictError` when a + concurrent writer lands the row first. Callers handle the + conflict by re-reading via :meth:`get_assignment`. + """ + ... + + async def get_assignment( + self, + *, + experiment: NotBlankStr, + subject_id: NotBlankStr, + ) -> ExperimentAssignment | None: + """Return the previously-recorded assignment, or ``None`` if absent.""" + ... + + async def list_assignments( + self, + experiment: NotBlankStr, + *, + limit: int, + offset: int, + ) -> tuple[tuple[ExperimentAssignment, ...], int]: + """Return ``(page, total)`` for the experiment's assignments. + + Ordering is by ``assigned_at`` descending so the most recent + assignments appear first; total carries the unbounded count so + the controller can render pagination metadata. + """ + ... + + async def assigned_at(self, *, now: AwareDatetime) -> AwareDatetime: + """Return the canonical assignment timestamp for repository writes. + + The in-memory repository echoes ``now``; durable backends may + normalise to their own clock instead (e.g. for + replay-safe ordering). + """ + ... diff --git a/src/synthorg/experiments/service.py b/src/synthorg/experiments/service.py new file mode 100644 index 0000000000..3911b3c62c --- /dev/null +++ b/src/synthorg/experiments/service.py @@ -0,0 +1,270 @@ +"""A/B experiment registry service. + +Wraps :class:`ExperimentRepository` with the variant-registration and +deterministic-assignment logic. Operators interact with the service +exclusively through the controller (REST) or MCP surface; the service +itself never accepts raw dicts. + +Assignment is deterministic: same ``(experiment, subject_id)`` +ALWAYS lands on the same variant for a given variant set.
Adding a +new variant to an experiment may shift previously-recorded +assignments because the cumulative-weight bracket walk changes; +the service records every assignment on first computation so the +historical assignment is preserved across variant edits (lookup +returns the recorded assignment when one exists). +""" + +import hashlib +from typing import TYPE_CHECKING, Final + +from synthorg.core.clock import Clock, SystemClock +from synthorg.core.domain_errors import ConflictError, NotFoundError, ValidationError +from synthorg.core.types import NotBlankStr +from synthorg.experiments.models import ( + _MAX_VARIANT_WEIGHT, + ExperimentAssignment, + ExperimentVariant, +) +from synthorg.observability import get_logger +from synthorg.observability.events.experiments import ( + EXPERIMENT_ASSIGNMENT_COMPUTED, + EXPERIMENT_ASSIGNMENT_REPLAYED, + EXPERIMENT_NOT_FOUND, + EXPERIMENT_VARIANT_DELETED, + EXPERIMENT_VARIANT_INVALID_WEIGHT, + EXPERIMENT_VARIANT_REGISTERED, +) + +if TYPE_CHECKING: + from synthorg.experiments.protocol import ExperimentRepository + +logger = get_logger(__name__) + +_HASH_DIGEST_BYTES: Final[int] = 4 +"""Number of leading SHA-256 digest bytes folded into the bucket. Four +bytes produce a ``uint32`` so the cumulative-weight walk stays +deterministic across Python versions (the canonical hashlib algorithm +is stable). 
The natural ``uint32`` range already bounds the bucket +before the final ``% modulus`` reduction, so no intermediate ceiling +modulo is needed.""" + + +class ExperimentService: + """Variant registration + deterministic assignment surface.""" + + __slots__ = ("_clock", "_repo") + + def __init__( + self, + *, + repository: ExperimentRepository, + clock: Clock | None = None, + ) -> None: + self._repo = repository + self._clock: Clock = clock if clock is not None else SystemClock() + + async def register_variant( + self, + *, + experiment: NotBlankStr, + variant: NotBlankStr, + weight: int, + description: str = "", + ) -> ExperimentVariant: + """Insert or replace a variant. + + Raises: + ValidationError: When ``weight`` is below 1 or above + ``_MAX_VARIANT_WEIGHT`` (mirrors the Pydantic model bounds for + callers that build ``weight`` dynamically, not from a frozen DTO). + """ + if weight < 1 or weight > _MAX_VARIANT_WEIGHT: + logger.warning( + EXPERIMENT_VARIANT_INVALID_WEIGHT, + experiment=str(experiment), + variant=str(variant), + weight=weight, + ) + msg = f"weight must be between 1 and {_MAX_VARIANT_WEIGHT}" + raise ValidationError(msg) + record = ExperimentVariant( + experiment=experiment, + variant=variant, + weight=weight, + description=description, + created_at=self._clock.now(), + ) + await self._repo.save(record) + logger.info( + EXPERIMENT_VARIANT_REGISTERED, + experiment=str(experiment), + variant=str(variant), + weight=weight, + ) + return record + + async def list_variants( + self, + experiment: NotBlankStr, + ) -> tuple[ExperimentVariant, ...]: + """Return every variant registered for the experiment.""" + return await self._repo.list_for_experiment(experiment) + + async def delete_variant( + self, + *, + experiment: NotBlankStr, + variant: NotBlankStr, + ) -> bool: + """Remove a variant; returns ``True`` if a row was deleted.""" + removed = await self._repo.delete( + experiment=experiment, + variant=variant, + ) + if removed: + logger.info( + EXPERIMENT_VARIANT_DELETED, +
experiment=str(experiment), + variant=str(variant), + ) + return removed + + async def assign( + self, + *, + experiment: NotBlankStr, + subject_id: NotBlankStr, + ) -> ExperimentAssignment: + """Assign ``subject_id`` to a variant deterministically. + + On the first call for a subject, computes the assignment by + hashing ``(experiment, subject_id)`` and walking the variant + list's cumulative-weight bracket. Subsequent calls return the + recorded assignment unchanged even if the variant set has + since shifted; the historical assignment is the authoritative + record. + + Raises: + NotFoundError: When the experiment has no registered + variants. + """ + recorded = await self._repo.get_assignment( + experiment=experiment, + subject_id=subject_id, + ) + if recorded is not None: + logger.info( + EXPERIMENT_ASSIGNMENT_REPLAYED, + experiment=str(experiment), + subject_id=str(subject_id), + variant=str(recorded.variant), + ) + return recorded + variants = await self._repo.list_for_experiment(experiment) + if not variants: + logger.warning( + EXPERIMENT_NOT_FOUND, + experiment=str(experiment), + subject_id=str(subject_id), + reason="no_variants_registered", + ) + msg = f"Experiment {experiment!r} has no registered variants" + raise NotFoundError(msg) + chosen = self._choose_variant(experiment, subject_id, variants) + assignment = ExperimentAssignment( + experiment=experiment, + subject_id=subject_id, + variant=NotBlankStr(chosen.variant), + assigned_at=self._clock.now(), + ) + try: + await self._repo.record_assignment(assignment) + except ConflictError: + # A concurrent writer won the insert race against a durable + # repository whose ``record_assignment`` enforces a unique + # constraint on ``(experiment, subject_id)``. Re-read the + # canonical assignment instead of failing: the choice is + # deterministic so the winning row carries the same variant, + # only the ``assigned_at`` timestamp differs. 
+ canonical = await self._repo.get_assignment( + experiment=experiment, + subject_id=subject_id, + ) + if canonical is None: + raise + return canonical + # Re-fetch after a successful record so the row the store now + # holds is the one returned. Under a last-write-wins backend + # like the in-memory repo, a concurrent writer racing between + # get_assignment and record_assignment may have replaced it. The + # choice is deterministic so both writers chose the same variant; + # only ``assigned_at`` differs, and the stored row's timestamp is + # the canonical one. + canonical = await self._repo.get_assignment( + experiment=experiment, + subject_id=subject_id, + ) + result = canonical if canonical is not None else assignment + logger.info( + EXPERIMENT_ASSIGNMENT_COMPUTED, + experiment=str(experiment), + subject_id=str(subject_id), + variant=str(result.variant), + variant_count=len(variants), + ) + return result + + async def list_assignments( + self, + experiment: NotBlankStr, + *, + limit: int, + offset: int, + ) -> tuple[tuple[ExperimentAssignment, ...], int]: + """Return ``(page, total)`` for the experiment's assignments.""" + return await self._repo.list_assignments( + experiment, + limit=limit, + offset=offset, + ) + + @staticmethod + def _choose_variant( + experiment: NotBlankStr, + subject_id: NotBlankStr, + variants: tuple[ExperimentVariant, ...], + ) -> ExperimentVariant: + """Walk the cumulative weight bracket and pick the matching variant.""" + total = sum(v.weight for v in variants) + bucket = ExperimentService._stable_bucket(experiment, subject_id, modulus=total) + cumulative = 0 + for v in variants: + cumulative += v.weight + if bucket < cumulative: + return v + return variants[-1] + + @staticmethod + def _stable_bucket( + experiment: NotBlankStr, + subject_id: NotBlankStr, + *, + modulus: int, + ) -> int: + """Return a deterministic bucket in ``[0, modulus)``. +
+ + SHA-256 is used (rather than Python's ``hash``) so two + processes with different ``PYTHONHASHSEED`` produce identical + assignments. The leading 4 bytes are folded into a ``uint32`` + and then reduced modulo ``modulus``; the bias from non-power + moduli is negligible at the variant-weight scale this service + targets. + """ + material = f"{experiment}\x1f{subject_id}".encode() + digest = hashlib.sha256(material).digest() + raw = int.from_bytes( + digest[:_HASH_DIGEST_BYTES], + byteorder="big", + signed=False, + ) + return raw % modulus diff --git a/src/synthorg/integrations/mcp_catalog/in_memory_installations.py b/src/synthorg/integrations/mcp_catalog/in_memory_installations.py index 2821cbd8d8..f140971603 100644 --- a/src/synthorg/integrations/mcp_catalog/in_memory_installations.py +++ b/src/synthorg/integrations/mcp_catalog/in_memory_installations.py @@ -124,3 +124,20 @@ async def delete(self, catalog_entry_id: NotBlankStr) -> bool: backend="in_memory", ) return removed + + async def clear(self) -> int: + """Drop every installation; return the number of rows removed. + + Used by tests between scenarios and by the dev-mode reset + endpoint. Production deployments use the durable backends so + this method is never reachable in serious environments. + """ + async with self._get_lock(): + removed = len(self._store) + self._store.clear() + return removed + + async def size(self) -> int: + """Return the count of installations currently held in memory.""" + async with self._get_lock(): + return len(self._store) diff --git a/src/synthorg/llm/__init__.py b/src/synthorg/llm/__init__.py new file mode 100644 index 0000000000..6ef3877081 --- /dev/null +++ b/src/synthorg/llm/__init__.py @@ -0,0 +1,12 @@ +"""Cross-cutting LLM helpers: model pinning, profile metadata. + +The :class:`ModelPinMetadata` model is the source of truth for the +model + sampling parameters a prompt class commits to. 
Every prompt +class that calls an LLM exposes a ``metadata: ModelPinMetadata`` +property so the eval pipeline can reconstruct the exact call shape +without re-reading the class implementation. +""" + +from synthorg.llm.metadata import ModelPinMetadata + +__all__ = ("ModelPinMetadata",) diff --git a/src/synthorg/llm/metadata.py b/src/synthorg/llm/metadata.py new file mode 100644 index 0000000000..ee100b318d --- /dev/null +++ b/src/synthorg/llm/metadata.py @@ -0,0 +1,41 @@ +"""Model-pin metadata for prompt classes. + +Every prompt class that wraps an LLM call exposes a +:class:`ModelPinMetadata` instance via a ``metadata`` property. The +metadata captures: + +- ``prompt_class_id``: stable identifier for the prompt class (used + by the golden-eval pipeline to locate fixtures and by audit + dashboards to slice cost / latency by prompt purpose). +- ``model``: pinned model identifier the class was validated against. + Changing the model requires a metadata bump plus eval refresh. +- ``model_version_pinned_at``: when the pin was last validated. + Operators reading the dashboard see this timestamp and know whether + the prompt has been re-evaluated against the live provider recently. +- ``temperature`` and ``top_p``: deterministic sampling parameters + for the call. Pinned so eval results stay reproducible. + +The model is frozen and ``extra="forbid"`` so an accidental rename of +a field surfaces at construction time rather than as a silent +field-name drift in dashboards. 
+""" + +from pydantic import AwareDatetime, BaseModel, ConfigDict, Field + +from synthorg.core.types import NotBlankStr # noqa: TC001 -- pydantic field annotation + + +class ModelPinMetadata(BaseModel): + """Pinned-model metadata for a prompt class.""" + + model_config = ConfigDict(frozen=True, extra="forbid") + + prompt_class_id: NotBlankStr = Field( + description="Stable identifier for the prompt class", + ) + model: NotBlankStr = Field(description="Pinned model identifier") + model_version_pinned_at: AwareDatetime = Field( + description="Last validation timestamp for the pin", + ) + temperature: float = Field(ge=0.0, le=2.0, description="Sampling temperature") + top_p: float = Field(ge=0.0, le=1.0, description="Nucleus-sampling top-p") diff --git a/src/synthorg/memory/procedural/models.py b/src/synthorg/memory/procedural/models.py index 78158d98a5..852c36fd0f 100644 --- a/src/synthorg/memory/procedural/models.py +++ b/src/synthorg/memory/procedural/models.py @@ -241,6 +241,12 @@ class ProceduralMemoryConfig(BaseModel): le=2.0, description="Sampling temperature for the proposer", ) + top_p: float = Field( + default=1.0, + ge=0.0, + le=1.0, + description="Nucleus-sampling top-p for the proposer", + ) max_tokens: int = Field( default=1500, gt=0, diff --git a/src/synthorg/memory/procedural/proposer.py b/src/synthorg/memory/procedural/proposer.py index 2502c57925..a178f80baf 100644 --- a/src/synthorg/memory/procedural/proposer.py +++ b/src/synthorg/memory/procedural/proposer.py @@ -137,12 +137,14 @@ def __init__( self._cost_tracker = cost_tracker self._completion_config = CompletionConfig( temperature=config.temperature, + top_p=config.top_p, max_tokens=config.max_tokens, ) logger.debug( PROCEDURAL_MEMORY_PROPOSER_INIT, model=config.model, temperature=config.temperature, + top_p=config.top_p, max_tokens=config.max_tokens, min_confidence=config.min_confidence, ) diff --git a/src/synthorg/meta/evolution/outcome_models.py b/src/synthorg/meta/evolution/outcome_models.py 
index 2856a8a45a..4d291726c7 100644 --- a/src/synthorg/meta/evolution/outcome_models.py +++ b/src/synthorg/meta/evolution/outcome_models.py @@ -24,7 +24,7 @@ class EvolutionOutcomeRecord(BaseModel): :class:`~synthorg.meta.models.ProposalAltitude` values). applied: ``True`` when the proposal was applied and did not roll back; ``False`` for rejected / rolled-back / failed. - proposed_at: When the proposal was originally generated. + proposed_at: When the proposal was generated. recorded_at: When the terminal outcome was recorded. Must be greater than or equal to ``proposed_at`` -- an outcome can only be recorded at or after the proposal was made. diff --git a/src/synthorg/meta/evolution/outcome_store_protocol.py b/src/synthorg/meta/evolution/outcome_store_protocol.py index fbea6151f9..bc09384d73 100644 --- a/src/synthorg/meta/evolution/outcome_store_protocol.py +++ b/src/synthorg/meta/evolution/outcome_store_protocol.py @@ -58,7 +58,7 @@ async def record( :class:`ProposalAltitude`). applied: Whether the adaptation was applied (``True``) or rejected / rolled back (``False``). - proposed_at: When the proposal was originally generated. + proposed_at: When the proposal was generated. """ ... diff --git a/src/synthorg/meta/mcp/handlers/common_args.py b/src/synthorg/meta/mcp/handlers/common_args.py index acd80727de..e01055d4c8 100644 --- a/src/synthorg/meta/mcp/handlers/common_args.py +++ b/src/synthorg/meta/mcp/handlers/common_args.py @@ -1,9 +1,10 @@ -"""Centralized argument-validation helpers for MCP tool handlers. +"""Centralised argument-validation helpers for MCP tool handlers. -Single source of truth for every argument extraction / coercion routine -that handlers reach for. Used to be scattered across 15 domain handlers -under ``_require_non_blank`` / ``_actor_id`` / ``_get_str`` / etc.; the -duplicates now live exactly here. 
+Single source of truth for every argument extraction / coercion +routine that handlers reach for: ``_require_non_blank``, +``_actor_id``, ``_get_str`` and their siblings live here so each +domain handler imports the same validators instead of carrying its +own near-identical copy. The module groups its helpers in three buckets: diff --git a/src/synthorg/observability/events/client.py b/src/synthorg/observability/events/client.py index e42633198d..683e299882 100644 --- a/src/synthorg/observability/events/client.py +++ b/src/synthorg/observability/events/client.py @@ -36,3 +36,16 @@ ) CLIENT_CONFIG_INVALID: Final[str] = "client.config.invalid" + +# Pool / store lookup failures emitted before the corresponding KeyError +# so operators see which CRUD operation rejected a missing identifier, +# not just a bare exception in the controller's error envelope. +CLIENT_NOT_FOUND: Final[str] = "client.pool.client_not_found" +CLIENT_REQUEST_NOT_FOUND: Final[str] = "client.request.not_found" +SIMULATION_RUN_NOT_FOUND: Final[str] = "simulation.run.not_found" + +# Persisted status transition (emitted at the caller AFTER the request +# is saved). Pure-constructor transitions inside ``with_status`` are +# covered by CLIENT_REQUEST_TRANSITION; this event records "the row in +# the store now carries the new status". 
+CLIENT_REQUEST_STATUS_TRANSITIONED: Final[str] = "client.request.status_transitioned" diff --git a/src/synthorg/observability/events/experiments.py b/src/synthorg/observability/events/experiments.py new file mode 100644 index 0000000000..9025be3cc7 --- /dev/null +++ b/src/synthorg/observability/events/experiments.py @@ -0,0 +1,13 @@ +"""Event-name constants for the A/B experiment registry.""" + +from typing import Final + +EXPERIMENT_VARIANT_REGISTERED: Final[str] = "experiments.variant.registered" +EXPERIMENT_VARIANT_DELETED: Final[str] = "experiments.variant.deleted" +EXPERIMENT_VARIANT_INVALID_WEIGHT: Final[str] = "experiments.variant.invalid_weight" +EXPERIMENT_ASSIGNMENT_COMPUTED: Final[str] = "experiments.assignment.computed" +EXPERIMENT_ASSIGNMENT_REPLAYED: Final[str] = "experiments.assignment.replayed" +# Emitted before the ``assign()`` raise so operators see which +# experiment was queried with no variants registered, rather than +# only the controller's error envelope. +EXPERIMENT_NOT_FOUND: Final[str] = "experiments.not_found" diff --git a/src/synthorg/observability/events/workers.py b/src/synthorg/observability/events/workers.py index 220809d42e..7b09aaaeee 100644 --- a/src/synthorg/observability/events/workers.py +++ b/src/synthorg/observability/events/workers.py @@ -36,6 +36,8 @@ # Task queue client WORKERS_TASK_QUEUE_CONNECT_FAILED: Final[str] = "workers.task_queue.connect_failed" +WORKERS_QUEUE_START_REJECTED: Final[str] = "workers.task_queue.start_rejected" +WORKERS_QUEUE_NOT_RUNNING: Final[str] = "workers.task_queue.not_running" WORKERS_TASK_QUEUE_UNSUBSCRIBE_FAILED: Final[str] = ( "workers.task_queue.unsubscribe_failed" ) @@ -49,6 +51,21 @@ # Main entry point WORKERS_MAIN_INVALID_WORKER_COUNT: Final[str] = "workers.main.invalid_worker_count" +WORKERS_MAIN_INVALID_EXECUTOR_CONFIG: Final[str] = ( + "workers.main.invalid_executor_config" +) WORKERS_MAIN_PLACEHOLDER_EXECUTOR_INVOKED: Final[str] = ( "workers.main.placeholder_executor_invoked" ) + +# 
HTTP-callback executor events +WORKERS_EXECUTOR_HTTP_INVOKED: Final[str] = "workers.executor.http_invoked" +WORKERS_EXECUTOR_HTTP_TERMINAL: Final[str] = "workers.executor.http_terminal" +WORKERS_EXECUTOR_HTTP_RETRY: Final[str] = "workers.executor.http_retry" +WORKERS_EXECUTOR_HTTP_FAILED: Final[str] = "workers.executor.http_failed" +WORKERS_EXECUTOR_INVALID_INIT_ARG: Final[str] = "workers.executor.invalid_init_arg" + +# Backend-side execution service events +WORKERS_EXECUTION_SERVICE_ATTEMPTED: Final[str] = "workers.execution_service.attempted" +WORKERS_EXECUTION_SERVICE_COMPLETED: Final[str] = "workers.execution_service.completed" +WORKERS_EXECUTION_SERVICE_NO_OP: Final[str] = "workers.execution_service.no_op" diff --git a/src/synthorg/observability/events/workflow.py b/src/synthorg/observability/events/workflow.py index b03606ac4c..1b60802b5f 100644 --- a/src/synthorg/observability/events/workflow.py +++ b/src/synthorg/observability/events/workflow.py @@ -42,6 +42,13 @@ SPRINT_LIFECYCLE_TRANSITION: str = "workflow.sprint.lifecycle_transition" """Sprint transitioned between lifecycle statuses.""" +SPRINT_STATUS_TRANSITIONED: str = "workflow.sprint.status_transitioned" +"""Sprint row in the store now carries the new status (emitted by the +caller AFTER persistence write). 
``with_transition`` is a pure +constructor so the SPRINT_LIFECYCLE_TRANSITION event from the state +machine covers transition *intent*; this event records the persisted +state-of-record.""" + SPRINT_LIFECYCLE_TRANSITION_INVALID: str = ( "workflow.sprint.lifecycle_transition_invalid" ) @@ -130,6 +137,14 @@ SPRINT_CEREMONY_TRIGGER_FAILED: str = "workflow.sprint.ceremony_trigger_failed" """Ceremony trigger_event call failed (swallowed).""" +SPRINT_CEREMONY_BUDGET_SNAPSHOT_FAILED: str = ( + "workflow.sprint.ceremony_budget_snapshot_failed" +) +"""Budget snapshot callable raised inside the scheduler; the +CeremonyEvalContext falls back to (0.0, 0.0) for this evaluation tick +so a transient budget-service failure cannot block ceremony +scheduling.""" + SPRINT_CEREMONY_EVAL_CONTEXT_INVALID: str = ( "workflow.sprint.ceremony_eval_context_invalid" ) diff --git a/src/synthorg/observability/prometheus_collector.py b/src/synthorg/observability/prometheus_collector.py index 4dc99a7c4c..17af2fe3fd 100644 --- a/src/synthorg/observability/prometheus_collector.py +++ b/src/synthorg/observability/prometheus_collector.py @@ -39,6 +39,9 @@ ) from synthorg.observability.prometheus_push_metrics import PushMetrics from synthorg.observability.prometheus_recording import RecordingMixin +from synthorg.observability.prometheus_recording_streams import ( + StreamRecordingMixin, +) if TYPE_CHECKING: from synthorg.api.state import AppState @@ -145,7 +148,7 @@ async def _fetch_tool_names(app_state: AppState) -> frozenset[str] | None: return None -class PrometheusCollector(RecordingMixin): +class PrometheusCollector(RecordingMixin, StreamRecordingMixin): """Collects business metrics from SynthOrg services for Prometheus. Uses a dedicated ``CollectorRegistry`` to avoid polluting the global @@ -161,7 +164,7 @@ class PrometheusCollector(RecordingMixin): prefix: Metric name prefix (default ``"synthorg"``). 
""" - def __init__(self, *, prefix: str = "synthorg") -> None: + def __init__(self, *, prefix: str = "synthorg") -> None: # noqa: PLR0915 -- single-shot wiring of every metric family self._prefix = prefix self.registry = CollectorRegistry() @@ -280,6 +283,13 @@ def __init__(self, *, prefix: str = "synthorg") -> None: self._mcp_handler_duration = self._push.mcp_handler_duration self._budget_query_duration = self._push.budget_query_duration self._audit_chain_verifications = self._push.audit_chain_verifications + self._ws_connection_lifetime = self._push.ws_connection_lifetime + self._ws_revalidation_outcomes = self._push.ws_revalidation_outcomes + self._ws_active_connections = self._push.ws_active_connections + self._pg_pool_size = self._push.pg_pool_size + self._pg_pool_active_connections = self._push.pg_pool_active_connections + self._pg_pool_acquire_duration = self._push.pg_pool_acquire_duration + self._pg_pool_exhausted = self._push.pg_pool_exhausted logger.debug(METRICS_COLLECTOR_INITIALIZED, prefix=prefix) @@ -347,8 +357,41 @@ async def refresh(self, app_state: AppState) -> None: utc_midnight, ) await self._refresh_task_metrics(app_state) + self._refresh_pg_pool_metrics(app_state) logger.debug(METRICS_SCRAPE_COMPLETED) + def _refresh_pg_pool_metrics(self, app_state: AppState) -> None: + """Push Postgres pool size / active gauges from the live pool. + + Skipped silently when the backend is not Postgres or is not + yet connected; the pool's ``get_stats`` snapshot is the + authoritative source for ``pool_size`` and ``pool_available``. 
+ """ + if not app_state.has_persistence: + return + backend = app_state.persistence + if backend.kind != "postgres": + return + pool = getattr(backend, "_pool", None) + if pool is None: + return + try: + stats = pool.get_stats() + except MemoryError, RecursionError: + raise + except Exception: + logger.warning(METRICS_SCRAPE_FAILED, component="pg_pool_stats") + return + size = stats.get("pool_size") + available = stats.get("pool_available") + if isinstance(size, int): + self.record_pg_pool_size(backend="primary", size=size) + if isinstance(size, int) and isinstance(available, int): + self.record_pg_pool_active( + backend="primary", + active=max(0, size - available), + ) + async def _rebuild_label_snapshot( self, app_state: AppState, diff --git a/src/synthorg/observability/prometheus_labels.py b/src/synthorg/observability/prometheus_labels.py index d3fd51c603..b9328e655a 100644 --- a/src/synthorg/observability/prometheus_labels.py +++ b/src/synthorg/observability/prometheus_labels.py @@ -35,6 +35,7 @@ "VALID_MCP_HANDLER_OUTCOMES", "VALID_OTLP_KINDS", "VALID_OTLP_OUTCOMES", + "VALID_PG_BACKENDS", "VALID_PROVIDER_ERROR_CLASSES", "VALID_SETTINGS_NAMESPACES", "VALID_STATUS_CLASSES", @@ -43,6 +44,8 @@ "VALID_TOOL_OUTCOMES", "VALID_VERDICTS", "VALID_WORKFLOW_EXECUTION_STATUSES", + "VALID_WS_REVALIDATION_OUTCOMES", + "VALID_WS_TRANSPORTS", "_LabelSnapshot", "_reset_label_snapshot_for_tests", "_reset_mcp_tool_names_for_tests", @@ -325,6 +328,11 @@ def status_class(status_code: int) -> str: VALID_AUDIT_VERIFICATION_OUTCOMES: Final[frozenset[str]] = frozenset( {"valid", "broken"} ) +VALID_WS_TRANSPORTS: Final[frozenset[str]] = frozenset({"websocket", "sse"}) +VALID_WS_REVALIDATION_OUTCOMES: Final[frozenset[str]] = frozenset( + {"pass", "fail", "budget_exhausted"} +) +VALID_PG_BACKENDS: Final[frozenset[str]] = frozenset({"primary", "replica"}) # -- Snapshot-backed registry-bound label validation ----------------------- diff --git a/src/synthorg/observability/prometheus_push_metrics.py 
b/src/synthorg/observability/prometheus_push_metrics.py index 794c161f8f..dd6d7d63e3 100644 --- a/src/synthorg/observability/prometheus_push_metrics.py +++ b/src/synthorg/observability/prometheus_push_metrics.py @@ -331,3 +331,57 @@ def __init__( ["outcome"], registry=registry, ) + + # -- WS lifetime / revalidation / concurrent connections ----- + # Wall-time histogram of WS connection lifetimes; alerts on + # truncated tail (clients dropping shortly after auth) and on + # silent long-lived hangs. + self.ws_connection_lifetime = Histogram( + f"{prefix}_ws_connection_lifetime_seconds", + "WebSocket connection lifetime in seconds, by transport", + ["transport"], + buckets=(1.0, 5.0, 30.0, 60.0, 300.0, 1800.0, 3600.0, 14400.0), + registry=registry, + ) + self.ws_revalidation_outcomes = PromCounter( + f"{prefix}_ws_revalidation_total", + ("Per-frame WS revalidation outcomes (pass / fail / budget_exhausted)"), + ["outcome"], + registry=registry, + ) + self.ws_active_connections = Gauge( + f"{prefix}_ws_active_connections", + "Currently-open WebSocket connections", + registry=registry, + ) + + # -- Postgres connection pool metrics ------------------------ + # ``backend`` label allows multiple Postgres pools to coexist + # (primary read-write + read-only replicas). The pool's + # ``get_stats()`` snapshot drives the gauges; the counter ticks on + # every saturation event. 
+ self.pg_pool_size = Gauge( + f"{prefix}_pg_pool_size", + "Configured Postgres connection pool size", + ["backend"], + registry=registry, + ) + self.pg_pool_active_connections = Gauge( + f"{prefix}_pg_pool_active_connections", + "Active connections currently checked out of the pool", + ["backend"], + registry=registry, + ) + self.pg_pool_acquire_duration = Histogram( + f"{prefix}_pg_pool_acquire_duration_seconds", + "Wall time spent waiting for a Postgres connection", + ["backend"], + buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0), + registry=registry, + ) + self.pg_pool_exhausted = PromCounter( + f"{prefix}_pg_pool_exhausted_total", + "Pool acquisition timed out (no connection available)", + ["backend"], + registry=registry, + ) diff --git a/src/synthorg/observability/prometheus_recording_streams.py b/src/synthorg/observability/prometheus_recording_streams.py new file mode 100644 index 0000000000..638a20b5ac --- /dev/null +++ b/src/synthorg/observability/prometheus_recording_streams.py @@ -0,0 +1,125 @@ +"""WebSocket lifetime + Postgres pool recording methods. + +Split out of :mod:`synthorg.observability.prometheus_recording` so the +main module stays under the 800-line ceiling mandated by ``CLAUDE.md``. +The :class:`StreamRecordingMixin` is composed onto +:class:`PrometheusCollector` alongside the original ``RecordingMixin``; +the attributes it touches (``self._ws_connection_lifetime``, etc.) are +created by the collector's ``__init__`` and the public API surface for +callers (``collector.record_ws_*`` / ``collector.record_pg_pool_*``) +is unchanged. 
+""" + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from prometheus_client import Counter as PromCounter + from prometheus_client import Gauge, Histogram + +from synthorg.observability.prometheus_labels import ( + VALID_PG_BACKENDS, + VALID_WS_REVALIDATION_OUTCOMES, + VALID_WS_TRANSPORTS, + require_finite, + require_label, + require_non_negative, +) + + +class StreamRecordingMixin: + """WebSocket + Postgres pool push-time recording methods.""" + + _ws_connection_lifetime: Histogram + _ws_revalidation_outcomes: PromCounter + _ws_active_connections: Gauge + _pg_pool_size: Gauge + _pg_pool_active_connections: Gauge + _pg_pool_acquire_duration: Histogram + _pg_pool_exhausted: PromCounter + + def record_ws_connection_lifetime( + self, + *, + transport: str, + duration_sec: float, + ) -> None: + """Observe a WebSocket connection's wall-clock lifetime. + + Called from the WS controller's close path; ``duration_sec`` + is the time between the auth-ok handshake and the close + notification. + """ + require_label("ws transport", transport, VALID_WS_TRANSPORTS) + require_finite("record_ws_connection_lifetime: duration_sec", duration_sec) + # Negative durations would silently poison the histogram (e.g. + # NTP-adjusted monotonic clocks should never go backwards, but + # a future Clock-seam replacement might; reject at the source). 
+ require_non_negative( + "record_ws_connection_lifetime: duration_sec", + duration_sec, + ) + self._ws_connection_lifetime.labels(transport=transport).observe( + duration_sec, + ) + + def record_ws_revalidation_outcome(self, *, outcome: str) -> None: + """Increment the per-frame revalidation outcome counter.""" + require_label( + "ws revalidation outcome", + outcome, + VALID_WS_REVALIDATION_OUTCOMES, + ) + self._ws_revalidation_outcomes.labels(outcome=outcome).inc() + + def set_ws_active_connections(self, *, count: int) -> None: + """Replace the gauge with the current connection count.""" + require_non_negative("set_ws_active_connections: count", count) + self._ws_active_connections.set(count) + + def inc_ws_active_connections(self) -> None: + """Atomically increment the active-WS gauge by one. + + ``prometheus_client.Gauge`` is internally thread-safe, so the + caller does not need to coordinate with the scrape thread. + """ + self._ws_active_connections.inc() + + def dec_ws_active_connections(self) -> None: + """Atomically decrement the active-WS gauge by one. + + ``prometheus_client.Gauge`` is internally thread-safe, so the + caller does not need to coordinate with the scrape thread. 
+ """ + self._ws_active_connections.dec() + + def record_pg_pool_size(self, *, backend: str, size: int) -> None: + """Replace the configured pool-size gauge.""" + require_label("pg backend", backend, VALID_PG_BACKENDS) + require_non_negative("record_pg_pool_size: size", size) + self._pg_pool_size.labels(backend=backend).set(size) + + def record_pg_pool_active(self, *, backend: str, active: int) -> None: + """Replace the active-connection gauge.""" + require_label("pg backend", backend, VALID_PG_BACKENDS) + require_non_negative("record_pg_pool_active: active", active) + self._pg_pool_active_connections.labels(backend=backend).set(active) + + def record_pg_pool_acquire( + self, + *, + backend: str, + duration_sec: float, + ) -> None: + """Observe a connection-acquisition latency.""" + require_label("pg backend", backend, VALID_PG_BACKENDS) + require_finite("record_pg_pool_acquire: duration_sec", duration_sec) + # Negative durations would silently poison the histogram. + require_non_negative("record_pg_pool_acquire: duration_sec", duration_sec) + self._pg_pool_acquire_duration.labels(backend=backend).observe( + duration_sec, + ) + + def record_pg_pool_exhausted(self, *, backend: str) -> None: + """Increment the pool-exhaustion counter.""" + require_label("pg backend", backend, VALID_PG_BACKENDS) + self._pg_pool_exhausted.labels(backend=backend).inc() diff --git a/src/synthorg/persistence/_shared/pagination.py b/src/synthorg/persistence/_shared/pagination.py index f1b677c7e0..241e7e164a 100644 --- a/src/synthorg/persistence/_shared/pagination.py +++ b/src/synthorg/persistence/_shared/pagination.py @@ -1,12 +1,12 @@ """Shared pagination-argument validation for SQLite + Postgres repos. Both backends accept ``limit: int`` / ``offset: int`` on their cursor -paginated read methods; the validation is the same in both places -(reject non-int, reject ``bool``, reject ``limit < 1`` / ``offset < 0``) -and was previously duplicated verbatim across two files. 
Extracting -the helper keeps the validation rule in one place; backend-specific -bits stay in the call sites (the ``event`` constant + extra context -kwargs to log). +paginated read methods. The validation rule (reject non-int, reject +``bool``, reject ``limit < 1`` / ``offset < 0``) is identical across +backends and lives here as a single helper so a future tightening +applies everywhere atomically. Backend-specific concerns (the +``event`` constant emitted on rejection and extra context kwargs to +log) stay in the call sites. """ from typing import Final diff --git a/src/synthorg/persistence/postgres/backend.py b/src/synthorg/persistence/postgres/backend.py index b6600be41e..892b7d3164 100644 --- a/src/synthorg/persistence/postgres/backend.py +++ b/src/synthorg/persistence/postgres/backend.py @@ -17,7 +17,7 @@ from collections.abc import AsyncIterator # noqa: TC003 from contextlib import asynccontextmanager from datetime import UTC, datetime -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from psycopg.rows import dict_row from pydantic import BaseModel @@ -495,6 +495,21 @@ def backend_name(self) -> NotBlankStr: """Human-readable backend identifier.""" return NotBlankStr("postgres") + @property + def kind(self) -> Literal["sqlite", "postgres"]: + """Return the backend discriminator (``"postgres"``).""" + return "postgres" + + @property + def config(self) -> PostgresConfig: + """Public read-only view of the backend's Postgres config. + + Exposed so callers needing the connection details (the + backup-handler factory) do not have to reach for the + private ``_config`` attribute. + """ + return self._config + def _require_connected[T](self, repo: T | None, name: str) -> T: """Return *repo* or raise if the backend is not connected. 
diff --git a/src/synthorg/persistence/protocol.py b/src/synthorg/persistence/protocol.py index 44a150aad9..254f8a3c91 100644 --- a/src/synthorg/persistence/protocol.py +++ b/src/synthorg/persistence/protocol.py @@ -5,7 +5,7 @@ """ from contextlib import AbstractAsyncContextManager # noqa: TC003 -from typing import Any, Protocol, runtime_checkable +from typing import Any, Literal, Protocol, runtime_checkable from synthorg.budget.config import BudgetConfig # noqa: TC001 from synthorg.core.agent import AgentIdentity # noqa: TC001 @@ -200,6 +200,18 @@ class PersistenceBackend(Protocol): custom_rules: Repository for custom signal rule persistence. """ + @property + def kind(self) -> Literal["sqlite", "postgres"]: + """Return the backend's discriminator string. + + One of ``"sqlite"`` or ``"postgres"``. Used by call sites that + need to pick a backend-specific helper (e.g. backup handler + factories) without ``isinstance`` checks. The ``Literal`` type + means mypy rejects an implementation that returns any other + string. + """ + ... + async def connect(self) -> None: """Establish connection to the storage backend. 
diff --git a/src/synthorg/persistence/sqlite/backend.py b/src/synthorg/persistence/sqlite/backend.py index 587b422d3b..5356796dff 100644 --- a/src/synthorg/persistence/sqlite/backend.py +++ b/src/synthorg/persistence/sqlite/backend.py @@ -6,7 +6,7 @@ from collections.abc import AsyncIterator # noqa: TC003 from contextlib import asynccontextmanager from datetime import UTC, datetime -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import aiosqlite from pydantic import BaseModel @@ -306,6 +306,22 @@ def _clear_state(self) -> None: # noqa: PLR0915 -- repo registry reset intentio self._oauth_states = None self._webhook_receipts = None + @property + def kind(self) -> Literal["sqlite", "postgres"]: + """Return the backend discriminator (``"sqlite"``).""" + return "sqlite" + + @property + def config(self) -> SQLiteConfig: + """Public read-only view of the backend's config. + + Exposed so callers that need backend-specific details (the + backup-handler factory walks the path; tests assert against + the resolved sqlite path) do not have to reach for the + private ``_config`` attribute. + """ + return self._config + async def connect(self) -> None: """Open the SQLite database and configure WAL mode.""" async with self._lifecycle_lock: diff --git a/src/synthorg/providers/models.py b/src/synthorg/providers/models.py index 3be960e47a..c74ef6259e 100644 --- a/src/synthorg/providers/models.py +++ b/src/synthorg/providers/models.py @@ -297,11 +297,17 @@ class CompletionConfig(BaseModel): default=(), description="Stop sequences", ) - top_p: float | None = Field( - default=None, + top_p: float = Field( + default=1.0, ge=0.0, le=1.0, - description="Nucleus sampling threshold", + description=( + "Nucleus-sampling threshold. Defaults to 1.0 (full " + "distribution, no truncation) so every completion call " + "has an explicit deterministic value without each site " + "having to repeat it. 
Override when the prompt class " + "needs a custom value alongside ``temperature``." + ), ) timeout: float | None = Field( default=None, diff --git a/src/synthorg/security/config.py b/src/synthorg/security/config.py index cc99eef2a9..1af0544704 100644 --- a/src/synthorg/security/config.py +++ b/src/synthorg/security/config.py @@ -149,6 +149,12 @@ class LlmFallbackConfig(BaseModel): argument_truncation: ArgumentTruncationStrategy = ( ArgumentTruncationStrategy.PER_VALUE ) + # Sampling parameters for the security-evaluation completion call. + # Pinned to deterministic defaults so verdicts stay reproducible + # across runs; operators tuning this MUST re-run the golden eval + # suite (``tests/prompts/golden/llm_security_evaluator``). + temperature: float = Field(default=0.0, ge=0.0, le=2.0) + top_p: float = Field(default=1.0, ge=0.0, le=1.0) class SecurityPolicyRule(BaseModel): diff --git a/src/synthorg/security/llm_evaluator.py b/src/synthorg/security/llm_evaluator.py index 2395d24c8c..88ad73be80 100644 --- a/src/synthorg/security/llm_evaluator.py +++ b/src/synthorg/security/llm_evaluator.py @@ -280,7 +280,8 @@ async def _call_llm( model, tools=[_SECURITY_VERDICT_TOOL], config=CompletionConfig( - temperature=0.0, + temperature=self._config.temperature, + top_p=self._config.top_p, max_tokens=256, ), ), diff --git a/src/synthorg/workers/__main__.py b/src/synthorg/workers/__main__.py index 28524bc07d..16ef6274d4 100644 --- a/src/synthorg/workers/__main__.py +++ b/src/synthorg/workers/__main__.py @@ -3,13 +3,22 @@ Launched from the Go CLI via ``synthorg worker start`` (see ``cli/cmd/worker_start.go``). Wires a :class:`JetStreamTaskQueue` against the current ``NatsConfig`` and runs a pool of -:class:`Worker` instances with a placeholder executor. +:class:`Worker` instances against the HTTP-callback executor +(:class:`~synthorg.workers.executor.TaskExecutionExecutor`). -The placeholder executor acks each claim as ``SUCCESS`` after -logging it. 
Wiring the real agent runtime (``agent_engine``) and -the HTTP transition callback is a follow-up; this module exists so -the ``synthorg worker start`` command has something to exec while -the task queue plumbing lands incrementally. +The executor POSTs to ``/api/v1/tasks/{task_id}/execute`` on the +backend; the backend dispatches to a pluggable +:class:`WorkerExecutionService` so the agent-runtime invocation is +configurable per deployment. The default +:class:`LifecycleAdvancingExecutionService` walks the task one +lifecycle step forward, which is sufficient for end-to-end claim +round-trip tests; production deployments override the service to +invoke the full agent engine. + +The legacy ``_placeholder_executor`` is kept as an opt-in fallback +(``--executor placeholder``) so the dispatch plumbing can still be +smoke-tested without a running backend (e.g. NATS-only conformance +runs). """ import argparse @@ -18,9 +27,12 @@ import sys from typing import Final +import httpx + from synthorg.communication.config import NatsConfig from synthorg.observability import get_logger from synthorg.observability.events.workers import ( + WORKERS_MAIN_INVALID_EXECUTOR_CONFIG, WORKERS_MAIN_INVALID_WORKER_COUNT, WORKERS_MAIN_PLACEHOLDER_EXECUTOR_INVOKED, ) @@ -29,7 +41,11 @@ from synthorg.settings.mirrors import parse_int from synthorg.workers.claim import JetStreamTaskQueue, TaskClaim, TaskClaimStatus from synthorg.workers.config import QueueConfig -from synthorg.workers.worker import run_worker_pool +from synthorg.workers.executor import ( + DEFAULT_HTTP_TIMEOUT_SECONDS, + TaskExecutionExecutor, +) +from synthorg.workers.worker import TaskExecutor, run_worker_pool logger = get_logger(__name__) @@ -37,9 +53,10 @@ async def _placeholder_executor(claim: TaskClaim) -> TaskClaimStatus: """Acknowledge the claim without executing any task logic. 
- Real agent runtime integration lands in a follow-up; this - placeholder exists so operators can smoke-test the dispatch - path end-to-end (engine -> NATS -> worker -> ack). + Smoke-test fallback for the dispatch path; only used when the + operator explicitly passes ``--executor placeholder``. A missing + backend URL is handled by ``_resolve_executor()`` exiting with + ``SystemExit`` rather than silently falling back to this executor. """ logger.info( WORKERS_MAIN_PLACEHOLDER_EXECUTOR_INVOKED, @@ -81,6 +98,33 @@ def _build_parser() -> argparse.ArgumentParser: default=os.environ.get("SYNTHORG_NATS_STREAM_PREFIX", "SYNTHORG"), help="JetStream stream name prefix (default: SYNTHORG).", ) + parser.add_argument( + "--api-base-url", + default=os.environ.get("SYNTHORG_API_BASE_URL"), + help=( + "Backend base URL for the HTTP-callback executor " + "(default: env SYNTHORG_API_BASE_URL)." + ), + ) + parser.add_argument( + "--auth-token", + default=os.environ.get("SYNTHORG_WORKER_AUTH_TOKEN"), + help=( + "Bearer token sent to the backend execute endpoint " + "(default: env SYNTHORG_WORKER_AUTH_TOKEN)." + ), + ) + parser.add_argument( + "--executor", + default="http", + choices=("http", "placeholder"), + help=( + "Executor implementation. ``http`` (default) calls the " + "backend's /tasks/{id}/execute endpoint; ``placeholder`` " + "acks every claim without any backend interaction " + "(NATS-only smoke tests)." + ), + ) return parser @@ -126,23 +170,82 @@ async def _async_main(argv: list[str]) -> int: stream_name_prefix=args.stream_prefix, ) + # Resolve the executor BEFORE starting the queue so a missing-config + # SystemExit from ``_resolve_executor`` does not leak a started + # JetStream consumer (the queue's stop() in the finally below would + # never run because the SystemExit unwinds straight past the try). 
+ executor, http_client = _resolve_executor(args) task_queue = JetStreamTaskQueue( queue_config=queue_config, nats_config=nats_config, ) - await task_queue.start() + # ``queue_started`` gates the ``task_queue.stop()`` call so a + # ``start()`` failure does not call ``stop()`` on a queue that + # never bound the consumer, and a flag-flip lets the owned + # ``http_client`` close cleanly regardless of which stage failed. + queue_started = False try: + await task_queue.start() + queue_started = True await run_worker_pool( queue_config=queue_config, task_queue=task_queue, - executor=_placeholder_executor, + executor=executor, worker_count=args.workers, ) finally: - await task_queue.stop() + if http_client is not None: + await http_client.aclose() + if queue_started: + await task_queue.stop() return 0 +def _resolve_executor( + args: argparse.Namespace, +) -> tuple[TaskExecutor, httpx.AsyncClient | None]: + """Build the configured executor and (if HTTP) the owned client. + + The HTTP executor owns an :class:`httpx.AsyncClient` for the + lifetime of the worker pool so connection pooling persists + across claims. The caller closes the client in the ``finally`` + block to drain in-flight requests at shutdown. 
+ """ + if args.executor == "placeholder": + return _placeholder_executor, None + if not args.api_base_url: + msg = "--executor http requires --api-base-url (or SYNTHORG_API_BASE_URL)" + logger.error( + WORKERS_MAIN_INVALID_EXECUTOR_CONFIG, + executor=args.executor, + missing_flag="--api-base-url", + missing_env="SYNTHORG_API_BASE_URL", + error=msg, + ) + raise SystemExit(msg) + if not args.auth_token: + msg = "--executor http requires --auth-token (or SYNTHORG_WORKER_AUTH_TOKEN)" + logger.error( + WORKERS_MAIN_INVALID_EXECUTOR_CONFIG, + executor=args.executor, + missing_flag="--auth-token", + missing_env="SYNTHORG_WORKER_AUTH_TOKEN", + error=msg, + ) + raise SystemExit(msg) + # Match the executor's per-request timeout at the client level so + # the connection-pool defaults line up with the other AsyncClient + # call sites in the codebase (every other site passes ``timeout=`` + # explicitly rather than relying on the 5 s httpx default). + http_client = httpx.AsyncClient(timeout=DEFAULT_HTTP_TIMEOUT_SECONDS) + executor = TaskExecutionExecutor( + api_base_url=args.api_base_url, + auth_token=args.auth_token, + http_client=http_client, + ) + return executor, http_client + + def main(argv: list[str] | None = None) -> int: """Synchronous entry point that delegates to the asyncio runner.""" effective = sys.argv[1:] if argv is None else argv diff --git a/src/synthorg/workers/claim.py b/src/synthorg/workers/claim.py index d3e2d8f2bb..dd7ac4dc8e 100644 --- a/src/synthorg/workers/claim.py +++ b/src/synthorg/workers/claim.py @@ -24,6 +24,8 @@ from synthorg.core.types import NotBlankStr # noqa: TC001 from synthorg.observability import get_logger, safe_error_description from synthorg.observability.events.workers import ( + WORKERS_QUEUE_NOT_RUNNING, + WORKERS_QUEUE_START_REJECTED, WORKERS_TASK_QUEUE_ACK_MALFORMED_FAILED, WORKERS_TASK_QUEUE_CLAIM_PARSE_FAILED, WORKERS_TASK_QUEUE_CONNECT_FAILED, @@ -156,6 +158,10 @@ async def start(self) -> None: client is drained before the 
exception propagates. """ if self._running: + logger.warning( + WORKERS_QUEUE_START_REJECTED, + reason="already_running", + ) msg = "JetStreamTaskQueue.start() called while already running" raise RuntimeError(msg) try: @@ -346,6 +352,11 @@ async def publish_claim(self, claim: TaskClaim) -> None: claim: The task claim to enqueue. """ if self._js is None: + logger.warning( + WORKERS_QUEUE_NOT_RUNNING, + operation="publish_claim", + task_id=claim.task_id, + ) msg = "Task queue is not running" raise BusStreamError(msg) subject = f"{self._queue_config.ready_subject_prefix}.{claim.task_id}" @@ -365,6 +376,10 @@ async def next_claim( from nats.errors import TimeoutError as NatsTimeoutError # noqa: PLC0415 if self._sub is None: + logger.warning( + WORKERS_QUEUE_NOT_RUNNING, + operation="next_claim", + ) msg = "Task queue is not running" raise BusStreamError(msg) try: diff --git a/src/synthorg/workers/execution_service.py b/src/synthorg/workers/execution_service.py new file mode 100644 index 0000000000..22517d291c --- /dev/null +++ b/src/synthorg/workers/execution_service.py @@ -0,0 +1,175 @@ +"""Backend-side service called by the worker-callable execute endpoint. + +When the worker pool fetches a JetStream claim, it posts to +``POST /api/v1/tasks/{task_id}/execute``. The controller delegates to +:class:`WorkerExecutionService.execute_once` so the agent-runtime +invocation is configurable per deployment: a baseline implementation +walks the task through its lifecycle via the existing :class:`TaskEngine`, +while production deployments override the service to invoke the full +:class:`~synthorg.engine.agent_engine.AgentEngine`. + +The service is intentionally a thin protocol-driven seam: the +controller does not care which implementation is wired, only that +``execute_once`` returns the post-execution :class:`Task`. This keeps +the API contract stable while the agent-runtime invocation evolves +across deployments. 
+""" + +from typing import TYPE_CHECKING, Final, Protocol + +from synthorg.core.domain_errors import NotFoundError +from synthorg.core.enums import TaskStatus +from synthorg.core.task import ( + Task, # noqa: TC001 -- runtime Protocol/return-type annotation +) +from synthorg.observability import get_logger +from synthorg.observability.events.workers import ( + WORKERS_EXECUTION_SERVICE_ATTEMPTED, + WORKERS_EXECUTION_SERVICE_COMPLETED, + WORKERS_EXECUTION_SERVICE_NO_OP, +) + +if TYPE_CHECKING: + from synthorg.engine.task_engine import TaskEngine + +logger = get_logger(__name__) + +_EXECUTABLE_STATUSES: Final[frozenset[TaskStatus]] = frozenset( + { + TaskStatus.ASSIGNED, + TaskStatus.IN_PROGRESS, + } +) + + +class WorkerExecutionService(Protocol): + """Contract for the worker-callable execution surface. + + Deployments override this protocol to plug a specific + agent-runtime invocation. The default implementation + (:class:`LifecycleAdvancingExecutionService`) walks the task + forward through the lifecycle without invoking an LLM, which is + sufficient for smoke tests and for the dispatcher / queue / + worker / API integration tests that pin the full claim + round-trip. + """ + + async def execute_once( + self, + *, + task_id: str, + previous_status: str | None, + new_status: str, + idempotency_key: str, + requested_by: str, + ) -> Task: + """Execute one step of the task and return the post-step state. + + Implementations MUST persist the resulting status through the + ``TaskEngine`` so the single-writer invariant holds, and + return the typed ``Task`` for the controller to envelope. + """ + ... + + +class LifecycleAdvancingExecutionService: + """Default :class:`WorkerExecutionService` implementation. + + Advances the task one transition forward when it is in an + advanceable state (``ASSIGNED``, ``IN_PROGRESS`` or ``IN_REVIEW``); returns the + current state unchanged otherwise. 
This is the baseline contract + the dispatcher + queue + worker tests pin: a claim arrives, + the service rolls the lifecycle forward, the worker sees a + terminal-or-not response and acks accordingly. + + Production deployments replace this implementation with one that + invokes the :class:`~synthorg.engine.agent_engine.AgentEngine` + against the task body. The agent-engine implementation lives + outside this baseline because it carries the full agent-runtime + dependency chain (LLM provider, tool registry, memory backend), + none of which belong in the dispatch path itself. + """ + + __slots__ = ("_task_engine",) + + def __init__(self, *, task_engine: TaskEngine) -> None: + self._task_engine = task_engine + + async def execute_once( + self, + *, + task_id: str, + previous_status: str | None, + new_status: str, + idempotency_key: str, + requested_by: str, + ) -> Task: + """Walk the task one step forward through the lifecycle. + + ``ASSIGNED`` -> ``IN_PROGRESS`` -> ``IN_REVIEW`` -> ``COMPLETED`` + is the canonical happy path. Tasks in any other status are + returned unchanged; the worker maps that into a retry so the + next dispatch picks up the new state. 
+ """ + task = await self._task_engine.get_task(task_id) + if task is None: + logger.warning( + WORKERS_EXECUTION_SERVICE_NO_OP, + task_id=task_id, + reason="task_not_found", + previous_status=previous_status, + new_status=new_status, + idempotency_key=idempotency_key, + ) + msg = f"Task {task_id!r} not found" + raise NotFoundError(msg) + current_status = task.status + logger.info( + WORKERS_EXECUTION_SERVICE_ATTEMPTED, + task_id=task_id, + current_status=current_status.value, + previous_status=previous_status, + new_status=new_status, + idempotency_key=idempotency_key, + ) + target = self._next_status(current_status) + if target is None: + logger.info( + WORKERS_EXECUTION_SERVICE_NO_OP, + task_id=task_id, + current_status=current_status.value, + reason="not_in_executable_status", + ) + return task + advanced, _ = await self._task_engine.transition_task( + task_id, + target, + requested_by=requested_by, + reason=f"worker execution step from {current_status.value}", + ) + logger.info( + WORKERS_EXECUTION_SERVICE_COMPLETED, + task_id=task_id, + from_status=task.status.value, + to_status=advanced.status.value, + ) + return advanced + + @staticmethod + def _next_status(current: TaskStatus) -> TaskStatus | None: + """Return the next baseline transition target for the lifecycle. + + Returns ``None`` for statuses outside the executable window; + the worker maps that into a retry so any subsequent dispatch + picks up the new state. + """ + if current == TaskStatus.ASSIGNED: + return TaskStatus.IN_PROGRESS + if current == TaskStatus.IN_PROGRESS: + return TaskStatus.IN_REVIEW + if current == TaskStatus.IN_REVIEW: + return TaskStatus.COMPLETED + if current in _EXECUTABLE_STATUSES: + # Defensive: unreachable today but documented contract. 
+ return None + return None diff --git a/src/synthorg/workers/executor.py b/src/synthorg/workers/executor.py new file mode 100644 index 0000000000..c89c18075b --- /dev/null +++ b/src/synthorg/workers/executor.py @@ -0,0 +1,290 @@ +"""HTTP-callback task executor for the distributed worker pool. + +The worker pool fetches :class:`TaskClaim` envelopes from JetStream; +this module turns each claim into a real task execution via the +backend's HTTP API. Workers never write to persistence directly +(single-writer invariant lives on the ``TaskEngine`` in the backend), +so the executor's only side effect is a typed HTTP call to +``POST /api/v1/tasks/{task_id}/execute``. + +Outcome mapping: + +- HTTP 2xx with a terminal task status (``completed`` / ``cancelled`` + / ``failed``) returns ``TaskClaimStatus.SUCCESS`` so JetStream acks. +- HTTP 2xx with a non-terminal status (the executor decided the task + is not yet runnable) returns ``TaskClaimStatus.RETRY`` so JetStream + nacks for a backoff. +- HTTP 409 (``ConflictError`` envelope) returns + ``TaskClaimStatus.FAILED`` because the task moved out of an + executable status between dispatch and execution; redelivery would + not help. The terminal ack means the claim is removed. +- HTTP 404 returns ``TaskClaimStatus.FAILED`` for the same reason. +- HTTP 5xx, ``httpx.TransportError``, and ``httpx.TimeoutException`` + return ``TaskClaimStatus.RETRY`` so JetStream redelivers within the + ``max_deliver`` budget. + +Auth: the executor uses a Bearer token whose source is documented in +``docs/guides/workers-and-background-tasks.md``. The token is read +once at construction. 
+""" + +from typing import TYPE_CHECKING, Any, Final +from urllib.parse import quote + +import httpx + +from synthorg.observability import get_logger, safe_error_description +from synthorg.observability.events.workers import ( + WORKERS_EXECUTOR_HTTP_FAILED, + WORKERS_EXECUTOR_HTTP_INVOKED, + WORKERS_EXECUTOR_HTTP_RETRY, + WORKERS_EXECUTOR_HTTP_TERMINAL, + WORKERS_EXECUTOR_INVALID_INIT_ARG, +) +from synthorg.workers.claim import TaskClaim, TaskClaimStatus + +if TYPE_CHECKING: + from synthorg.core.clock import Clock + +logger = get_logger(__name__) + +DEFAULT_HTTP_TIMEOUT_SECONDS: Final[float] = 60.0 +"""Per-request wall-clock timeout for ``POST /tasks/{id}/execute``. + +Long enough to cover a single agent step (typical LLM call plus tool +invocation budget) without holding the JetStream ack-wait window +indefinitely. The worker's own ``ack_wait`` is the outer bound; this +inner timeout exists so a server hang surfaces as a retryable +transport error rather than a silent JetStream redelivery. Exposed +publicly so the worker entry point can mirror it on the shared +``httpx.AsyncClient`` baseline.""" + +_TERMINAL_STATUSES: Final[frozenset[str]] = frozenset( + {"completed", "cancelled", "failed"} +) + + +class TaskExecutionExecutor: + """HTTP-callback executor for :class:`Worker`. + + Args: + api_base_url: Base URL of the SynthOrg backend (e.g. + ``http://backend:3001``). Trailing slashes are stripped at + construction so the executor's path concatenation cannot + produce ``//``. + auth_token: Bearer token used for every execute call. The + backend's auth middleware validates it via the typed JWT + boundary (``parse_typed("jwt", ...)``). + http_client: Pre-constructed ``httpx.AsyncClient``. Injectable + so tests can mount ``MockTransport`` without spinning a + real backend. + timeout_seconds: Per-request wall-clock budget. Defaults to + :data:`DEFAULT_HTTP_TIMEOUT_SECONDS`. 
+ clock: Optional clock seam for the diagnostic log timestamps + (not part of the HTTP envelope). Unused today; reserved so + future retry/backoff logic stays test-deterministic. + """ + + __slots__ = ( + "_auth_token", + "_base_url", + "_clock", + "_http", + "_timeout_seconds", + ) + + def __init__( + self, + *, + api_base_url: str, + auth_token: str, + http_client: httpx.AsyncClient, + timeout_seconds: float = DEFAULT_HTTP_TIMEOUT_SECONDS, + clock: Clock | None = None, + ) -> None: + if not api_base_url: + msg = "api_base_url must be non-empty" + logger.warning( + WORKERS_EXECUTOR_INVALID_INIT_ARG, + param="api_base_url", + error=msg, + ) + raise ValueError(msg) + if not auth_token: + msg = "auth_token must be non-empty" + # Do NOT log the token value -- only the parameter name and + # the canonical error message. The token is a Bearer + # credential and would otherwise flow into structured logs. + logger.warning( + WORKERS_EXECUTOR_INVALID_INIT_ARG, + param="auth_token", + error=msg, + ) + raise ValueError(msg) + if timeout_seconds <= 0.0: + msg = "timeout_seconds must be positive" + logger.warning( + WORKERS_EXECUTOR_INVALID_INIT_ARG, + param="timeout_seconds", + value=timeout_seconds, + error=msg, + ) + raise ValueError(msg) + self._base_url = api_base_url.rstrip("/") + self._auth_token = auth_token + self._http = http_client + self._timeout_seconds = timeout_seconds + self._clock = clock + + async def __call__(self, claim: TaskClaim) -> TaskClaimStatus: + """Execute the claim by calling the backend execute endpoint.""" + # URL-encode the task_id segment so reserved characters in the + # claim identifier cannot produce a malformed path. ``safe=""`` + # forces slashes inside the id to be escaped too. 
+ encoded_task_id = quote(str(claim.task_id), safe="") + url = f"{self._base_url}/api/v1/tasks/{encoded_task_id}/execute" + logger.info( + WORKERS_EXECUTOR_HTTP_INVOKED, + task_id=claim.task_id, + new_status=claim.new_status, + url=url, + ) + try: + response = await self._http.post( + url, + headers={ + "Authorization": f"Bearer {self._auth_token}", + "Accept": "application/json", + "Content-Type": "application/json", + }, + json={ + "previous_status": claim.previous_status, + "new_status": claim.new_status, + "idempotency_key": claim.idempotency_key, + }, + timeout=self._timeout_seconds, + ) + except httpx.TimeoutException as exc: + logger.warning( + WORKERS_EXECUTOR_HTTP_RETRY, + task_id=claim.task_id, + reason="timeout", + error_type=type(exc).__name__, + error=safe_error_description(exc), + ) + return TaskClaimStatus.RETRY + except httpx.TransportError as exc: + logger.warning( + WORKERS_EXECUTOR_HTTP_RETRY, + task_id=claim.task_id, + reason="transport_error", + error_type=type(exc).__name__, + error=safe_error_description(exc), + ) + return TaskClaimStatus.RETRY + + return self._map_response(claim, response) + + def _map_response( + self, + claim: TaskClaim, + response: httpx.Response, + ) -> TaskClaimStatus: + """Translate the HTTP response into a :class:`TaskClaimStatus`. + + The mapping is deterministic and documented in the module + docstring so operators reading worker logs can predict + ack/nack behaviour from the status alone. 
+ """ + if response.status_code == httpx.codes.NOT_FOUND: + logger.warning( + WORKERS_EXECUTOR_HTTP_FAILED, + task_id=claim.task_id, + status_code=response.status_code, + outcome="task_not_found", + ) + return TaskClaimStatus.FAILED + if response.status_code == httpx.codes.CONFLICT: + logger.warning( + WORKERS_EXECUTOR_HTTP_FAILED, + task_id=claim.task_id, + status_code=response.status_code, + outcome="task_status_conflict", + ) + return TaskClaimStatus.FAILED + if response.status_code >= httpx.codes.INTERNAL_SERVER_ERROR: + logger.warning( + WORKERS_EXECUTOR_HTTP_RETRY, + task_id=claim.task_id, + status_code=response.status_code, + reason="server_error", + ) + return TaskClaimStatus.RETRY + if not response.is_success: + logger.warning( + WORKERS_EXECUTOR_HTTP_FAILED, + task_id=claim.task_id, + status_code=response.status_code, + outcome="non_retryable_4xx", + ) + return TaskClaimStatus.FAILED + + payload = self._safe_json(response) + terminal_status = self._extract_terminal_status(payload) + if terminal_status is not None: + logger.info( + WORKERS_EXECUTOR_HTTP_TERMINAL, + task_id=claim.task_id, + terminal_status=terminal_status, + ) + # All terminal task statuses (``completed`` / ``cancelled`` + # / ``failed``) map to SUCCESS so the JetStream claim is + # acked. A task that finished in ``failed`` status is still + # a successful execution from the worker's perspective; the + # business-logic failure is recorded on the task itself, + # not on the claim, so redelivery would not help. + return TaskClaimStatus.SUCCESS + # 2xx but no terminal status: the backend acknowledged the + # request and may have advanced the task to an intermediate + # state. Treat as retry so the next claim re-runs from the new + # state, rather than acking now and losing the dispatch signal. 
+ logger.info( + WORKERS_EXECUTOR_HTTP_RETRY, + task_id=claim.task_id, + reason="non_terminal_status", + ) + return TaskClaimStatus.RETRY + + @staticmethod + def _safe_json(response: httpx.Response) -> dict[str, Any]: + """Return the JSON body, or an empty dict if it cannot be parsed. + + The execute endpoint always returns a typed envelope on + success, but defensive parsing avoids crashing the worker on a + misconfigured proxy that strips the body. + """ + try: + body = response.json() + except ValueError, httpx.DecodingError: + return {} + if isinstance(body, dict): + return body + return {} + + @staticmethod + def _extract_terminal_status(payload: dict[str, Any]) -> str | None: + """Return the task's terminal status if the envelope reports one. + + The envelope shape is ``{"data": {"status": "", ...}}`` + per ``ApiResponse[Task]``. The helper is liberal about the + nested ``data`` key so a wrapper change does not break worker + outcome mapping silently. + """ + candidate = payload.get("data") if isinstance(payload, dict) else None + if isinstance(candidate, dict): + value = candidate.get("status") + else: + value = payload.get("status") if isinstance(payload, dict) else None + if isinstance(value, str) and value in _TERMINAL_STATUSES: + return value + return None diff --git a/tests/conformance/persistence/conftest.py b/tests/conformance/persistence/conftest.py index 3bb9cd6066..5a2c46cb53 100644 --- a/tests/conformance/persistence/conftest.py +++ b/tests/conformance/persistence/conftest.py @@ -193,6 +193,17 @@ def _acquire_shared_postgres(state_file: Path) -> dict[str, Any]: reason = f"Could not start Postgres test container: {type(exc).__name__}: {exc}" state_file.write_text(json.dumps({"skip_reason": reason})) pytest.skip(reason) + # ``container.start()`` blocks on testcontainers' internal readiness + # probe (``pg_isready`` via the wait strategy), but on Docker Desktop + # the vpnkit / gvisor port-proxy occasionally takes another ~1-2s + # before the 
published port routes cleanly. Probe once from the host + # side so the first peer worker that arrives sees an accepting + # connection rather than racing the proxy. + _wait_for_postgres_accept( + host=container.get_container_host_ip(), + port=int(container.get_exposed_port(5432)), + timeout_seconds=15.0, + ) data = { "container_id": container.get_wrapped_container().id, "host": container.get_container_host_ip(), @@ -206,6 +217,33 @@ def _acquire_shared_postgres(state_file: Path) -> dict[str, Any]: return data +def _wait_for_postgres_accept( + *, + host: str, + port: int, + timeout_seconds: float, +) -> None: + """Poll a TCP connect against ``host:port`` until it accepts. + + Bounds total wait by ``timeout_seconds``; on expiry returns + without raising so the existing testcontainers wait strategy + handles the error path (``container.start()`` would already have + raised if postgres itself never came up). The poll is a thin + belt-and-braces guard against the Docker Desktop port-proxy + accept gap; production deployments never hit it. + """ + import socket + import time + + deadline = time.monotonic() + timeout_seconds + while time.monotonic() < deadline: + try: + with socket.create_connection((host, port), timeout=1.0): + return + except OSError: + time.sleep(0.2) + + def _release_shared_postgres(state_file: Path) -> None: """Decrement the refcount and tear down when this worker is the last. 
diff --git a/tests/conformance/persistence/test_backup_round_trip.py b/tests/conformance/persistence/test_backup_round_trip.py index 7e41d47925..3f002d6c2e 100644 --- a/tests/conformance/persistence/test_backup_round_trip.py +++ b/tests/conformance/persistence/test_backup_round_trip.py @@ -12,6 +12,7 @@ import shutil from pathlib import Path +from typing import assert_never, cast import pytest @@ -28,6 +29,28 @@ pytestmark = pytest.mark.integration +def _build_handler( + backend: PersistenceBackend, +) -> SQLitePersistenceComponentHandler | PostgresPersistenceComponentHandler: + """Pick a backup handler for ``backend`` via its ``kind`` discriminator. + + Uses the public ``backend.kind`` plus ``backend.config`` accessors + so the conformance test does not depend on either the backend's + concrete class or its private ``_config`` attribute. + """ + if backend.kind == "sqlite": + sqlite_backend = cast(SQLitePersistenceBackend, backend) + return SQLitePersistenceComponentHandler( + db_path=Path(sqlite_backend.config.path), + ) + if backend.kind == "postgres": + if shutil.which("pg_dump") is None or shutil.which("pg_restore") is None: + pytest.skip("pg_dump / pg_restore binaries are not available on PATH") + postgres_backend = cast(PostgresPersistenceBackend, backend) + return PostgresPersistenceComponentHandler(config=postgres_backend.config) + assert_never(backend.kind) + + async def test_backup_handler_round_trip( backend: PersistenceBackend, tmp_path: Path, @@ -36,21 +59,7 @@ async def test_backup_handler_round_trip( target_dir = tmp_path / "backup" target_dir.mkdir() - if isinstance(backend, SQLitePersistenceBackend): - db_path = Path(backend._config.path) - handler: ( - SQLitePersistenceComponentHandler | PostgresPersistenceComponentHandler - ) = SQLitePersistenceComponentHandler(db_path=db_path) - elif isinstance(backend, PostgresPersistenceBackend): - if shutil.which("pg_dump") is None or shutil.which("pg_restore") is None: - pytest.skip("pg_dump / pg_restore 
binaries are not available on PATH") - handler = PostgresPersistenceComponentHandler( - config=backend._config, - ) - else: # pragma: no cover - defensive - msg = f"Unknown backend type: {type(backend).__name__}" - raise TypeError(msg) - + handler = _build_handler(backend) size = await handler.backup(target_dir) assert size > 0 assert await handler.validate_source(target_dir) is True diff --git a/tests/unit/api/rate_limits/test_in_memory_inflight_lifecycle.py b/tests/unit/api/rate_limits/test_in_memory_inflight_lifecycle.py new file mode 100644 index 0000000000..5a41f503e6 --- /dev/null +++ b/tests/unit/api/rate_limits/test_in_memory_inflight_lifecycle.py @@ -0,0 +1,32 @@ +"""Lifecycle teardown coverage for :class:`InMemoryInflightStore`. + +Defensive coverage for the rate-limit inflight tracker: every state +collection drops on ``close()`` and double-close is idempotent. A +worker pool that resets the limiter between scenarios must not leave +counters or locks behind. +""" + +import pytest + +from synthorg.api.rate_limits.in_memory_inflight import InMemoryInflightStore + +pytestmark = pytest.mark.unit + + +async def test_close_clears_all_collections() -> None: + store = InMemoryInflightStore() + async with store.acquire(key="agent-1", max_inflight=2): + pass + assert "agent-1" in store._counters + await store.close() + assert not store._counters + assert not store._locks + assert not store._lock_refs + + +async def test_close_is_idempotent() -> None: + store = InMemoryInflightStore() + await store.close() + # Second close must not raise. 
+ await store.close() + assert not store._counters diff --git a/tests/unit/client/test_pool_logging.py b/tests/unit/client/test_pool_logging.py new file mode 100644 index 0000000000..17b06cf7e4 --- /dev/null +++ b/tests/unit/client/test_pool_logging.py @@ -0,0 +1,87 @@ +"""Verify ``ClientPool`` raises log the rejection event before bubbling.""" + +from typing import TYPE_CHECKING + +import pytest +import structlog + +from synthorg.client.models import ( + ClientFeedback, + ClientProfile, + TaskRequirement, +) +from synthorg.client.pool import ClientPool +from synthorg.observability.events.client import CLIENT_NOT_FOUND + +if TYPE_CHECKING: + from synthorg.client.models import GenerationContext, ReviewContext + + +class _StubClient: + def __init__(self, profile: ClientProfile) -> None: + self.profile = profile + + async def submit_requirement( + self, + context: GenerationContext, + ) -> TaskRequirement | None: + del context + return None + + async def review_deliverable( + self, + context: ReviewContext, + ) -> ClientFeedback: + del context + return ClientFeedback( + task_id="stub", + client_id=self.profile.client_id, + accepted=True, + ) + + +def _profile(client_id: str = "c-1") -> ClientProfile: + return ClientProfile( + client_id=client_id, + name=f"Client {client_id}", + persona="test", + expertise_domains=(), + strictness_level=0.5, + ) + + +@pytest.mark.unit +class TestPoolMissingClientLogs: + """Every missing-client raise emits CLIENT_NOT_FOUND with the operation.""" + + @pytest.mark.parametrize( + ("operation", "call"), + [ + ("remove", lambda pool: pool.remove("missing")), + ("deactivate", lambda pool: pool.deactivate("missing")), + ("reactivate", lambda pool: pool.reactivate("missing")), + ("is_active", lambda pool: pool.is_active("missing")), + ("get_profile", lambda pool: pool.get_profile("missing")), + ], + ) + async def test_logs_before_raise( + self, + operation: str, + call: object, + ) -> None: + pool = ClientPool() + with 
structlog.testing.capture_logs() as cap, pytest.raises(KeyError): + await call(pool) # type: ignore[operator] + events = [e for e in cap if e["event"] == CLIENT_NOT_FOUND] + assert len(events) == 1 + assert events[0]["log_level"] == "warning" + assert events[0]["client_id"] == "missing" + assert events[0]["operation"] == operation + + async def test_present_client_does_not_log(self) -> None: + pool = ClientPool() + profile = _profile("c-1") + await pool.add(profile=profile, client=_StubClient(profile)) + with structlog.testing.capture_logs() as cap: + await pool.get_profile("c-1") + assert not [e for e in cap if e["event"] == CLIENT_NOT_FOUND] diff --git a/tests/unit/client/test_store_logging.py b/tests/unit/client/test_store_logging.py new file mode 100644 index 0000000000..c677734b8c --- /dev/null +++ b/tests/unit/client/test_store_logging.py @@ -0,0 +1,20 @@ +"""Verify ``RequestStore.get`` logs CLIENT_REQUEST_NOT_FOUND before raise.""" + +import pytest +import structlog + +from synthorg.client.store import RequestStore +from synthorg.observability.events.client import CLIENT_REQUEST_NOT_FOUND + + +@pytest.mark.unit +class TestRequestStoreMissingLogs: + async def test_get_missing_logs_before_raise(self) -> None: + store = RequestStore() + with structlog.testing.capture_logs() as cap, pytest.raises(KeyError): + await store.get("missing-id") + events = [e for e in cap if e["event"] == CLIENT_REQUEST_NOT_FOUND] + assert len(events) == 1 + assert events[0]["log_level"] == "warning" + assert events[0]["request_id"] == "missing-id" + assert events[0]["operation"] == "get" diff --git a/tests/unit/engine/test_approval_gate.py b/tests/unit/engine/test_approval_gate.py index 8f771eb0a5..a0afc5af0a 100644 --- a/tests/unit/engine/test_approval_gate.py +++ b/tests/unit/engine/test_approval_gate.py @@ -1,12 +1,15 @@ """Tests for ApprovalGate service.""" +from datetime import UTC, datetime from unittest.mock import AsyncMock, MagicMock import pytest +from synthorg.core.types 
import NotBlankStr from synthorg.engine.approval_gate import ApprovalGate from synthorg.persistence.parked_context_protocol import ParkedContextRepository from synthorg.security.timeout.park_service import ParkService +from synthorg.security.timeout.parked_context import ParkedContext from tests.unit.engine.approval_helpers import make_escalation as _make_escalation pytestmark = pytest.mark.unit @@ -16,17 +19,29 @@ def park_service() -> MagicMock: """ParkService mock with a default parked context return value.""" svc = MagicMock(spec=ParkService) - parked = MagicMock() - parked.id = "parked-1" - parked.approval_id = "approval-1" + # Construct a real ``ParkedContext`` rather than a mock at the + # typed boundary: ``ParkedContext`` is a Pydantic model whose + # fields are not class-level attributes, so ``create_autospec`` (the + # spec_set=True path ``mock_of[T]`` uses) cannot see them and would + # refuse the ``id`` / ``approval_id`` overrides. A real instance + # carries the same fields the test needs without any mocking + # plumbing. 
+ parked = ParkedContext( + id=NotBlankStr("parked-1"), + execution_id=NotBlankStr("exec-1"), + agent_id=NotBlankStr("agent-1"), + approval_id=NotBlankStr("approval-1"), + parked_at=datetime(2026, 5, 15, tzinfo=UTC), + context_json="{}", + ) svc.park.return_value = parked return svc @pytest.fixture -def parked_mock(park_service: MagicMock) -> MagicMock: +def parked_mock(park_service: MagicMock) -> ParkedContext: """The default parked context returned by park_service.park().""" - result: MagicMock = park_service.park.return_value + result: ParkedContext = park_service.park.return_value return result diff --git a/tests/unit/engine/test_task_engine_integration.py b/tests/unit/engine/test_task_engine_integration.py index b932376e3f..ef9919f2ef 100644 --- a/tests/unit/engine/test_task_engine_integration.py +++ b/tests/unit/engine/test_task_engine_integration.py @@ -318,21 +318,18 @@ async def slow_save(task: object) -> None: eng._queue.put_nowait(envelope) # The processing loop is genuinely stuck on ``block.wait()``, so - # ``_drain_processing`` will reliably hit its inner ``wait_for`` + # the processing-drain stage will hit its inner ``wait_for`` # timeout, cancel the processing task, and fail queued futures # before returning. ``stop()`` then returns cleanly. # - # Budget choice: ``stop(timeout=T)`` adds an outer hard-deadline - # at ``2 * T`` (see ``TaskEngine.stop``). If cleanup overruns - # that hard deadline, ``TimeoutError`` propagates to the caller - # and the engine is marked unrestartable. The original 50 ms - # budget left only ~50 ms for cancellation + future-cleanup, - # which intermittently raced under xdist load on CI runners - # and surfaced as ``TimeoutError`` escaping ``stop()``. 500 ms - # gives a 500 ms cleanup margin -- comfortably above the few - # milliseconds the cleanup actually needs even under heavy - # contention -- without lengthening the test meaningfully (the - # inner drain still fires after 500 ms, not 1 s). 
+ # Budget choice: ``stop(timeout=T)`` splits ``T`` evenly between + # the processing-drain and observer-drain stages (see + # ``_drain_all``) so each stage has ``T/2`` whether or not the + # other stage runs long. With ``T=0.5`` each stage gets 250 ms, + # which is well above the few ms cancellation/cleanup actually + # needs even under xdist contention. The outer hard deadline of + # ``2*T`` therefore never fires on a healthy drain, and the test + # is deterministic regardless of CI load. await eng.stop(timeout=0.5) # The queued envelope (not yet processed) must be failed diff --git a/tests/unit/experiments/__init__.py b/tests/unit/experiments/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/experiments/test_service.py b/tests/unit/experiments/test_service.py new file mode 100644 index 0000000000..8fff8c0678 --- /dev/null +++ b/tests/unit/experiments/test_service.py @@ -0,0 +1,174 @@ +"""Unit coverage for :class:`ExperimentService`. + +Pins the deterministic-assignment contract: identical subjects always +land on the same variant, weight skews the population correctly, and +once-recorded assignments survive variant edits. 
+""" + +from datetime import UTC, datetime + +import pytest + +from synthorg.core.types import NotBlankStr +from synthorg.experiments.in_memory_repository import InMemoryExperimentRepository +from synthorg.experiments.service import ExperimentService +from tests._shared.fake_clock import FakeClock + +pytestmark = pytest.mark.unit + + +def _service() -> tuple[ExperimentService, InMemoryExperimentRepository, FakeClock]: + repo = InMemoryExperimentRepository() + clock = FakeClock(start=datetime(2026, 5, 15, tzinfo=UTC)) + svc = ExperimentService(repository=repo, clock=clock) + return svc, repo, clock + + +async def test_assign_deterministic_same_subject_same_variant() -> None: + svc, _, _ = _service() + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("control"), + weight=1, + ) + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("treatment"), + weight=1, + ) + a = await svc.assign(experiment=NotBlankStr("exp"), subject_id=NotBlankStr("u-1")) + b = await svc.assign(experiment=NotBlankStr("exp"), subject_id=NotBlankStr("u-1")) + assert a.variant == b.variant + + +async def test_assign_respects_weight_distribution() -> None: + svc, _, _ = _service() + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("control"), + weight=1, + ) + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("treatment"), + weight=9, + ) + counts = {"control": 0, "treatment": 0} + for i in range(1000): + result = await svc.assign( + experiment=NotBlankStr("exp"), + subject_id=NotBlankStr(f"u-{i}"), + ) + counts[str(result.variant)] += 1 + # 90/10 split tolerated within sane bounds. 
+ assert counts["treatment"] > 7 * counts["control"] + + +async def test_assign_replays_recorded_assignment_after_variant_change() -> None: + svc, repo, _ = _service() + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("control"), + weight=1, + ) + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("treatment"), + weight=1, + ) + first = await svc.assign( + experiment=NotBlankStr("exp"), + subject_id=NotBlankStr("u-stable"), + ) + # Add a new variant; recorded assignment must still be returned. + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("variant-c"), + weight=10, + ) + second = await svc.assign( + experiment=NotBlankStr("exp"), + subject_id=NotBlankStr("u-stable"), + ) + assert second.variant == first.variant + stored = await repo.get_assignment( + experiment=NotBlankStr("exp"), + subject_id=NotBlankStr("u-stable"), + ) + assert stored is not None + assert stored.variant == first.variant + + +async def test_assign_raises_when_no_variants_registered() -> None: + svc, _, _ = _service() + from synthorg.core.domain_errors import NotFoundError + + with pytest.raises(NotFoundError, match="no registered variants"): + await svc.assign( + experiment=NotBlankStr("nope"), + subject_id=NotBlankStr("u-1"), + ) + + +async def test_register_variant_idempotent_replace() -> None: + svc, repo, _ = _service() + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("v1"), + weight=1, + ) + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("v1"), + weight=5, + ) + variants = await repo.list_for_experiment(NotBlankStr("exp")) + assert len(variants) == 1 + assert variants[0].weight == 5 + + +async def test_delete_variant_returns_false_when_absent() -> None: + svc, _, _ = _service() + removed = await svc.delete_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("ghost"), + ) + assert removed is False + + 
+async def test_list_assignments_paginates_in_recency_order() -> None: + svc, _, clock = _service() + await svc.register_variant( + experiment=NotBlankStr("exp"), + variant=NotBlankStr("v"), + weight=1, + ) + for i in range(5): + await svc.assign( + experiment=NotBlankStr("exp"), + subject_id=NotBlankStr(f"u-{i}"), + ) + # Advance the FakeClock so each assignment carries a distinct + # timestamp; without this the recency sort would degenerate to + # stable insertion order and the assertion below would pass for + # an ascending-sort regression too. + clock.advance(1) + page, total = await svc.list_assignments( + NotBlankStr("exp"), + limit=3, + offset=0, + ) + assert total == 5 + assert len(page) == 3 + # Assignments were created in order u-0 ... u-4 with strictly + # increasing timestamps; the page contract is "most recent first," + # so the first three subject ids on page must be the last three + # created in reverse order. Without this assert an ordering + # regression (e.g. a repo switching from descending to ascending + # sort) would still pass the total / length checks. + assert [str(item.subject_id) for item in page] == ["u-4", "u-3", "u-2"] + # The ``assigned_at`` timestamps must also be strictly decreasing + # across the page; this is the underlying contract that any future + # durable repo must honour even when subject ids are unordered. + timestamps = [item.assigned_at for item in page] + assert timestamps == sorted(timestamps, reverse=True) diff --git a/tests/unit/integrations/mcp_catalog/test_in_memory_installations_lifecycle.py b/tests/unit/integrations/mcp_catalog/test_in_memory_installations_lifecycle.py new file mode 100644 index 0000000000..75875c198f --- /dev/null +++ b/tests/unit/integrations/mcp_catalog/test_in_memory_installations_lifecycle.py @@ -0,0 +1,41 @@ +"""Lifecycle / cleanup coverage for the in-memory MCP installations repo. 
+ +Pins the new ``clear()`` and ``size()`` surface so future regressions +cannot reintroduce the empty-cleanup stub. +""" + +from datetime import UTC, datetime + +import pytest + +from synthorg.core.types import NotBlankStr +from synthorg.integrations.mcp_catalog.in_memory_installations import ( + InMemoryMcpInstallationRepository, +) +from synthorg.integrations.mcp_catalog.installations import McpInstallation + +pytestmark = pytest.mark.unit + + +def _installation(catalog_entry_id: str = "ce-1") -> McpInstallation: + return McpInstallation( + catalog_entry_id=NotBlankStr(catalog_entry_id), + connection_name=NotBlankStr("default"), + installed_at=datetime(2026, 5, 15, tzinfo=UTC), + ) + + +async def test_clear_returns_count_and_empties_store() -> None: + repo = InMemoryMcpInstallationRepository() + await repo.save(_installation("ce-1")) + await repo.save(_installation("ce-2")) + assert await repo.size() == 2 + removed = await repo.clear() + assert removed == 2 + assert await repo.size() == 0 + + +async def test_clear_returns_zero_when_empty() -> None: + repo = InMemoryMcpInstallationRepository() + removed = await repo.clear() + assert removed == 0 diff --git a/tests/unit/observability/test_events.py b/tests/unit/observability/test_events.py index cc6358f5d4..6767d309e7 100644 --- a/tests/unit/observability/test_events.py +++ b/tests/unit/observability/test_events.py @@ -329,6 +329,8 @@ def test_all_domain_modules_discovered(self) -> None: "registry", # Resilience handler validation and event logging. "resilience", + # A/B experiment variant registry + deterministic assignment. 
+ "experiments", } discovered = {info.name for info in pkgutil.iter_modules(events.__path__)} assert discovered == expected diff --git a/tests/unit/persistence/test_protocol.py b/tests/unit/persistence/test_protocol.py index b7441ee858..f3b569a478 100644 --- a/tests/unit/persistence/test_protocol.py +++ b/tests/unit/persistence/test_protocol.py @@ -2,7 +2,7 @@ from collections.abc import AsyncIterator, Mapping, Sequence from contextlib import asynccontextmanager -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import pytest @@ -699,6 +699,10 @@ async def list_items( class _FakeBackend: + @property + def kind(self) -> Literal["sqlite", "postgres"]: + return "sqlite" + async def connect(self) -> None: pass diff --git a/tests/unit/providers/conftest.py b/tests/unit/providers/conftest.py index 0f2ff74f47..8bdd489311 100644 --- a/tests/unit/providers/conftest.py +++ b/tests/unit/providers/conftest.py @@ -74,7 +74,7 @@ class CompletionConfigFactory(ModelFactory[CompletionConfig]): temperature = 0.7 max_tokens = 1024 stop_sequences = () - top_p = None + top_p = 1.0 timeout = None diff --git a/tests/unit/providers/test_models.py b/tests/unit/providers/test_models.py index 710fd80517..2744f915e7 100644 --- a/tests/unit/providers/test_models.py +++ b/tests/unit/providers/test_models.py @@ -323,7 +323,7 @@ def test_all_defaults(self) -> None: assert cfg.temperature is None assert cfg.max_tokens is None assert cfg.stop_sequences == () - assert cfg.top_p is None + assert cfg.top_p == 1.0 assert cfg.timeout is None def test_temperature_range(self) -> None: diff --git a/tests/unit/scripts/test_check_no_migration_framing.py b/tests/unit/scripts/test_check_no_migration_framing.py index 4add1bbcdd..091a285764 100644 --- a/tests/unit/scripts/test_check_no_migration_framing.py +++ b/tests/unit/scripts/test_check_no_migration_framing.py @@ -453,6 +453,50 @@ class TestPatternParametrized: ("we used to", '"""we used to dispatch from this 
module."""\n'), ("Phase N", "# Phase 4: typed-args refactor\n"), ("Round-N", '"""Round-7 review surfaced this."""\n'), + ( + "previously lived", + '"""Loop that previously lived in every mutation method."""\n', + ), + ( + "previously inlined", + '"""Helpers that were previously inlined in app.py."""\n', + ), + ( + "previously duplicated", + '"""Validation that was previously duplicated across files."""\n', + ), + ( + "previously scattered", + '"""Used to be scattered across 15 handlers."""\n', + ), + ( + "were previously owned", + '"""Helpers that were previously owned by the bus module."""\n', + ), + ( + "were previously emitted", + '"""Events that were previously emitted by the legacy hook."""\n', + ), + ( + "were previously wrapped", + '"""Calls that were previously wrapped in retry shims."""\n', + ), + ( + "used to be", + '"""These checks used to be tolerated in per-agent fan-out."""\n', + ), + ( + "originally generated", + '"""proposed_at: When the proposal was originally generated."""\n', + ), + ( + "originally promised", + '"""length matches the bytes the inner app originally promised."""\n', + ), + ( + "originally-claimed", + '"""Passes the originally-claimed record to unregister."""\n', + ), ], ) def test_pattern_fires(self, src_dir: Path, label: str, fixture: str) -> None: @@ -460,6 +504,42 @@ def test_pattern_fires(self, src_dir: Path, label: str, fixture: str) -> None: issues = _scan(src_dir, "src/synthorg/x.py", fixture) assert issues, (label, issues) + @pytest.mark.parametrize( + ("label", "fixture"), + [ + ( + "runtime previously stored", + '"""Rotating the key orphans all previously stored ciphertext."""\n', + ), + ( + "runtime previously applied", + '"""Reject a row whose content changed since previously applied."""\n', + ), + ( + "runtime previously compacted", + '"""Reject a conversation that was previously compacted upstream."""\n', + ), + ( + "runtime previously completed", + '"""If the task was previously completed, skip the re-run."""\n', + 
), + ], + ) + def test_runtime_state_descriptions_not_flagged( + self, + src_dir: Path, + label: str, + fixture: str, + ) -> None: + """Runtime-state descriptions are not migration framing. + + Bare ``previously`` plus a runtime verb (``stored ciphertext``, + ``compacted conversation``) describes program state, not a + code-move. The targeted verb lists must NOT catch these. + """ + issues = _scan(src_dir, "src/synthorg/x.py", fixture) + assert not issues, (label, issues) + class TestMarkerHelpers: """Direct unit tests for marker-detection helpers.""" diff --git a/tests/unit/workers/test_claim_lifecycle.py b/tests/unit/workers/test_claim_lifecycle.py index 425c2ababb..158a1bcf54 100644 --- a/tests/unit/workers/test_claim_lifecycle.py +++ b/tests/unit/workers/test_claim_lifecycle.py @@ -21,8 +21,17 @@ import pytest import synthorg.workers.claim as claim_module +from synthorg.communication.bus.errors import BusStreamError from synthorg.communication.config import NatsConfig -from synthorg.workers.claim import _MAX_CLAIM_PAYLOAD_BYTES, JetStreamTaskQueue +from synthorg.observability.events.workers import ( + WORKERS_QUEUE_NOT_RUNNING, + WORKERS_QUEUE_START_REJECTED, +) +from synthorg.workers.claim import ( + _MAX_CLAIM_PAYLOAD_BYTES, + JetStreamTaskQueue, + TaskClaim, +) from synthorg.workers.config import QueueConfig @@ -302,3 +311,67 @@ async def test_next_claim_oversize_payload_re_raises_memory_error() -> None: with pytest.raises(MemoryError): await queue.next_claim(timeout=1.0) + + +@pytest.mark.unit +async def test_start_when_running_logs_rejection( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A second ``start()`` while already running logs the rejection event.""" + spy = _patch_logger(monkeypatch) + queue = _make_queue() + queue._running = True + + with pytest.raises(RuntimeError, match="already running"): + await queue.start() + + assert spy.warning.called + matched = [ + c + for c in spy.warning.call_args_list + if c.args and c.args[0] == 
WORKERS_QUEUE_START_REJECTED + ] + assert len(matched) == 1 + assert matched[0].kwargs["reason"] == "already_running" + + +@pytest.mark.unit +async def test_publish_claim_before_start_logs_not_running( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """``publish_claim`` before ``start()`` logs the not-running event.""" + spy = _patch_logger(monkeypatch) + queue = _make_queue() + claim = TaskClaim(task_id="task-A", new_status="assigned") + + with pytest.raises(BusStreamError, match="not running"): + await queue.publish_claim(claim) + + matched = [ + c + for c in spy.warning.call_args_list + if c.args and c.args[0] == WORKERS_QUEUE_NOT_RUNNING + ] + assert len(matched) == 1 + assert matched[0].kwargs["operation"] == "publish_claim" + assert matched[0].kwargs["task_id"] == "task-A" + + +@pytest.mark.unit +async def test_next_claim_before_start_logs_not_running( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """``next_claim`` before ``start()`` logs the not-running event.""" + spy = _patch_logger(monkeypatch) + queue = _make_queue() + + with pytest.raises(BusStreamError, match="not running"): + await queue.next_claim(timeout=1.0) + + matched = [ + c + for c in spy.warning.call_args_list + if c.args and c.args[0] == WORKERS_QUEUE_NOT_RUNNING + ] + assert len(matched) == 1 + assert matched[0].kwargs["operation"] == "next_claim" diff --git a/tests/unit/workers/test_executor.py b/tests/unit/workers/test_executor.py new file mode 100644 index 0000000000..ba5c7eeeb8 --- /dev/null +++ b/tests/unit/workers/test_executor.py @@ -0,0 +1,196 @@ +"""Unit coverage for :class:`TaskExecutionExecutor`. + +The executor turns each :class:`TaskClaim` into a typed HTTP POST +against the backend's ``/api/v1/tasks/{id}/execute`` endpoint. These +tests pin the outcome-mapping contract documented in the module +docstring: every HTTP shape lands on exactly one +:class:`TaskClaimStatus` so worker ack/nack behaviour is predictable +from logs alone. 
+""" + +from typing import Any + +import httpx +import pytest + +from synthorg.workers.claim import TaskClaim, TaskClaimStatus +from synthorg.workers.executor import TaskExecutionExecutor + +pytestmark = pytest.mark.unit + + +def _claim() -> TaskClaim: + return TaskClaim(task_id="task-A", new_status="assigned") + + +def _client(handler: httpx.MockTransport) -> httpx.AsyncClient: + return httpx.AsyncClient(transport=handler) + + +async def test_terminal_completed_returns_success() -> None: + def handler(_request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, + json={"data": {"id": "task-A", "status": "completed"}}, + ) + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.SUCCESS + + +async def test_terminal_failed_returns_success() -> None: + """A 2xx response with terminal ``failed`` status still acks the claim. + + The worker's claim outcome reflects whether the execution attempt + completed end-to-end, not whether the task itself succeeded. A task + that terminated in ``failed`` status is recorded on the task; the + JetStream claim should be acked so the dispatcher does not redeliver. 
+ """ + + def handler(_request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, + json={"data": {"id": "task-A", "status": "failed"}}, + ) + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.SUCCESS + + +async def test_non_terminal_200_returns_retry() -> None: + def handler(_request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, + json={"data": {"id": "task-A", "status": "in_progress"}}, + ) + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.RETRY + + +async def test_404_returns_failed() -> None: + def handler(_request: httpx.Request) -> httpx.Response: + return httpx.Response(404, json={}) + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.FAILED + + +async def test_409_returns_failed() -> None: + def handler(_request: httpx.Request) -> httpx.Response: + return httpx.Response(409, json={}) + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.FAILED + + +async def test_500_returns_retry() -> None: + def handler(_request: httpx.Request) -> httpx.Response: + return httpx.Response(500, json={}) + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + 
outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.RETRY + + +async def test_transport_error_returns_retry() -> None: + def handler(_request: httpx.Request) -> httpx.Response: + raise httpx.ConnectError("") # noqa: EM101 -- minimal test stub + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.RETRY + + +async def test_timeout_returns_retry() -> None: + def handler(_request: httpx.Request) -> httpx.Response: + raise httpx.ReadTimeout("") # noqa: EM101 -- minimal test stub + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="t", + http_client=http, + ) + outcome = await executor(_claim()) + assert outcome is TaskClaimStatus.RETRY + + +async def test_bearer_token_header_sent() -> None: + captured: dict[str, Any] = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["auth"] = request.headers.get("authorization") + return httpx.Response( + 200, + json={"data": {"id": "task-A", "status": "completed"}}, + ) + + async with _client(httpx.MockTransport(handler)) as http: + executor = TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="sekret", + http_client=http, + ) + await executor(_claim()) + assert captured["auth"] == "Bearer sekret" + + +async def test_rejects_empty_api_base_url() -> None: + async with httpx.AsyncClient() as http: + with pytest.raises(ValueError, match="api_base_url"): + TaskExecutionExecutor( + api_base_url="", + auth_token="t", + http_client=http, + ) + + +async def test_rejects_empty_token() -> None: + async with httpx.AsyncClient() as http: + with pytest.raises(ValueError, match="auth_token"): + TaskExecutionExecutor( + api_base_url="http://backend", + auth_token="", + http_client=http, + ) diff --git 
a/web/src/__tests__/stores/artifacts.test.ts b/web/src/__tests__/stores/artifacts.test.ts index f3f7c6d997..e474c51f70 100644 --- a/web/src/__tests__/stores/artifacts.test.ts +++ b/web/src/__tests__/stores/artifacts.test.ts @@ -311,9 +311,14 @@ describe('useArtifactsStore', () => { } useArtifactsStore.getState().updateFromWsEvent(event) - // The store debounces/schedules the refetch; allow microtasks to flush. - await new Promise((resolve) => setTimeout(resolve, 0)) - expect(fetchCount).toBeGreaterThan(0) + // The store schedules the refetch asynchronously. ``vi.waitFor`` + // re-runs the callback on a short interval until it stops throwing + // (or its timeout elapses), so the assertion does not race a fixed + // ``setTimeout(resolve, 0)`` against the scheduler and resolves as + // soon as the expectation passes. + await vi.waitFor(() => { + expect(fetchCount).toBeGreaterThan(0) + }) }) }) diff --git a/web/src/api/types/dtos.gen.ts b/web/src/api/types/dtos.gen.ts index fbeee79c4b..2ecb9e0ec2 100644 --- a/web/src/api/types/dtos.gen.ts +++ b/web/src/api/types/dtos.gen.ts @@ -57,6 +57,8 @@ export type DiscoveryPolicyResponseEnvelope = components['schemas']['ApiResponse export type EntityResponseEnvelope = components['schemas']['ApiResponse_EntityResponse_'] export type EntityVersionResponseEnvelope = components['schemas']['ApiResponse_EntityVersionResponse_'] export type EscalationResponseEnvelope = components['schemas']['ApiResponse_EscalationResponse_'] +export type ExperimentAssignmentEnvelope = components['schemas']['ApiResponse_ExperimentAssignment_'] +export type ExperimentVariantEnvelope = components['schemas']['ApiResponse_ExperimentVariant_'] export type FineTuneStatusEnvelope = components['schemas']['ApiResponse_FineTuneStatus_'] export type ForecastResponseEnvelope = components['schemas']['ApiResponse_ForecastResponse_'] export type HealthReportEnvelope = components['schemas']['ApiResponse_HealthReport_'] @@ -128,6 +130,7 @@ export type ApprovalChain
= components['schemas']['ApprovalChain'] export type ApprovalResponse = components['schemas']['ApprovalResponse'] export type ApproveRequest = components['schemas']['ApproveRequest'] export type Artifact = components['schemas']['Artifact'] +export type AssignExperimentRequest = components['schemas']['AssignExperimentRequest'] export type AuditEntry = components['schemas']['AuditEntry'] export type Authority = components['schemas']['Authority'] export type AuthorityDeferenceConfig = components['schemas']['AuthorityDeferenceConfig'] @@ -223,8 +226,11 @@ export type EvaluationConfig = components['schemas']['EvaluationConfig'] export type EventBatch = components['schemas']['EventBatch'] export type EvidencePackage = components['schemas']['EvidencePackage'] export type EvidencePackageSignature = components['schemas']['EvidencePackageSignature'] +export type ExecuteTaskRequest = components['schemas']['ExecuteTaskRequest'] export type ExpectedArtifact = components['schemas']['ExpectedArtifact'] export type ExperienceConfig = components['schemas']['ExperienceConfig'] +export type ExperimentAssignment = components['schemas']['ExperimentAssignment'] +export type ExperimentVariant = components['schemas']['ExperimentVariant'] export type FilePart = components['schemas']['FilePart'] export type FineTuneRequest = components['schemas']['FineTuneRequest'] export type FineTuneRun = components['schemas']['FineTuneRun'] @@ -284,6 +290,7 @@ export type DriftReportResponsePage = components['schemas']['PaginatedResponse_D export type EntityResponsePage = components['schemas']['PaginatedResponse_EntityResponse_'] export type EntityVersionResponsePage = components['schemas']['PaginatedResponse_EntityVersionResponse_'] export type EscalationResponsePage = components['schemas']['PaginatedResponse_EscalationResponse_'] +export type ExperimentAssignmentPage = components['schemas']['PaginatedResponse_ExperimentAssignment_'] export type FineTuneRunPage = 
components['schemas']['PaginatedResponse_FineTuneRun_'] export type HealthReportPage = components['schemas']['PaginatedResponse_HealthReport_'] export type InstalledEntryPage = components['schemas']['PaginatedResponse_InstalledEntry_'] @@ -350,6 +357,7 @@ export type RateLimitsUpdateRequest = components['schemas']['RateLimitsUpdateReq export type ReadinessStatus = components['schemas']['ReadinessStatus'] export type RecommendedAction = components['schemas']['RecommendedAction'] export type RedundancyRate = components['schemas']['RedundancyRate'] +export type RegisterExperimentVariantRequest = components['schemas']['RegisterExperimentVariantRequest'] export type RejectDecision = components['schemas']['RejectDecision'] export type RejectRequest = components['schemas']['RejectRequest'] export type RemoveAllowlistEntryRequest = components['schemas']['RemoveAllowlistEntryRequest'] diff --git a/web/src/api/types/openapi.gen.ts b/web/src/api/types/openapi.gen.ts index 4a10ff997b..9a817f4d24 100644 --- a/web/src/api/types/openapi.gen.ts +++ b/web/src/api/types/openapi.gen.ts @@ -1605,6 +1605,58 @@ export type paths = { readonly patch?: never; readonly trace?: never; }; + readonly "/api/v1/experiments/{experiment}/assign": { + readonly parameters: { + readonly query?: never; + readonly header?: never; + readonly path?: never; + readonly cookie?: never; + }; + readonly get?: never; + readonly put?: never; + /** Assign */ + readonly post: operations["ApiV1ExperimentsAssignAssign"]; + readonly delete?: never; + readonly options?: never; + readonly head?: never; + readonly patch?: never; + readonly trace?: never; + }; + readonly "/api/v1/experiments/{experiment}/assignments": { + readonly parameters: { + readonly query?: never; + readonly header?: never; + readonly path?: never; + readonly cookie?: never; + }; + /** ListAssignments */ + readonly get: operations["ApiV1ExperimentsAssignmentsListAssignments"]; + readonly put?: never; + readonly post?: never; + readonly delete?: 
never; + readonly options?: never; + readonly head?: never; + readonly patch?: never; + readonly trace?: never; + }; + readonly "/api/v1/experiments/{experiment}/variants": { + readonly parameters: { + readonly query?: never; + readonly header?: never; + readonly path?: never; + readonly cookie?: never; + }; + /** ListVariants */ + readonly get: operations["ApiV1ExperimentsVariantsListVariants"]; + readonly put?: never; + /** RegisterVariant */ + readonly post: operations["ApiV1ExperimentsVariantsRegisterVariant"]; + readonly delete?: never; + readonly options?: never; + readonly head?: never; + readonly patch?: never; + readonly trace?: never; + }; readonly "/api/v1/healthz": { readonly parameters: { readonly query?: never; @@ -3712,6 +3764,23 @@ export type paths = { readonly patch?: never; readonly trace?: never; }; + readonly "/api/v1/tasks/{task_id}/execute": { + readonly parameters: { + readonly query?: never; + readonly header?: never; + readonly path?: never; + readonly cookie?: never; + }; + readonly get?: never; + readonly put?: never; + /** ExecuteTask */ + readonly post: operations["ApiV1TasksTaskIdExecuteExecuteTask"]; + readonly delete?: never; + readonly options?: never; + readonly head?: never; + readonly patch?: never; + readonly trace?: never; + }; readonly "/api/v1/tasks/{task_id}/transition": { readonly parameters: { readonly query?: never; @@ -4816,6 +4885,22 @@ export type components = { /** @description Whether the request succeeded (derived from ``error``). */ readonly success: boolean; }; + /** ApiResponse[ExperimentAssignment] */ + readonly ApiResponse_ExperimentAssignment_: { + readonly data: components["schemas"]["ExperimentAssignment"] | null; + readonly error: string | null; + readonly error_detail: components["schemas"]["ErrorDetail"] | null; + /** @description Whether the request succeeded (derived from ``error``). 
*/ + readonly success: boolean; + }; + /** ApiResponse[ExperimentVariant] */ + readonly ApiResponse_ExperimentVariant_: { + readonly data: components["schemas"]["ExperimentVariant"] | null; + readonly error: string | null; + readonly error_detail: components["schemas"]["ErrorDetail"] | null; + /** @description Whether the request succeeded (derived from ``error``). */ + readonly success: boolean; + }; /** ApiResponse[FineTuneStatus] */ readonly ApiResponse_FineTuneStatus_: { readonly data: components["schemas"]["FineTuneStatus"] | null; @@ -5216,6 +5301,14 @@ export type components = { /** @description Whether the request succeeded (derived from ``error``). */ readonly success: boolean; }; + /** ApiResponse[tuple[ExperimentVariant, ...]] */ + readonly "ApiResponse_tuple_ExperimentVariant_..._": { + readonly data: readonly components["schemas"]["ExperimentVariant"][] | null; + readonly error: string | null; + readonly error_detail: components["schemas"]["ErrorDetail"] | null; + /** @description Whether the request succeeded (derived from ``error``). */ + readonly success: boolean; + }; /** ApiResponse[tuple[InterruptResponse, ...]] */ readonly "ApiResponse_tuple_InterruptResponse_..._": { readonly data: readonly components["schemas"]["InterruptResponse"][] | null; @@ -5550,6 +5643,11 @@ export type components = { * @enum {string} */ readonly ArtifactType: "code" | "tests" | "documentation"; + /** AssignExperimentRequest */ + readonly AssignExperimentRequest: { + /** @description Subject identifier (agent id, user id, project id, ...) 
*/ + readonly subject_id: string; + }; /** AuditEntry */ readonly AuditEntry: { readonly action_type: string; @@ -7434,6 +7532,15 @@ export type components = { */ readonly signed_at: string; }; + /** ExecuteTaskRequest */ + readonly ExecuteTaskRequest: { + /** @description Per-dispatch idempotency key; backend dedups duplicate executions */ + readonly idempotency_key: string; + /** @description Task status that triggered the dispatch (typically 'assigned' or 'ready') */ + readonly new_status: string; + /** @description Task status before the triggering transition */ + readonly previous_status?: string | null; + }; /** ExpectedArtifact */ readonly ExpectedArtifact: { /** @description File or directory path for the artifact */ @@ -7469,6 +7576,39 @@ export type components = { /** @default 0.2 */ readonly weight: number; }; + /** ExperimentAssignment */ + readonly ExperimentAssignment: { + /** + * Format: date-time + * @description datetime with the constraint that the value must have timezone info + */ + readonly assigned_at: string; + /** @description Experiment key */ + readonly experiment: string; + /** @description Subject identifier */ + readonly subject_id: string; + /** @description Variant the subject was assigned to */ + readonly variant: string; + }; + /** ExperimentVariant */ + readonly ExperimentVariant: { + /** + * Format: date-time + * @description datetime with the constraint that the value must have timezone info + */ + readonly created_at: string; + /** + * @description Operator notes + * @default + */ + readonly description: string; + /** @description Experiment key (kebab-case) */ + readonly experiment: string; + /** @description Variant name within the experiment */ + readonly variant: string; + /** @description Relative selection weight */ + readonly weight: number; + }; /** FilePart */ readonly FilePart: { /** @description Optional MIME type */ @@ -8659,6 +8799,21 @@ export type components = { /** @description Whether the request succeeded 
(derived from ``error``). */ readonly success: boolean; }; + /** PaginatedResponse[ExperimentAssignment] */ + readonly PaginatedResponse_ExperimentAssignment_: { + /** @default [] */ + readonly data: readonly components["schemas"]["ExperimentAssignment"][]; + /** + * @description Data sources that failed gracefully (partial data) + * @default [] + */ + readonly degraded_sources: readonly string[]; + readonly error: string | null; + readonly error_detail: components["schemas"]["ErrorDetail"] | null; + readonly pagination: components["schemas"]["PaginationMeta"]; + /** @description Whether the request succeeded (derived from ``error``). */ + readonly success: boolean; + }; /** PaginatedResponse[FineTuneRun] */ readonly PaginatedResponse_FineTuneRun_: { /** @default [] */ @@ -9765,6 +9920,18 @@ export type components = { /** @description Mean redundancy */ readonly value: number; }; + /** RegisterExperimentVariantRequest */ + readonly RegisterExperimentVariantRequest: { + /** + * @description Operator notes + * @default + */ + readonly description: string; + /** @description Variant name within the experiment */ + readonly variant: string; + /** @description Relative selection weight */ + readonly weight: number; + }; /** RejectDecision */ readonly RejectDecision: { readonly reasoning: string; @@ -15844,6 +16011,139 @@ export interface operations { readonly 503: components["responses"]["ServiceUnavailable"]; }; }; + readonly ApiV1ExperimentsAssignAssign: { + readonly parameters: { + readonly query?: never; + readonly header?: never; + readonly path: { + /** @description Resource identifier */ + readonly experiment: string; + }; + readonly cookie?: never; + }; + readonly requestBody: { + readonly content: { + readonly "application/json": components["schemas"]["AssignExperimentRequest"]; + }; + }; + readonly responses: { + /** @description Document created, URL follows */ + readonly 201: { + headers: { + readonly [name: string]: unknown; + }; + content: { + readonly 
"application/json": components["schemas"]["ApiResponse_ExperimentAssignment_"]; + }; + }; + readonly 400: components["responses"]["BadRequest"]; + readonly 401: components["responses"]["Unauthorized"]; + readonly 403: components["responses"]["Forbidden"]; + readonly 404: components["responses"]["NotFound"]; + readonly 409: components["responses"]["Conflict"]; + readonly 429: components["responses"]["TooManyRequests"]; + readonly 500: components["responses"]["InternalError"]; + readonly 503: components["responses"]["ServiceUnavailable"]; + }; + }; + readonly ApiV1ExperimentsAssignmentsListAssignments: { + readonly parameters: { + readonly query?: { + /** @description Opaque pagination cursor returned by the previous page */ + readonly cursor?: string | null; + /** @description Page size (default 50, max 200) */ + readonly limit?: number; + }; + readonly header?: never; + readonly path: { + /** @description Resource identifier */ + readonly experiment: string; + }; + readonly cookie?: never; + }; + readonly requestBody?: never; + readonly responses: { + /** @description Request fulfilled, document follows */ + readonly 200: { + headers: { + readonly [name: string]: unknown; + }; + content: { + readonly "application/json": components["schemas"]["PaginatedResponse_ExperimentAssignment_"]; + }; + }; + readonly 400: components["responses"]["BadRequest"]; + readonly 401: components["responses"]["Unauthorized"]; + readonly 404: components["responses"]["NotFound"]; + readonly 429: components["responses"]["TooManyRequests"]; + readonly 500: components["responses"]["InternalError"]; + readonly 503: components["responses"]["ServiceUnavailable"]; + }; + }; + readonly ApiV1ExperimentsVariantsListVariants: { + readonly parameters: { + readonly query?: never; + readonly header?: never; + readonly path: { + /** @description Resource identifier */ + readonly experiment: string; + }; + readonly cookie?: never; + }; + readonly requestBody?: never; + readonly responses: { + /** 
@description Request fulfilled, document follows */ + readonly 200: { + headers: { + readonly [name: string]: unknown; + }; + content: { + readonly "application/json": components["schemas"]["ApiResponse_tuple_ExperimentVariant_..._"]; + }; + }; + readonly 400: components["responses"]["BadRequest"]; + readonly 401: components["responses"]["Unauthorized"]; + readonly 404: components["responses"]["NotFound"]; + readonly 429: components["responses"]["TooManyRequests"]; + readonly 500: components["responses"]["InternalError"]; + readonly 503: components["responses"]["ServiceUnavailable"]; + }; + }; + readonly ApiV1ExperimentsVariantsRegisterVariant: { + readonly parameters: { + readonly query?: never; + readonly header?: never; + readonly path: { + /** @description Resource identifier */ + readonly experiment: string; + }; + readonly cookie?: never; + }; + readonly requestBody: { + readonly content: { + readonly "application/json": components["schemas"]["RegisterExperimentVariantRequest"]; + }; + }; + readonly responses: { + /** @description Document created, URL follows */ + readonly 201: { + headers: { + readonly [name: string]: unknown; + }; + content: { + readonly "application/json": components["schemas"]["ApiResponse_ExperimentVariant_"]; + }; + }; + readonly 400: components["responses"]["BadRequest"]; + readonly 401: components["responses"]["Unauthorized"]; + readonly 403: components["responses"]["Forbidden"]; + readonly 404: components["responses"]["NotFound"]; + readonly 409: components["responses"]["Conflict"]; + readonly 429: components["responses"]["TooManyRequests"]; + readonly 500: components["responses"]["InternalError"]; + readonly 503: components["responses"]["ServiceUnavailable"]; + }; + }; readonly ApiV1HealthzLiveness: { readonly parameters: { readonly query?: never; @@ -20342,6 +20642,41 @@ export interface operations { readonly 503: components["responses"]["ServiceUnavailable"]; }; }; + readonly ApiV1TasksTaskIdExecuteExecuteTask: { + readonly 
parameters: { + readonly query?: never; + readonly header?: never; + readonly path: { + /** @description Resource identifier */ + readonly task_id: string; + }; + readonly cookie?: never; + }; + readonly requestBody: { + readonly content: { + readonly "application/json": components["schemas"]["ExecuteTaskRequest"]; + }; + }; + readonly responses: { + /** @description Request fulfilled, document follows */ + readonly 200: { + headers: { + readonly [name: string]: unknown; + }; + content: { + readonly "application/json": components["schemas"]["ApiResponse_Task_"]; + }; + }; + readonly 400: components["responses"]["BadRequest"]; + readonly 401: components["responses"]["Unauthorized"]; + readonly 403: components["responses"]["Forbidden"]; + readonly 404: components["responses"]["NotFound"]; + readonly 409: components["responses"]["Conflict"]; + readonly 429: components["responses"]["TooManyRequests"]; + readonly 500: components["responses"]["InternalError"]; + readonly 503: components["responses"]["ServiceUnavailable"]; + }; + }; readonly ApiV1TasksTaskIdTransitionTransitionTask: { readonly parameters: { readonly query?: never;