diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b5dcd970..1ec007b56d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- **Server startup no longer marks actively-running workflows as failed.** The `failOrphanedRuns()` call has been removed from `packages/server/src/index.ts` to match the CLI precedent (`packages/cli/src/cli.ts:256-258`). Per the new CLAUDE.md principle "No Autonomous Lifecycle Mutation Across Process Boundaries", a stuck `running` row is now transitioned explicitly by the user: via the per-row Cancel/Abandon buttons on the dashboard workflow card, or `archon workflow abandon <run-id>` from the CLI. (`archon workflow cleanup` is a separate command that deletes OLD terminal runs for disk hygiene — it does not handle stuck `running` rows.) Closes #1216. + ### Changed +- **Dashboard nav tab** now shows a numeric count of running workflows instead of a binary pulse dot. Reads from the existing `/api/dashboard/runs` `counts.running` field; same 10s polling interval. +- **Workflow run destructive actions** (Abandon, Cancel, Delete, Reject) now use a proper confirmation dialog matching the codebase-delete UX, replacing the browser's native `window.confirm()` popups. Each dialog includes context-appropriate copy describing what the action does to the run record. + - **Claude Code binary resolution** (breaking for compiled binary users): Archon no longer embeds the Claude Code SDK into compiled binaries. In compiled builds, you must install Claude Code separately (`curl -fsSL https://claude.ai/install.sh | bash` on macOS/Linux, `irm https://claude.ai/install.ps1 | iex` on Windows, or `npm install -g @anthropic-ai/claude-code`) and point Archon at the executable via `CLAUDE_BIN_PATH` env var or `assistants.claude.claudeBinaryPath` in `.archon/config.yaml`. 
The Claude Agent SDK accepts either the native compiled binary (from the curl/PowerShell installer at `~/.local/bin/claude`) or a JS `cli.js` (from the npm install). Dev mode (`bun run`) is unaffected — the SDK resolves via `node_modules` as before. The Docker image ships Claude Code pre-installed with `CLAUDE_BIN_PATH` pre-set, so `docker run` still works out of the box. Resolves silent "Module not found /Users/runner/..." failures on macOS (#1210) and Windows (#1087). ### Added diff --git a/packages/docs-web/src/content/docs/guides/authoring-workflows.md b/packages/docs-web/src/content/docs/guides/authoring-workflows.md index 3651ccae37..c4fdfc7830 100644 --- a/packages/docs-web/src/content/docs/guides/authoring-workflows.md +++ b/packages/docs-web/src/content/docs/guides/authoring-workflows.md @@ -474,7 +474,7 @@ This means a single transient crash may trigger up to **3 SDK retries** before a ## DAG Resume on Failure -When a `nodes:` (DAG) workflow fails (including due to a server restart), the next invocation automatically resumes from where it left off — no `--resume` flag required. +When a `nodes:` (DAG) workflow fails, the next invocation automatically resumes from where it left off — no `--resume` flag required. **How it works:** @@ -483,7 +483,14 @@ When a `nodes:` (DAG) workflow fails (including due to a server restart), the ne 3. Completed nodes are skipped; only failed and not-yet-run nodes are executed. 4. You receive a platform message like: `Resuming workflow — skipping 3 already-completed node(s).` -**Server restart**: If a server restart leaves runs in `running` status, they are automatically marked as `failed` on the next startup (with `metadata.failure_reason = 'server_restart'`). The next invocation of the same workflow at the same path auto-resumes from completed nodes. 
+**Crashed servers / orphaned runs**: Archon does **not** auto-fail `running` rows on server startup — that would kill workflows actively executing in another process (CLI, adapter). If a server crash leaves a row stuck as `running`, it remains visible in the dashboard (the Dashboard nav tab shows a count of running workflows). Transition it to a terminal status explicitly: + +- **Web UI**: click the Abandon or Cancel button on the workflow card. Abandon marks the run `cancelled` and keeps completed-node history. Cancel also terminates any in-flight subprocess. +- **CLI**: `archon workflow abandon <run-id>` (equivalent to the dashboard Abandon button). Run IDs are listed by `archon workflow status`. + +Once the row reaches a terminal status, the next invocation of the same workflow at the same path auto-resumes from completed nodes via the mechanism above. + +> Not to be confused with `archon workflow cleanup [days]`, which **deletes** old terminal runs (`completed`/`failed`/`cancelled`) from the database for disk hygiene. It does not transition `running` rows. **Known limitation**: AI session context from prior nodes is not restored. If a downstream node relies on in-context knowledge from a prior run's session (rather than artifacts), it may need to re-read those artifacts explicitly. 
diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index d8b1a4c4c8..3d0d1bdcf5 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -77,7 +77,6 @@ import { loadConfig, logConfig, getPort, - createWorkflowStore, } from '@archon/core'; import type { IPlatformAdapter } from '@archon/core'; import { createLogger, logArchonPaths, validateAppDefaultsPaths } from '@archon/paths'; @@ -208,12 +207,17 @@ export async function startServer(opts: ServerOptions = {}): Promise { // Start cleanup scheduler startCleanupScheduler(); - // Mark workflow runs orphaned by previous process termination as failed - void createWorkflowStore() - .failOrphanedRuns() - .catch(err => { - getLog().error({ err }, 'workflow.fail_orphans_failed'); - }); + // Note: orphaned-run cleanup intentionally NOT called at server startup. + // Running it here killed parallel workflow runs from other processes + // (CLI, adapters) by flipping their `running` rows to `failed` mid-flight. + // Same lesson the CLI already learned — see packages/cli/src/cli.ts:256-258. + // Per CLAUDE.md "No Autonomous Lifecycle Mutation Across Process Boundaries": + // surface ambiguous state to users and provide a one-click action instead. + // Users transition a stuck `running` row via the per-row Cancel/Abandon + // buttons in the Web UI dashboard, or `archon workflow abandon <run-id>`. + // (`archon workflow cleanup` is a separate command that deletes OLD terminal + // rows for disk hygiene — it does not handle stuck `running` rows.) + // See #1216. 
// Log Archon paths configuration logArchonPaths(); diff --git a/packages/web/src/components/dashboard/ConfirmRunActionDialog.tsx b/packages/web/src/components/dashboard/ConfirmRunActionDialog.tsx new file mode 100644 index 0000000000..2292aef3ce --- /dev/null +++ b/packages/web/src/components/dashboard/ConfirmRunActionDialog.tsx @@ -0,0 +1,76 @@ +import type { ReactNode } from 'react'; +import { + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, + AlertDialogTrigger, +} from '@/components/ui/alert-dialog'; + +interface Props { + /** The element that opens the dialog when clicked (typically a button). */ + trigger: ReactNode; + /** Dialog title (e.g. "Abandon workflow?"). */ + title: string; + /** Body text — supports rich children (e.g. wrapping the workflow name in an inline emphasis element). */ + description: ReactNode; + /** Confirm-button label (e.g. "Abandon", "Delete"). */ + confirmLabel: string; + /** Invoked when the user confirms. The current callsites are all + * fire-and-forget wrappers around React Query mutations whose error + * handling lives at the page level (`runAction` in `DashboardPage.tsx`). + * Widen to `Promise<void>` only if a caller needs to await the action. */ + onConfirm: () => void; +} + +/** + * Confirmation dialog for destructive workflow-run actions. + * + * Wraps shadcn's AlertDialog with the trigger included as a slot, so callers + * pass their existing action button as the `trigger` prop. The Action button + * is destructive-styled by default (per `AlertDialogAction` in + * `@/components/ui/alert-dialog`), which is appropriate for every workflow + * lifecycle action this is used for (Abandon, Cancel, Delete, Reject). + * + * Replaces previous use of `window.confirm()` for these actions to match the + * codebase-delete UX in `sidebar/ProjectSelector.tsx`. 
+ */ +export function ConfirmRunActionDialog({ + trigger, + title, + description, + confirmLabel, + onConfirm, +}: Props): React.ReactElement { + return ( + + {trigger} + + + {title} + +
{description}
+
+
+ + Cancel + { + // Caller's onConfirm is fire-and-forget over a parent-level + // runAction helper that surfaces errors via component state. + // We do NOT catch here; swallowing would hide failures the + // parent is positioned to display. + onConfirm(); + }} + > + {confirmLabel} + + +
+
+ ); +} diff --git a/packages/web/src/components/dashboard/WorkflowHistoryTable.tsx b/packages/web/src/components/dashboard/WorkflowHistoryTable.tsx index 015becb328..eea3bbfe38 100644 --- a/packages/web/src/components/dashboard/WorkflowHistoryTable.tsx +++ b/packages/web/src/components/dashboard/WorkflowHistoryTable.tsx @@ -3,6 +3,7 @@ import { Globe, Terminal, Hash, Send, GitBranch, Trash2 } from 'lucide-react'; import type { DashboardRunResponse } from '@/lib/api'; import { cn } from '@/lib/utils'; import { formatDuration, formatStarted } from '@/lib/format'; +import { ConfirmRunActionDialog } from './ConfirmRunActionDialog'; interface WorkflowHistoryTableProps { runs: DashboardRunResponse[]; @@ -101,21 +102,27 @@ export function WorkflowHistoryTable({ View Logs {onDelete && ( - + } + title="Delete workflow run?" + description={ + <> + Permanently delete the run record for {run.workflow_name}{' '} + and its events. This cannot be undone. + + } + confirmLabel="Delete" + onConfirm={(): void => { + onDelete(run.id); }} - className="text-text-tertiary hover:text-error transition-colors" - title="Delete run" - > - - + /> )} diff --git a/packages/web/src/components/dashboard/WorkflowRunCard.tsx b/packages/web/src/components/dashboard/WorkflowRunCard.tsx index 926c31092e..6a5042de55 100644 --- a/packages/web/src/components/dashboard/WorkflowRunCard.tsx +++ b/packages/web/src/components/dashboard/WorkflowRunCard.tsx @@ -22,6 +22,7 @@ import { cn } from '@/lib/utils'; import { formatDuration } from '@/lib/format'; import { useWorkflowStore } from '@/stores/workflow-store'; import type { WorkflowState } from '@/lib/types'; +import { ConfirmRunActionDialog } from './ConfirmRunActionDialog'; interface WorkflowRunCardProps { run: DashboardRunResponse; @@ -318,17 +319,25 @@ export function WorkflowRunCard({ )} {run.status === 'paused' && onReject && ( - + } + title="Reject workflow?" + description={ + <> + Reject the paused workflow {run.workflow_name}. 
The run will be + marked as failed and any pending iterations will not continue. + + } + confirmLabel="Reject" + onConfirm={(): void => { + onReject(run.id); }} - className="flex items-center gap-1 rounded-md px-2 py-1 text-xs text-error/80 hover:bg-error/10 hover:text-error transition-colors" - > - - Reject - + /> )} {run.status === 'failed' && onResume && ( + } + title="Abandon workflow?" + description={ + <> + Mark {run.workflow_name} as cancelled. Already-completed nodes + remain in the database; the run will not continue. + + } + confirmLabel="Abandon" + onConfirm={(): void => { + onAbandon(run.id); }} - className="flex items-center gap-1 rounded-md px-2 py-1 text-xs text-warning/80 hover:bg-warning/10 hover:text-warning transition-colors" - > - - Abandon - + /> )} {(run.status === 'running' || run.status === 'pending') && ( - + } + title="Cancel workflow?" + description={ + <> + Cancel {run.workflow_name}. The run will be marked as cancelled + and any in-flight subprocess will be terminated. + + } + confirmLabel="Cancel workflow" + onConfirm={(): void => { + onCancel(run.id); }} - className="flex items-center gap-1 rounded-md px-2 py-1 text-xs text-error/80 hover:bg-error/10 hover:text-error transition-colors" - > - - Cancel - + /> )} {onDelete && run.status !== 'running' && run.status !== 'pending' && ( - + } + title="Delete workflow run?" + description={ + <> + Permanently delete the run record for {run.workflow_name} and its + events. This cannot be undone. 
+ + } + confirmLabel="Delete" + onConfirm={(): void => { + onDelete(run.id); }} - className="flex items-center gap-1 rounded-md px-2 py-1 text-xs text-text-tertiary hover:bg-error/10 hover:text-error transition-colors" - > - - Delete - + /> )} diff --git a/packages/web/src/components/layout/TopNav.tsx b/packages/web/src/components/layout/TopNav.tsx index 45924f5004..ac1feabde5 100644 --- a/packages/web/src/components/layout/TopNav.tsx +++ b/packages/web/src/components/layout/TopNav.tsx @@ -1,7 +1,7 @@ import { NavLink, Link } from 'react-router'; import { useQuery } from '@tanstack/react-query'; import { LayoutDashboard, MessageSquare, Workflow, Settings } from 'lucide-react'; -import { listWorkflowRuns, getUpdateCheck } from '@/lib/api'; +import { listDashboardRuns, getUpdateCheck } from '@/lib/api'; import { cn } from '@/lib/utils'; const tabs = [ @@ -12,12 +12,15 @@ const tabs = [ ] as const; export function TopNav(): React.ReactElement { - const { data: runningRuns } = useQuery({ - queryKey: ['workflowRuns', { status: 'running' }], - queryFn: () => listWorkflowRuns({ status: 'running', limit: 1 }), + // We only need `counts.running` — a server-side aggregate independent of + // the `runs` array. `limit: 1` minimises the `runs` payload that the API + // returns alongside the counts (we discard it). + const { data: dashboardRuns } = useQuery({ + queryKey: ['dashboardRuns', { status: 'running', forCount: true }], + queryFn: () => listDashboardRuns({ status: 'running', limit: 1 }), refetchInterval: 10_000, }); - const hasRunning = (runningRuns?.length ?? 0) > 0; + const runningCount = dashboardRuns?.counts.running ?? 0; const { data: updateCheck } = useQuery({ queryKey: ['update-check'], @@ -53,8 +56,13 @@ export function TopNav(): React.ReactElement { > {label} - {to === '/dashboard' && hasRunning && ( - + {to === '/dashboard' && runningCount > 0 && ( + + {runningCount} + )} ))}