From 197ecd4a237bd1715c1e0e3b15400f40af0fa690 Mon Sep 17 00:00:00 2001 From: chronark Date: Fri, 3 Oct 2025 18:13:28 +0200 Subject: [PATCH 1/3] chore: deleting dead code (kv package) --- go/pkg/kv/doc.go | 37 --- go/pkg/kv/store.go | 24 -- go/pkg/kv/stores/mysql/db.go | 31 --- go/pkg/kv/stores/mysql/delete.sql.go | 19 -- go/pkg/kv/stores/mysql/delete_expired.sql.go | 25 -- go/pkg/kv/stores/mysql/doc.go | 28 --- go/pkg/kv/stores/mysql/get.sql.go | 35 --- .../kv/stores/mysql/list_by_workspace.sql.go | 62 ----- go/pkg/kv/stores/mysql/models.go | 18 -- go/pkg/kv/stores/mysql/queries/delete.sql | 2 - .../stores/mysql/queries/delete_expired.sql | 2 - go/pkg/kv/stores/mysql/queries/get.sql | 3 - .../mysql/queries/list_by_workspace.sql | 7 - go/pkg/kv/stores/mysql/queries/set.sql | 7 - go/pkg/kv/stores/mysql/schema.sql | 13 -- go/pkg/kv/stores/mysql/set.sql.go | 39 ---- go/pkg/kv/stores/mysql/sqlc.json | 16 -- go/pkg/kv/stores/mysql/store.go | 213 ------------------ 18 files changed, 581 deletions(-) delete mode 100644 go/pkg/kv/doc.go delete mode 100644 go/pkg/kv/store.go delete mode 100644 go/pkg/kv/stores/mysql/db.go delete mode 100644 go/pkg/kv/stores/mysql/delete.sql.go delete mode 100644 go/pkg/kv/stores/mysql/delete_expired.sql.go delete mode 100644 go/pkg/kv/stores/mysql/doc.go delete mode 100644 go/pkg/kv/stores/mysql/get.sql.go delete mode 100644 go/pkg/kv/stores/mysql/list_by_workspace.sql.go delete mode 100644 go/pkg/kv/stores/mysql/models.go delete mode 100644 go/pkg/kv/stores/mysql/queries/delete.sql delete mode 100644 go/pkg/kv/stores/mysql/queries/delete_expired.sql delete mode 100644 go/pkg/kv/stores/mysql/queries/get.sql delete mode 100644 go/pkg/kv/stores/mysql/queries/list_by_workspace.sql delete mode 100644 go/pkg/kv/stores/mysql/queries/set.sql delete mode 100644 go/pkg/kv/stores/mysql/schema.sql delete mode 100644 go/pkg/kv/stores/mysql/set.sql.go delete mode 100644 go/pkg/kv/stores/mysql/sqlc.json delete mode 100644 go/pkg/kv/stores/mysql/store.go 
diff --git a/go/pkg/kv/doc.go b/go/pkg/kv/doc.go deleted file mode 100644 index 10969bdec1..0000000000 --- a/go/pkg/kv/doc.go +++ /dev/null @@ -1,37 +0,0 @@ -// Package kv provides a key-value store abstraction with TTL support and workspace isolation. -// -// The package defines a Store interface that can be implemented by different backends. -// Currently, a MySQL-based implementation is provided in the stores/mysql subpackage. -// -// Key features: -// - Automatic TTL expiration on read operations -// - Workspace-based isolation -// - Cursor-based pagination for listing operations -// - Primary/read-replica database connection support -// - Simple key-value model optimized for performance -// -// Example usage: -// -// import ( -// "github.com/unkeyed/unkey/go/pkg/kv" -// "github.com/unkeyed/unkey/go/pkg/kv/stores/mysql" -// ) -// -// store, err := mysql.NewStore(mysql.Config{ -// PrimaryDSN: "user:pass@tcp(localhost:3306)/db?parseTime=true", -// Logger: logger, -// }) -// if err != nil { -// // handle error -// } -// -// // Set a key with TTL -// ttl := 5 * time.Minute -// err = store.Set(ctx, "user:123", "workspace1", []byte("data"), &ttl) -// -// // Get a key -// data, found, err := store.Get(ctx, "user:123") -// -// // List keys by workspace with cursor pagination -// entries, err := store.ListByWorkspace(ctx, "workspace1", 0, 10) -package kv diff --git a/go/pkg/kv/store.go b/go/pkg/kv/store.go deleted file mode 100644 index 356bb0f924..0000000000 --- a/go/pkg/kv/store.go +++ /dev/null @@ -1,24 +0,0 @@ -package kv - -import ( - "context" - "time" -) - -// Store defines the interface for a key-value store with TTL support -type Store interface { - Get(ctx context.Context, key string) ([]byte, bool, error) - Set(ctx context.Context, key string, workspaceID string, value []byte, ttl *time.Duration) error - Delete(ctx context.Context, key string) error - ListByWorkspace(ctx context.Context, workspaceID string, cursor int64, limit int) ([]KvEntry, error) -} - -// 
KvEntry represents a key-value entry in the store -type KvEntry struct { - ID int64 - Key string - WorkspaceID string - Value []byte - TTL *int64 - CreatedAt int64 -} diff --git a/go/pkg/kv/stores/mysql/db.go b/go/pkg/kv/stores/mysql/db.go deleted file mode 100644 index 03ec3423c3..0000000000 --- a/go/pkg/kv/stores/mysql/db.go +++ /dev/null @@ -1,31 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.28.0 - -package mysql - -import ( - "context" - "database/sql" -) - -type DBTX interface { - ExecContext(context.Context, string, ...interface{}) (sql.Result, error) - PrepareContext(context.Context, string) (*sql.Stmt, error) - QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error) - QueryRowContext(context.Context, string, ...interface{}) *sql.Row -} - -func New(db DBTX) *Queries { - return &Queries{db: db} -} - -type Queries struct { - db DBTX -} - -func (q *Queries) WithTx(tx *sql.Tx) *Queries { - return &Queries{ - db: tx, - } -} diff --git a/go/pkg/kv/stores/mysql/delete.sql.go b/go/pkg/kv/stores/mysql/delete.sql.go deleted file mode 100644 index a907385bea..0000000000 --- a/go/pkg/kv/stores/mysql/delete.sql.go +++ /dev/null @@ -1,19 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.28.0 -// source: delete.sql - -package mysql - -import ( - "context" -) - -const delete = `-- name: Delete :exec -DELETE FROM kv WHERE ` + "`" + `key` + "`" + ` = ? -` - -func (q *Queries) Delete(ctx context.Context, key string) error { - _, err := q.db.ExecContext(ctx, delete, key) - return err -} diff --git a/go/pkg/kv/stores/mysql/delete_expired.sql.go b/go/pkg/kv/stores/mysql/delete_expired.sql.go deleted file mode 100644 index 0229f04753..0000000000 --- a/go/pkg/kv/stores/mysql/delete_expired.sql.go +++ /dev/null @@ -1,25 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. 
-// versions: -// sqlc v1.28.0 -// source: delete_expired.sql - -package mysql - -import ( - "context" - "database/sql" -) - -const deleteExpired = `-- name: DeleteExpired :exec -DELETE FROM kv WHERE ` + "`" + `key` + "`" + ` = ? AND ttl IS NOT NULL AND ttl <= ? -` - -type DeleteExpiredParams struct { - Key string - Ttl sql.NullInt64 -} - -func (q *Queries) DeleteExpired(ctx context.Context, arg DeleteExpiredParams) error { - _, err := q.db.ExecContext(ctx, deleteExpired, arg.Key, arg.Ttl) - return err -} diff --git a/go/pkg/kv/stores/mysql/doc.go b/go/pkg/kv/stores/mysql/doc.go deleted file mode 100644 index d339b61132..0000000000 --- a/go/pkg/kv/stores/mysql/doc.go +++ /dev/null @@ -1,28 +0,0 @@ -// Package mysql provides a MySQL-backed implementation of the kv.Store interface. -// -// This implementation uses sqlc-generated code for type-safe database operations -// and supports both primary and read-replica database connections for optimal -// performance in production environments. -// -// The store automatically handles: -// - TTL expiration by deleting expired keys on read -// - Cursor-based pagination using created_at timestamps -// - Connection routing (reads to replica, writes to primary) -// - Auto-incrementing primary keys for efficient storage -// -// Database schema (inspired by GitHub's approach): -// -// CREATE TABLE kv ( -// id BIGINT(20) NOT NULL AUTO_INCREMENT, -// `key` VARCHAR(255) NOT NULL, -// workspace_id VARCHAR(255) NOT NULL, -// value BLOB NOT NULL, -// ttl BIGINT NULL, -// created_at BIGINT NOT NULL, -// -// PRIMARY KEY (id), -// UNIQUE KEY unique_key (`key`), -// INDEX idx_workspace_id (workspace_id), -// INDEX idx_ttl (ttl) -// ); -package mysql diff --git a/go/pkg/kv/stores/mysql/get.sql.go b/go/pkg/kv/stores/mysql/get.sql.go deleted file mode 100644 index e36a0c9be3..0000000000 --- a/go/pkg/kv/stores/mysql/get.sql.go +++ /dev/null @@ -1,35 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. 
-// versions: -// sqlc v1.28.0 -// source: get.sql - -package mysql - -import ( - "context" - "database/sql" -) - -const get = `-- name: Get :one -SELECT id, workspace_id, ` + "`" + `key` + "`" + `, value, ttl, created_at FROM kv -WHERE ` + "`" + `key` + "`" + ` = ? AND (ttl IS NULL OR ttl > ?) -` - -type GetParams struct { - Key string - Ttl sql.NullInt64 -} - -func (q *Queries) Get(ctx context.Context, arg GetParams) (Kv, error) { - row := q.db.QueryRowContext(ctx, get, arg.Key, arg.Ttl) - var i Kv - err := row.Scan( - &i.ID, - &i.WorkspaceID, - &i.Key, - &i.Value, - &i.Ttl, - &i.CreatedAt, - ) - return i, err -} diff --git a/go/pkg/kv/stores/mysql/list_by_workspace.sql.go b/go/pkg/kv/stores/mysql/list_by_workspace.sql.go deleted file mode 100644 index 1c1ad699bd..0000000000 --- a/go/pkg/kv/stores/mysql/list_by_workspace.sql.go +++ /dev/null @@ -1,62 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.28.0 -// source: list_by_workspace.sql - -package mysql - -import ( - "context" - "database/sql" -) - -const listByWorkspace = `-- name: ListByWorkspace :many -SELECT id, workspace_id, ` + "`" + `key` + "`" + `, value, ttl, created_at FROM kv -WHERE workspace_id = ? -AND (ttl IS NULL OR ttl > ?) -AND id > ? -ORDER BY id ASC -LIMIT ? 
-` - -type ListByWorkspaceParams struct { - WorkspaceID string - Ttl sql.NullInt64 - ID int64 - Limit int32 -} - -func (q *Queries) ListByWorkspace(ctx context.Context, arg ListByWorkspaceParams) ([]Kv, error) { - rows, err := q.db.QueryContext(ctx, listByWorkspace, - arg.WorkspaceID, - arg.Ttl, - arg.ID, - arg.Limit, - ) - if err != nil { - return nil, err - } - defer rows.Close() - var items []Kv - for rows.Next() { - var i Kv - if err := rows.Scan( - &i.ID, - &i.WorkspaceID, - &i.Key, - &i.Value, - &i.Ttl, - &i.CreatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} diff --git a/go/pkg/kv/stores/mysql/models.go b/go/pkg/kv/stores/mysql/models.go deleted file mode 100644 index ff7954c3ff..0000000000 --- a/go/pkg/kv/stores/mysql/models.go +++ /dev/null @@ -1,18 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.28.0 - -package mysql - -import ( - "database/sql" -) - -type Kv struct { - ID int64 - WorkspaceID string - Key string - Value []byte - Ttl sql.NullInt64 - CreatedAt int64 -} diff --git a/go/pkg/kv/stores/mysql/queries/delete.sql b/go/pkg/kv/stores/mysql/queries/delete.sql deleted file mode 100644 index 0887f03d4b..0000000000 --- a/go/pkg/kv/stores/mysql/queries/delete.sql +++ /dev/null @@ -1,2 +0,0 @@ --- name: Delete :exec -DELETE FROM kv WHERE `key` = ?; \ No newline at end of file diff --git a/go/pkg/kv/stores/mysql/queries/delete_expired.sql b/go/pkg/kv/stores/mysql/queries/delete_expired.sql deleted file mode 100644 index 9c681a23d2..0000000000 --- a/go/pkg/kv/stores/mysql/queries/delete_expired.sql +++ /dev/null @@ -1,2 +0,0 @@ --- name: DeleteExpired :exec -DELETE FROM kv WHERE `key` = ? 
AND ttl IS NOT NULL AND ttl <= ?; \ No newline at end of file diff --git a/go/pkg/kv/stores/mysql/queries/get.sql b/go/pkg/kv/stores/mysql/queries/get.sql deleted file mode 100644 index fe0c680b18..0000000000 --- a/go/pkg/kv/stores/mysql/queries/get.sql +++ /dev/null @@ -1,3 +0,0 @@ --- name: Get :one -SELECT * FROM kv -WHERE `key` = ? AND (ttl IS NULL OR ttl > ?); \ No newline at end of file diff --git a/go/pkg/kv/stores/mysql/queries/list_by_workspace.sql b/go/pkg/kv/stores/mysql/queries/list_by_workspace.sql deleted file mode 100644 index 295c50e3a7..0000000000 --- a/go/pkg/kv/stores/mysql/queries/list_by_workspace.sql +++ /dev/null @@ -1,7 +0,0 @@ --- name: ListByWorkspace :many -SELECT * FROM kv -WHERE workspace_id = ? -AND (ttl IS NULL OR ttl > ?) -AND id > ? -ORDER BY id ASC -LIMIT ?; \ No newline at end of file diff --git a/go/pkg/kv/stores/mysql/queries/set.sql b/go/pkg/kv/stores/mysql/queries/set.sql deleted file mode 100644 index 733a7b121d..0000000000 --- a/go/pkg/kv/stores/mysql/queries/set.sql +++ /dev/null @@ -1,7 +0,0 @@ --- name: Set :exec -INSERT INTO kv (`key`, workspace_id, value, ttl, created_at) -VALUES (?, ?, ?, ?, ?) 
-ON DUPLICATE KEY UPDATE - value = VALUES(value), - ttl = VALUES(ttl), - created_at = VALUES(created_at); \ No newline at end of file diff --git a/go/pkg/kv/stores/mysql/schema.sql b/go/pkg/kv/stores/mysql/schema.sql deleted file mode 100644 index cf8e5f1c74..0000000000 --- a/go/pkg/kv/stores/mysql/schema.sql +++ /dev/null @@ -1,13 +0,0 @@ -CREATE TABLE kv ( - id BIGINT(20) NOT NULL AUTO_INCREMENT, - workspace_id VARCHAR(255) NOT NULL, - `key` VARCHAR(255) NOT NULL, - value BLOB NOT NULL, - ttl BIGINT NULL, - created_at BIGINT NOT NULL, - - PRIMARY KEY (id), - UNIQUE KEY unique_key (`key`), - INDEX idx_workspace_id (workspace_id), - INDEX idx_ttl (ttl) -); diff --git a/go/pkg/kv/stores/mysql/set.sql.go b/go/pkg/kv/stores/mysql/set.sql.go deleted file mode 100644 index a58a489437..0000000000 --- a/go/pkg/kv/stores/mysql/set.sql.go +++ /dev/null @@ -1,39 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.28.0 -// source: set.sql - -package mysql - -import ( - "context" - "database/sql" -) - -const set = `-- name: Set :exec -INSERT INTO kv (` + "`" + `key` + "`" + `, workspace_id, value, ttl, created_at) -VALUES (?, ?, ?, ?, ?) -ON DUPLICATE KEY UPDATE - value = VALUES(value), - ttl = VALUES(ttl), - created_at = VALUES(created_at) -` - -type SetParams struct { - Key string - WorkspaceID string - Value []byte - Ttl sql.NullInt64 - CreatedAt int64 -} - -func (q *Queries) Set(ctx context.Context, arg SetParams) error { - _, err := q.db.ExecContext(ctx, set, - arg.Key, - arg.WorkspaceID, - arg.Value, - arg.Ttl, - arg.CreatedAt, - ) - return err -} diff --git a/go/pkg/kv/stores/mysql/sqlc.json b/go/pkg/kv/stores/mysql/sqlc.json deleted file mode 100644 index 07dffd8cb0..0000000000 --- a/go/pkg/kv/stores/mysql/sqlc.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "version": "2", - "sql": [ - { - "engine": "mysql", - "queries": "queries/", - "schema": "schema.sql", - "gen": { - "go": { - "package": "mysql", - "out": "." 
- } - } - } - ] -} diff --git a/go/pkg/kv/stores/mysql/store.go b/go/pkg/kv/stores/mysql/store.go deleted file mode 100644 index a1774a0ae8..0000000000 --- a/go/pkg/kv/stores/mysql/store.go +++ /dev/null @@ -1,213 +0,0 @@ -package mysql - -import ( - "context" - "database/sql" - "fmt" - "strings" - "time" - - _ "github.com/go-sql-driver/mysql" - - "github.com/unkeyed/unkey/go/pkg/fault" - "github.com/unkeyed/unkey/go/pkg/kv" - "github.com/unkeyed/unkey/go/pkg/otel/logging" - "github.com/unkeyed/unkey/go/pkg/retry" -) - -// Config defines the parameters needed to establish database connections. -type Config struct { - // The primary DSN for your database. This must support both reads and writes. - PrimaryDSN string - - // The readonly replica will be used for most read queries. - // If omitted, the primary is used. - ReadOnlyDSN string - - // Logger for database-related operations - Logger logging.Logger -} - -// Store implements the kv.Store interface using MySQL -type Store struct { - primary *sql.DB - readonly *sql.DB - queries *Queries - logger logging.Logger -} - -func open(dsn string, logger logging.Logger) (db *sql.DB, err error) { - if !strings.Contains(dsn, "parseTime=true") { - return nil, fault.New("DSN must contain parseTime=true") - } - - err = retry.New( - retry.Attempts(3), - retry.Backoff(func(n int) time.Duration { - return time.Duration(n) * time.Second - }), - ).Do(func() error { - db, err = sql.Open("mysql", dsn) - if err != nil { - logger.Info("mysql not ready yet, retrying...", "error", err.Error()) - } - return err - }) - - return db, err -} - -// NewStore creates a new MySQL-backed KV store -func NewStore(config Config) (kv.Store, error) { - primary, err := open(config.PrimaryDSN, config.Logger) - if err != nil { - return nil, fault.Wrap(err, fault.Internal("cannot open primary database")) - } - - readonly := primary // Default to primary for reads - if config.ReadOnlyDSN != "" { - readonly, err = open(config.ReadOnlyDSN, config.Logger) - if 
err != nil { - return nil, fault.Wrap(err, fault.Internal("cannot open readonly database")) - } - config.Logger.Info("kv store configured with separate read replica") - } else { - config.Logger.Info("kv store configured without separate read replica, using primary for reads") - } - - return &Store{ - primary: primary, - readonly: readonly, - queries: New(primary), - logger: config.Logger, - }, nil -} - -func (s *Store) Get(ctx context.Context, key string) ([]byte, bool, error) { - now := time.Now().UnixMilli() - - // Use readonly connection for Get operations - queries := New(s.readonly) - row, err := queries.Get(ctx, GetParams{ - Key: key, - Ttl: sql.NullInt64{Int64: now, Valid: true}, - }) - - if err != nil { - if err == sql.ErrNoRows { - return nil, false, nil - } - return nil, false, fmt.Errorf("failed to get key %s: %w", key, err) - } - - // Check if TTL is expired and delete if so - if row.Ttl.Valid && row.Ttl.Int64 <= now { - // Delete the expired key using primary connection - err = s.queries.DeleteExpired(ctx, DeleteExpiredParams{ - Key: key, - Ttl: sql.NullInt64{Int64: now, Valid: true}, - }) - if err != nil { - s.logger.Warn("failed to delete expired key", "key", key, "error", err.Error()) - } - return nil, false, nil - } - - return row.Value, true, nil -} - -func (s *Store) Set(ctx context.Context, key string, workspaceID string, value []byte, ttl *time.Duration) error { - now := time.Now().UnixMilli() - - var ttlValue sql.NullInt64 - if ttl != nil { - ttlMs := now + ttl.Milliseconds() - ttlValue = sql.NullInt64{Int64: ttlMs, Valid: true} - } - - err := s.queries.Set(ctx, SetParams{ - Key: key, - WorkspaceID: workspaceID, - Value: value, - Ttl: ttlValue, - CreatedAt: now, - }) - - if err != nil { - return fmt.Errorf("failed to set key %s: %w", key, err) - } - - return nil -} - -func (s *Store) Delete(ctx context.Context, key string) error { - err := s.queries.Delete(ctx, key) - if err != nil { - return fmt.Errorf("failed to delete key %s: %w", key, err) 
- } - return nil -} - -func (s *Store) ListByWorkspace(ctx context.Context, workspaceID string, cursor int64, limit int) ([]kv.KvEntry, error) { - now := time.Now().UnixMilli() - - // cursor = 0 means start from the beginning (oldest records first) - - // Use readonly connection for List operations - queries := New(s.readonly) - rows, err := queries.ListByWorkspace(ctx, ListByWorkspaceParams{ - WorkspaceID: workspaceID, - Ttl: sql.NullInt64{Int64: now, Valid: true}, - ID: cursor, - Limit: int32(limit), - }) - - if err != nil { - return nil, fmt.Errorf("failed to list by workspace: %w", err) - } - - return s.convertRows(rows) -} - -func (s *Store) convertRows(rows []Kv) ([]kv.KvEntry, error) { - entries := make([]kv.KvEntry, len(rows)) - - for i, row := range rows { - var ttl *int64 - if row.Ttl.Valid { - ttl = &row.Ttl.Int64 - } - - entries[i] = kv.KvEntry{ - ID: row.ID, - Key: row.Key, - WorkspaceID: row.WorkspaceID, - Value: row.Value, - TTL: ttl, - CreatedAt: row.CreatedAt, - } - } - - return entries, nil -} - -// Close closes the database connections -func (s *Store) Close() error { - var errs []error - - if err := s.primary.Close(); err != nil { - errs = append(errs, fmt.Errorf("failed to close primary connection: %w", err)) - } - - if s.readonly != s.primary { - if err := s.readonly.Close(); err != nil { - errs = append(errs, fmt.Errorf("failed to close readonly connection: %w", err)) - } - } - - if len(errs) > 0 { - return fmt.Errorf("errors closing connections: %v", errs) - } - - return nil -} From 0552d9354f8f893963fe5917811a3451c87d3ae6 Mon Sep 17 00:00:00 2001 From: chronark Date: Fri, 3 Oct 2025 18:26:52 +0200 Subject: [PATCH 2/3] chore: remove dead code (hydra) --- .../content/docs/cli/run/ctrl/index.mdx | 59 +- .../docs/infrastructure/database-schema.mdx | 2 +- deployment/Dockerfile.mysql | 1 - deployment/docker-compose.yaml | 1 - deployment/init-databases.sql | 2 - go/pkg/hydra/README.md | 519 --------- go/pkg/hydra/circuit_breaker_test.go | 96 -- 
go/pkg/hydra/complex_workflows_test.go | 535 --------- go/pkg/hydra/cron.go | 229 ---- go/pkg/hydra/data_consistency_test.go | 465 -------- go/pkg/hydra/debug_test.go | 74 -- go/pkg/hydra/doc.go | 252 ----- go/pkg/hydra/engine.go | 318 ------ go/pkg/hydra/engine_test.go | 164 --- go/pkg/hydra/marshaller.go | 58 - go/pkg/hydra/marshaller_test.go | 64 -- go/pkg/hydra/metrics/example_usage.go | 89 -- go/pkg/hydra/metrics/metrics.go | 351 ------ go/pkg/hydra/simple_consistency_test.go | 270 ----- go/pkg/hydra/sleep.go | 133 --- go/pkg/hydra/step.go | 311 ----- go/pkg/hydra/step_atomicity_test.go | 217 ---- go/pkg/hydra/step_idempotency_test.go | 168 --- go/pkg/hydra/store/db.go | 27 - go/pkg/hydra/store/generate.go | 6 - go/pkg/hydra/store/models.go | 245 ---- go/pkg/hydra/store/querier.go | 41 - go/pkg/hydra/store/queries.go | 22 - go/pkg/hydra/store/queries/workflows.sql | 189 ---- go/pkg/hydra/store/schema.sql | 72 -- go/pkg/hydra/store/sqlc.json | 44 - go/pkg/hydra/store/workflows.sql.go | 962 ---------------- go/pkg/hydra/store_coverage_test.go | 184 --- go/pkg/hydra/test_helpers.go | 76 -- go/pkg/hydra/testharness/events.go | 179 --- go/pkg/hydra/worker.go | 1003 ----------------- go/pkg/hydra/worker_heartbeat_test.go | 138 --- go/pkg/hydra/worker_polling_test.go | 314 ------ go/pkg/hydra/workflow.go | 315 ------ go/pkg/hydra/workflow_performance_test.go | 422 ------- 40 files changed, 27 insertions(+), 8590 deletions(-) delete mode 100644 go/pkg/hydra/README.md delete mode 100644 go/pkg/hydra/circuit_breaker_test.go delete mode 100644 go/pkg/hydra/complex_workflows_test.go delete mode 100644 go/pkg/hydra/cron.go delete mode 100644 go/pkg/hydra/data_consistency_test.go delete mode 100644 go/pkg/hydra/debug_test.go delete mode 100644 go/pkg/hydra/doc.go delete mode 100644 go/pkg/hydra/engine.go delete mode 100644 go/pkg/hydra/engine_test.go delete mode 100644 go/pkg/hydra/marshaller.go delete mode 100644 go/pkg/hydra/marshaller_test.go delete mode 100644 
go/pkg/hydra/metrics/example_usage.go delete mode 100644 go/pkg/hydra/metrics/metrics.go delete mode 100644 go/pkg/hydra/simple_consistency_test.go delete mode 100644 go/pkg/hydra/sleep.go delete mode 100644 go/pkg/hydra/step.go delete mode 100644 go/pkg/hydra/step_atomicity_test.go delete mode 100644 go/pkg/hydra/step_idempotency_test.go delete mode 100644 go/pkg/hydra/store/db.go delete mode 100644 go/pkg/hydra/store/generate.go delete mode 100644 go/pkg/hydra/store/models.go delete mode 100644 go/pkg/hydra/store/querier.go delete mode 100644 go/pkg/hydra/store/queries.go delete mode 100644 go/pkg/hydra/store/queries/workflows.sql delete mode 100644 go/pkg/hydra/store/schema.sql delete mode 100644 go/pkg/hydra/store/sqlc.json delete mode 100644 go/pkg/hydra/store/workflows.sql.go delete mode 100644 go/pkg/hydra/store_coverage_test.go delete mode 100644 go/pkg/hydra/test_helpers.go delete mode 100644 go/pkg/hydra/testharness/events.go delete mode 100644 go/pkg/hydra/worker.go delete mode 100644 go/pkg/hydra/worker_heartbeat_test.go delete mode 100644 go/pkg/hydra/worker_polling_test.go delete mode 100644 go/pkg/hydra/workflow.go delete mode 100644 go/pkg/hydra/workflow_performance_test.go diff --git a/apps/engineering/content/docs/cli/run/ctrl/index.mdx b/apps/engineering/content/docs/cli/run/ctrl/index.mdx index 874e9dab4e..304e132607 100644 --- a/apps/engineering/content/docs/cli/run/ctrl/index.mdx +++ b/apps/engineering/content/docs/cli/run/ctrl/index.mdx @@ -10,7 +10,7 @@ unkey run ctrl [flags] ``` -Some flags are required for this command to work properly. + Some flags are required for this command to work properly. ## Flags @@ -21,7 +21,7 @@ HTTP port for the control plane server to listen on. Default: 8080 - **Type:** integer - **Default:** `8080` - **Environment:** `UNKEY_HTTP_PORT` - + Enable colored log output. Default: true @@ -29,21 +29,21 @@ Enable colored log output. 
Default: true - **Type:** boolean - **Default:** `true` - **Environment:** `UNKEY_LOGS_COLOR` - + Cloud platform identifier for this node. Used for logging and metrics. - **Type:** string - **Environment:** `UNKEY_PLATFORM` - + Container image identifier. Used for logging and metrics. - **Type:** string - **Environment:** `UNKEY_IMAGE` - + Geographic region identifier. Used for logging and routing. Default: unknown @@ -51,7 +51,7 @@ Geographic region identifier. Used for logging and routing. Default: unknown - **Type:** string - **Default:** `"unknown"` - **Environment:** `AWS_REGION` - + Unique identifier for this instance. Auto-generated if not provided. @@ -59,28 +59,21 @@ Unique identifier for this instance. Auto-generated if not provided. - **Type:** string - **Default:** `"ins_5PkxT8"` - **Environment:** `UNKEY_INSTANCE_ID` - + MySQL connection string for primary database. Required for all deployments. Example: user:pass@host:3306/unkey?parseTime=true - **Type:** string - **Environment:** `UNKEY_DATABASE_PRIMARY` - + MySQL connection string for partition database. Required for all deployments. Example: user:pass@host:3306/partition_002?parseTime=true - **Type:** string - **Environment:** `UNKEY_DATABASE_PARTITION` - - - -MySQL connection string for hydra database. Required for all deployments. Example: user:pass@host:3306/hydra?parseTime=true - -- **Type:** string -- **Environment:** `UNKEY_DATABASE_HYDRA` - + Enable OpenTelemetry tracing and metrics @@ -88,7 +81,7 @@ Enable OpenTelemetry tracing and metrics - **Type:** boolean - **Default:** `false` - **Environment:** `UNKEY_OTEL` - + Sampling rate for OpenTelemetry traces (0.0-1.0). Only used when --otel is provided. Default: 0.25 @@ -96,42 +89,42 @@ Sampling rate for OpenTelemetry traces (0.0-1.0). Only used when --otel is provi - **Type:** float - **Default:** `0.25` - **Environment:** `UNKEY_OTEL_TRACE_SAMPLING_RATE` - + Path to TLS certificate file for HTTPS. 
Both cert and key must be provided to enable HTTPS. - **Type:** string - **Environment:** `UNKEY_TLS_CERT_FILE` - + Path to TLS key file for HTTPS. Both cert and key must be provided to enable HTTPS. - **Type:** string - **Environment:** `UNKEY_TLS_KEY_FILE` - + Authentication token for control plane API access. Required for secure deployments. - **Type:** string - **Environment:** `UNKEY_AUTH_TOKEN` - + Full URL of the krane service for VM operations. Required for deployments. Example: https://krane.example.com:8080 - **Type:** string - **Environment:** `UNKEY_KRANE_ADDRESS` - + API key for simple authentication (demo purposes only). Will be replaced with JWT authentication. - **Type:** string - **Environment:** `UNKEY_API_KEY` - + Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/agent/agent.sock @@ -139,42 +132,42 @@ Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/age - **Type:** string - **Default:** `"/var/lib/spire/agent/agent.sock"` - **Environment:** `UNKEY_SPIFFE_SOCKET_PATH` - + Vault master keys for encryption - **Type:** string[] - **Environment:** `UNKEY_VAULT_MASTER_KEYS` - + S3 Compatible Endpoint URL - **Type:** string - **Environment:** `UNKEY_VAULT_S3_URL` - + S3 bucket name - **Type:** string - **Environment:** `UNKEY_VAULT_S3_BUCKET` - + S3 access key ID - **Type:** string - **Environment:** `UNKEY_VAULT_S3_ACCESS_KEY_ID` - + S3 secret access key - **Type:** string - **Environment:** `UNKEY_VAULT_S3_ACCESS_KEY_SECRET` - + Enable Let's Encrypt for acme challenges @@ -182,7 +175,7 @@ Enable Let's Encrypt for acme challenges - **Type:** boolean - **Default:** `false` - **Environment:** `UNKEY_ACME_ENABLED` - + Enable Cloudflare for wildcard certificates @@ -190,14 +183,14 @@ Enable Cloudflare for wildcard certificates - **Type:** boolean - **Default:** `false` - **Environment:** `UNKEY_ACME_CLOUDFLARE_ENABLED` - + Cloudflare API token for Let's Encrypt - **Type:** string - **Environment:** 
`UNKEY_ACME_CLOUDFLARE_API_TOKEN` - + Default domain for auto-generated hostnames @@ -205,4 +198,4 @@ Default domain for auto-generated hostnames - **Type:** string - **Default:** `"unkey.app"` - **Environment:** `UNKEY_DEFAULT_DOMAIN` - + diff --git a/apps/engineering/content/docs/infrastructure/database-schema.mdx b/apps/engineering/content/docs/infrastructure/database-schema.mdx index 3862bfa704..963545960d 100644 --- a/apps/engineering/content/docs/infrastructure/database-schema.mdx +++ b/apps/engineering/content/docs/infrastructure/database-schema.mdx @@ -8,7 +8,6 @@ description: "How database schemas are managed and applied in the Unkey platform Unkey uses multiple MySQL databases that are automatically created and initialized during development: - **unkey**: Main application database containing APIs, keys, workspaces, and related data -- **hydra**: Workflow orchestration engine database for managing deployment workflows - **partition_00X**: Dataplane partition database ## Schema Files @@ -17,6 +16,7 @@ Schema definitions are maintained in separate files: - `go/pkg/db/schema.sql` - Main Unkey application schema - `go/pkg/partition/schema.sql` - Dataplane schema + ## Docker Development Setup During local development, schemas are automatically applied via Docker: diff --git a/deployment/Dockerfile.mysql b/deployment/Dockerfile.mysql index 53e5494e3b..5aa2e0ee2d 100644 --- a/deployment/Dockerfile.mysql +++ b/deployment/Dockerfile.mysql @@ -6,7 +6,6 @@ COPY deployment/init-databases.sql /docker-entrypoint-initdb.d/00-init-databases # Copy schemas from their respective packages COPY go/pkg/db/schema.sql /docker-entrypoint-initdb.d/01-main-schema.sql COPY go/pkg/partition/db/schema.sql /docker-entrypoint-initdb.d/02-partition-schema.sql -COPY go/pkg/hydra/store/schema.sql /docker-entrypoint-initdb.d/03-hydra-schema.sql # Copy seed data for local development COPY deployment/04-seed-workspace.sql /docker-entrypoint-initdb.d/04-seed-workspace.sql diff --git 
a/deployment/docker-compose.yaml b/deployment/docker-compose.yaml index e509e093dd..fcc5da5c9d 100644 --- a/deployment/docker-compose.yaml +++ b/deployment/docker-compose.yaml @@ -336,7 +336,6 @@ services: - /var/run/docker.sock:/var/run/docker.sock environment: UNKEY_DATABASE_PRIMARY: "unkey:password@tcp(mysql:3306)/unkey?parseTime=true&interpolateParams=true" - UNKEY_DATABASE_HYDRA: "unkey:password@tcp(mysql:3306)/hydra?parseTime=true&interpolateParams=true" UNKEY_DATABASE_PARTITION: "unkey:password@tcp(mysql:3306)/partition_001?parseTime=true&interpolateParams=true" # Control plane configuration diff --git a/deployment/init-databases.sql b/deployment/init-databases.sql index fee86ce13e..a547aff02b 100644 --- a/deployment/init-databases.sql +++ b/deployment/init-databases.sql @@ -1,6 +1,5 @@ -- Initialize multiple databases for the Unkey deployment platform CREATE DATABASE IF NOT EXISTS unkey; -CREATE DATABASE IF NOT EXISTS hydra; CREATE DATABASE IF NOT EXISTS partition_001; -- Create the unkey user @@ -8,6 +7,5 @@ CREATE USER IF NOT EXISTS 'unkey'@'%' IDENTIFIED BY 'password'; -- Grant permissions to unkey user for all databases GRANT ALL PRIVILEGES ON unkey.* TO 'unkey'@'%'; -GRANT ALL PRIVILEGES ON hydra.* TO 'unkey'@'%'; GRANT ALL PRIVILEGES ON partition_001.* TO 'unkey'@'%'; FLUSH PRIVILEGES; diff --git a/go/pkg/hydra/README.md b/go/pkg/hydra/README.md deleted file mode 100644 index 5c38b8f594..0000000000 --- a/go/pkg/hydra/README.md +++ /dev/null @@ -1,519 +0,0 @@ -# Hydra 🌊 - -> **Distributed workflow orchestration engine for Go** - -Hydra is a robust, scalable workflow orchestration engine designed for reliable execution of multi-step business processes. Built with exactly-once execution guarantees, automatic retries, and comprehensive observability.
- ## Features - 🚀 **Exactly-Once Execution** - Workflows and steps execute exactly once, even with failures -⚡ **Durable State** - All state persisted to database, survives crashes and restarts -🔄 **Automatic Retries** - Configurable retry policies with exponential backoff -📊 **Rich Observability** - Built-in Prometheus metrics and structured logging -⏰ **Flexible Scheduling** - Immediate execution, cron schedules, and sleep states -🏗️ **Distributed Coordination** - Multiple workers with lease-based coordination -🎯 **Type Safety** - Strongly-typed workflows with compile-time guarantees -🔧 **Checkpointing** - Automatic step result caching for fault tolerance - -## Quick Start - -### Installation - -```bash -go get github.com/unkeyed/unkey/go/pkg/hydra -``` - -### Basic Example - -```go -package main - -import ( - "context" - "fmt" - "time" - - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/hydra" - "github.com/unkeyed/unkey/go/pkg/hydra/store/gorm" - "gorm.io/driver/mysql" - gormDriver "gorm.io/gorm" -) - -// Define your workflow -type OrderWorkflow struct{} - -func (w *OrderWorkflow) Name() string { - return "order-processing" -} - -func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error { - // Step 1: Validate payment - payment, err := hydra.Step(ctx, "validate-payment", func(stepCtx context.Context) (*Payment, error) { - return validatePayment(stepCtx, req.PaymentID) - }) - if err != nil { - return err - } - - // Step 2: Reserve inventory - _, err = hydra.Step(ctx, "reserve-inventory", func(stepCtx context.Context) (*Reservation, error) { - return reserveInventory(stepCtx, req.Items) - }) - if err != nil { - return err - } - - // Step 3: Process order - _, err = hydra.Step(ctx, "process-order", func(stepCtx context.Context) (*Order, error) { - return processOrder(stepCtx, payment, req.Items) - }) - - return err -} - -func main() { - // Set up database - db, err :=
gormDriver.Open(mysql.Open("dsn"), &gormDriver.Config{}) - if err != nil { - panic(err) - } - - // Create store - store := hydra.NewGORMStore(db, clock.New()) - - // Create engine - engine := hydra.New(hydra.Config{ - Store: store, - Namespace: "production", - }) - - // Create worker - worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{ - WorkerID: "worker-1", - Concurrency: 10, - }) - if err != nil { - panic(err) - } - - // Register workflow - err = hydra.RegisterWorkflow(worker, &OrderWorkflow{}) - if err != nil { - panic(err) - } - - // Start worker - ctx := context.Background() - err = worker.Start(ctx) - if err != nil { - panic(err) - } - defer worker.Shutdown(ctx) - - // Submit workflow - executionID, err := engine.StartWorkflow(ctx, "order-processing", &OrderRequest{ - CustomerID: "cust_123", - Items: []Item{{SKU: "item_456", Quantity: 2}}, - PaymentID: "pay_789", - }) - if err != nil { - panic(err) - } - - fmt.Printf("Started workflow: %s\n", executionID) -} -``` - -## Core Concepts - -### Engine -The central orchestration component that manages workflow lifecycle and coordinates execution across workers. - -```go -engine := hydra.New(hydra.Config{ - Store: store, - Namespace: "production", - Logger: logger, -}) -``` - -### Workers -Distributed processing units that poll for workflows, acquire leases, and execute workflow logic. - -```go -worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{ - WorkerID: "worker-1", - Concurrency: 20, - PollInterval: 100 * time.Millisecond, - HeartbeatInterval: 30 * time.Second, - ClaimTimeout: 5 * time.Minute, -}) -``` - -### Workflows -Business logic containers that define a series of steps with exactly-once execution guarantees. 
- -```go -type MyWorkflow struct{} - -func (w *MyWorkflow) Name() string { return "my-workflow" } - -func (w *MyWorkflow) Run(ctx hydra.WorkflowContext, req *MyRequest) error { - // Implement your business logic using hydra.Step() - return nil -} -``` - -### Steps -Individual units of work with automatic checkpointing and retry logic. - -```go -result, err := hydra.Step(ctx, "api-call", func(stepCtx context.Context) (*APIResponse, error) { - return apiClient.Call(stepCtx, request) -}) -``` - -## Advanced Features - -### Sleep States -Suspend workflows for time-based coordination: - -```go -// Sleep for 24 hours for manual approval -err = hydra.Sleep(ctx, 24*time.Hour) -if err != nil { - return err -} - -// Continue after sleep -result, err := hydra.Step(ctx, "post-approval", func(stepCtx context.Context) (string, error) { - return processApprovedRequest(stepCtx) -}) -``` - -### Cron Scheduling -Schedule workflows to run automatically: - -```go -err = engine.RegisterCron("0 0 * * *", "daily-report", func(ctx context.Context) error { - // Generate daily report - return generateDailyReport(ctx) -}) -``` - -### Error Handling & Retries -Configure retry behavior per workflow: - -```go -executionID, err := engine.StartWorkflow(ctx, "order-processing", request, - hydra.WithMaxAttempts(5), - hydra.WithRetryBackoff(2*time.Second), - hydra.WithTimeout(10*time.Minute), -) -``` - -### Custom Marshallers -Use custom serialization formats: - -```go -type ProtobufMarshaller struct{} - -func (p *ProtobufMarshaller) Marshal(v any) ([]byte, error) { - // Implement protobuf marshalling -} - -func (p *ProtobufMarshaller) Unmarshal(data []byte, v any) error { - // Implement protobuf unmarshalling -} - -engine := hydra.New(hydra.Config{ - Store: store, - Marshaller: &ProtobufMarshaller{}, -}) -``` - -## Observability - -### Prometheus Metrics - -Hydra provides comprehensive metrics out of the box: - -**Workflow Metrics:** -- `hydra_workflows_started_total` - Total workflows started -- 
`hydra_workflows_completed_total` - Total workflows completed/failed -- `hydra_workflow_duration_seconds` - Workflow execution time -- `hydra_workflow_queue_time_seconds` - Time spent waiting for execution -- `hydra_workflows_active` - Currently running workflows per worker - -**Step Metrics:** -- `hydra_steps_executed_total` - Total steps executed with status -- `hydra_step_duration_seconds` - Individual step execution time -- `hydra_steps_cached_total` - Steps served from checkpoint cache -- `hydra_steps_retried_total` - Step retry attempts - -**Worker Metrics:** -- `hydra_worker_polls_total` - Worker polling operations -- `hydra_worker_heartbeats_total` - Worker heartbeat operations -- `hydra_lease_acquisitions_total` - Workflow lease acquisitions -- `hydra_worker_concurrency_current` - Current workflow concurrency per worker - -### Example Grafana Queries - -```promql -# Workflow throughput -rate(hydra_workflows_completed_total[5m]) - -# Average workflow duration -rate(hydra_workflow_duration_seconds_sum[5m]) / rate(hydra_workflow_duration_seconds_count[5m]) - -# Step cache hit rate -rate(hydra_steps_cached_total[5m]) / rate(hydra_steps_executed_total[5m]) - -# Worker utilization -hydra_workflows_active / hydra_worker_concurrency_current -``` - -## Architecture - -Hydra uses a lease-based coordination model for distributed execution: - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Worker 1 β”‚ β”‚ Worker 2 β”‚ β”‚ Worker N β”‚ -β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Poll β”‚ β”‚ β”‚ β”‚ Poll β”‚ β”‚ β”‚ β”‚ Poll β”‚ β”‚ -β”‚ β”‚ Execute β”‚ β”‚ β”‚ β”‚ Execute β”‚ β”‚ β”‚ β”‚ Execute β”‚ β”‚ -β”‚ β”‚ Heartbeatβ”‚ β”‚ β”‚ β”‚ Heartbeatβ”‚ β”‚ β”‚ β”‚ Heartbeatβ”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ β”‚ β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Database β”‚ - β”‚ β”‚ - β”‚ β€’ Workflows β”‚ - β”‚ β€’ Steps β”‚ - β”‚ β€’ Leases β”‚ - β”‚ β€’ Cron Jobs β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -1. **Workers poll** the database for pending workflows -2. **Workers acquire leases** on available workflows for exclusive execution -3. **Workers execute** workflow logic with step-by-step checkpointing -4. **Workers send heartbeats** to maintain lease ownership -5. **Completed workflows** update status and release leases - -## Database Schema - -Hydra requires the following tables (auto-migrated with GORM): - -```sql --- Workflow executions -CREATE TABLE workflow_executions ( - id VARCHAR(255) PRIMARY KEY, - workflow_name VARCHAR(255) NOT NULL, - status VARCHAR(50) NOT NULL, - namespace VARCHAR(255) NOT NULL, - input_data LONGBLOB, - output_data LONGBLOB, - error_message TEXT, - max_attempts INT NOT NULL, - remaining_attempts INT NOT NULL, - created_at BIGINT NOT NULL, - started_at BIGINT, - completed_at BIGINT, - trigger_type VARCHAR(50), - trigger_source VARCHAR(255), - INDEX idx_workflow_executions_status_namespace (status, namespace), - INDEX idx_workflow_executions_workflow_name (workflow_name) -); - --- Workflow steps -CREATE TABLE workflow_steps ( - id VARCHAR(255) PRIMARY KEY, - execution_id VARCHAR(255) NOT NULL, - step_name VARCHAR(255) NOT NULL, - step_order INT NOT NULL, - status VARCHAR(50) NOT NULL, - namespace VARCHAR(255) NOT NULL, - input_data LONGBLOB, - output_data LONGBLOB, - error_message TEXT, - max_attempts INT NOT NULL, - remaining_attempts 
INT NOT NULL, - started_at BIGINT, - completed_at BIGINT, - UNIQUE KEY unique_execution_step (execution_id, step_name), - INDEX idx_workflow_steps_execution_id (execution_id) -); - --- Leases for coordination -CREATE TABLE leases ( - resource_id VARCHAR(255) PRIMARY KEY, - kind VARCHAR(50) NOT NULL, - namespace VARCHAR(255) NOT NULL, - worker_id VARCHAR(255) NOT NULL, - acquired_at BIGINT NOT NULL, - expires_at BIGINT NOT NULL, - heartbeat_at BIGINT NOT NULL, - INDEX idx_leases_expires_at (expires_at), - INDEX idx_leases_worker_id (worker_id) -); - --- Cron jobs -CREATE TABLE cron_jobs ( - id VARCHAR(255) PRIMARY KEY, - name VARCHAR(255) NOT NULL, - cron_spec VARCHAR(255) NOT NULL, - namespace VARCHAR(255) NOT NULL, - workflow_name VARCHAR(255), - enabled BOOLEAN NOT NULL DEFAULT TRUE, - created_at BIGINT NOT NULL, - updated_at BIGINT NOT NULL, - next_run_at BIGINT NOT NULL, - UNIQUE KEY unique_namespace_name (namespace, name), - INDEX idx_cron_jobs_next_run_at (next_run_at, enabled) -); -``` - -## Performance Considerations - -### Scaling Workers -- **Horizontal scaling**: Add more worker instances -- **Vertical scaling**: Increase concurrency per worker -- **Database optimization**: Ensure proper indexing and connection pooling - -### Optimizing Workflows -- **Idempotent steps**: Ensure steps can be safely retried -- **Minimize step payload size**: Reduce serialization overhead -- **Batch operations**: Combine multiple operations in single steps -- **Use appropriate timeouts**: Balance responsiveness vs. 
reliability - -### Database Tuning -```sql --- Recommended indexes for performance -CREATE INDEX idx_workflow_executions_polling -ON workflow_executions (status, namespace, created_at); - -CREATE INDEX idx_leases_cleanup -ON leases (expires_at); - -CREATE INDEX idx_workflow_steps_execution_order -ON workflow_steps (execution_id, step_order); -``` - -## Best Practices - -### Workflow Design -- ✅ **Keep workflows stateless** - Store state in steps, not workflow instances -- ✅ **Make steps idempotent** - Steps should be safe to retry -- ✅ **Use descriptive step names** - Names should be stable across deployments -- ✅ **Handle errors gracefully** - Distinguish between retryable and permanent errors -- ✅ **Minimize external dependencies** - Use timeouts and circuit breakers - -### Production Deployment -- ✅ **Monitor metrics** - Set up alerts for error rates and latency -- ✅ **Configure retries** - Set appropriate retry policies for your use case -- ✅ **Database backup** - Ensure workflow state is backed up -- ✅ **Graceful shutdown** - Handle SIGTERM to finish active workflows -- ✅ **Resource limits** - Set memory and CPU limits for workers - -## Examples - -### Order Processing Workflow -```go -type OrderWorkflow struct { - paymentService PaymentService - inventoryService InventoryService - shippingService ShippingService -} - -func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error { - // Validate and charge payment - payment, err := hydra.Step(ctx, "process-payment", func(stepCtx context.Context) (*Payment, error) { - return w.paymentService.ProcessPayment(stepCtx, &PaymentRequest{ - Amount: req.TotalAmount, - Method: req.PaymentMethod, - Customer: req.CustomerID, - }) - }) - if err != nil { - return err - } - - // Reserve inventory - reservation, err := hydra.Step(ctx, "reserve-inventory", func(stepCtx context.Context) (*Reservation, error) { - return w.inventoryService.ReserveItems(stepCtx, req.Items) - }) - if err != nil 
{ - // Refund payment on inventory failure - hydra.Step(ctx, "refund-payment", func(stepCtx context.Context) (any, error) { - return nil, w.paymentService.RefundPayment(stepCtx, payment.ID) - }) - return err - } - - // Create shipping label - _, err = hydra.Step(ctx, "create-shipping", func(stepCtx context.Context) (*ShippingLabel, error) { - return w.shippingService.CreateLabel(stepCtx, &ShippingRequest{ - Address: req.ShippingAddress, - Items: req.Items, - Reservation: reservation.ID, - }) - }) - - return err -} -``` - -### Approval Workflow with Sleep -```go -func (w *ApprovalWorkflow) Run(ctx hydra.WorkflowContext, req *ApprovalRequest) error { - // Submit for review - _, err := hydra.Step(ctx, "submit-review", func(stepCtx context.Context) (*Review, error) { - return w.reviewService.SubmitForReview(stepCtx, req) - }) - if err != nil { - return err - } - - // Sleep for 48 hours to allow manual review - err = hydra.Sleep(ctx, 48*time.Hour) - if err != nil { - return err - } - - // Check approval status - approval, err := hydra.Step(ctx, "check-approval", func(stepCtx context.Context) (*Approval, error) { - return w.reviewService.GetApprovalStatus(stepCtx, req.ID) - }) - if err != nil { - return err - } - - if approval.Status == "approved" { - // Process approved request - _, err = hydra.Step(ctx, "process-approved", func(stepCtx context.Context) (any, error) { - return nil, w.processApprovedRequest(stepCtx, req) - }) - } - - return err -} -``` - -## Contributing - -We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details. - -## License - -This project is licensed under the MIT License - see the [LICENSE](../../LICENSE) file for details. - ---- - -**Need help?** Check out our [documentation](https://docs.unkey.com) or join our [Discord community](https://discord.gg/unkey). 
\ No newline at end of file diff --git a/go/pkg/hydra/circuit_breaker_test.go b/go/pkg/hydra/circuit_breaker_test.go deleted file mode 100644 index aa575ab22f..0000000000 --- a/go/pkg/hydra/circuit_breaker_test.go +++ /dev/null @@ -1,96 +0,0 @@ -package hydra - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" -) - -// TestCircuitBreakerIntegration verifies that circuit breakers are properly -// integrated into the worker and protect database operations -func TestCircuitBreakerIntegration(t *testing.T) { - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - // Create worker with circuit breaker protection - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: "circuit-breaker-test-worker", - Concurrency: 1, - PollInterval: 100 * time.Millisecond, - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - require.NoError(t, err) - - // Register a simple workflow - circuitTestWorkflow := &circuitBreakerTestWorkflow{ - engine: engine, - name: "circuit-breaker-workflow", - } - - err = RegisterWorkflow(worker, circuitTestWorkflow) - require.NoError(t, err) - - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - - // Start worker - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - // Create a workflow to test circuit breaker protection - executionID, err := circuitTestWorkflow.Start(ctx, struct{}{}) - require.NoError(t, err) - require.NotEmpty(t, executionID) - - // Advance time to trigger worker polling - for i := 0; i < 5; i++ { - testClock.Tick(200 * time.Millisecond) - time.Sleep(10 * time.Millisecond) - } - - // Verify workflow was processed (circuit breaker didn't block) - finalWorkflow := waitForWorkflowCompletion(t, engine, executionID, 3*time.Second) - require.NotNil(t, finalWorkflow) - -} - -// TestCircuitBreakerCompilation ensures the circuit breaker 
types compile correctly -func TestCircuitBreakerCompilation(t *testing.T) { - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - // This test primarily ensures compilation works - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: "compilation-test-worker", - Concurrency: 1, - }) - require.NoError(t, err) - require.NotNil(t, worker) - -} - -// circuitBreakerTestWorkflow is a minimal workflow for testing circuit breaker integration -type circuitBreakerTestWorkflow struct { - engine *Engine - name string -} - -func (w *circuitBreakerTestWorkflow) Name() string { - return w.name -} - -func (w *circuitBreakerTestWorkflow) Run(ctx WorkflowContext, req any) error { - _, err := Step(ctx, "circuit-breaker-step", func(context.Context) (string, error) { - return "protected", nil - }) - return err -} - -func (w *circuitBreakerTestWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/complex_workflows_test.go b/go/pkg/hydra/complex_workflows_test.go deleted file mode 100644 index c391d49cc6..0000000000 --- a/go/pkg/hydra/complex_workflows_test.go +++ /dev/null @@ -1,535 +0,0 @@ -package hydra - -import ( - "context" - "fmt" - "math/rand" - "sync" - "sync/atomic" - "time" -) - -// ComplexBillingWorkflow simulates a realistic billing workflow with multiple steps, -// error handling, retries, and conditional logic -type ComplexBillingWorkflow struct { - engine *Engine - name string - failureRate float64 // Probability of step failure (0.0-1.0) - chaosEnabled bool - metrics *WorkflowMetrics -} - -// WorkflowMetrics tracks detailed execution metrics -type WorkflowMetrics struct { - StepsExecuted atomic.Int64 - StepsRetried atomic.Int64 - StepsFailed atomic.Int64 - WorkflowsCompleted atomic.Int64 - WorkflowsFailed atomic.Int64 - TotalDuration atomic.Int64 // in milliseconds - mu sync.RWMutex - StepDurations map[string][]time.Duration -} - 
-func NewWorkflowMetrics() *WorkflowMetrics { - return &WorkflowMetrics{ - StepDurations: make(map[string][]time.Duration), - } -} - -func (m *WorkflowMetrics) RecordStepDuration(stepName string, duration time.Duration) { - m.mu.Lock() - defer m.mu.Unlock() - m.StepDurations[stepName] = append(m.StepDurations[stepName], duration) -} - -func (w *ComplexBillingWorkflow) Name() string { - return w.name -} - -func (w *ComplexBillingWorkflow) Run(ctx WorkflowContext, req any) error { - startTime := time.Now() - defer func() { - w.metrics.TotalDuration.Add(time.Since(startTime).Milliseconds()) - }() - - // Step 1: Validate customer data - customerID, err := Step(ctx, "validate-customer", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - if w.shouldFail("validate-customer") { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("customer validation failed") - } - - // Simulate API call - time.Sleep(time.Duration(rand.Intn(50)+10) * time.Millisecond) - return "customer-123", nil - }) - - if err != nil { - // Retry with exponential backoff - w.metrics.StepsRetried.Add(1) - time.Sleep(100 * time.Millisecond) - - customerID, err = Step(ctx, "validate-customer-retry", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - time.Sleep(time.Duration(rand.Intn(30)+20) * time.Millisecond) - return "customer-123", nil - }) - - if err != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("customer validation failed after retry: %w", err) - } - } - - // Step 2: Calculate invoice amount (parallel with usage fetch) - var invoiceAmount float64 - - // Use goroutines to simulate parallel step execution - var wg sync.WaitGroup - var calcErr, usageErr error - - wg.Add(2) - - // Calculate invoice in parallel - go func() { - defer wg.Done() - var amountStr string - amountStr, calcErr = Step(ctx, "calculate-invoice", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - if 
w.shouldFail("calculate-invoice") { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("invoice calculation error") - } - - // Simulate complex calculation - time.Sleep(time.Duration(rand.Intn(100)+50) * time.Millisecond) - amount := float64(rand.Intn(10000)+100) / 100.0 - return fmt.Sprintf("%.2f", amount), nil - }) - - if calcErr == nil { - fmt.Sscanf(amountStr, "%f", &invoiceAmount) - } - err = calcErr - }() - - // Fetch usage data in parallel - go func() { - defer wg.Done() - _, fetchErr := Step(ctx, "fetch-usage-data", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - if w.shouldFail("fetch-usage-data") { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("usage data fetch failed") - } - - // Simulate database query - time.Sleep(time.Duration(rand.Intn(80)+30) * time.Millisecond) - return fmt.Sprintf("usage-%d-units", rand.Intn(1000)), nil - }) - - usageErr = fetchErr - }() - - wg.Wait() - - if calcErr != nil || usageErr != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("parallel steps failed: calc=%v, usage=%v", calcErr, usageErr) - } - - // Step 3: Apply discounts (conditional) - if invoiceAmount > 100 { - discountedAmount, discountErr := Step(ctx, "apply-discounts", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - if w.shouldFail("apply-discounts") { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("discount calculation failed") - } - - // Simulate discount calculation - time.Sleep(time.Duration(rand.Intn(40)+10) * time.Millisecond) - discount := invoiceAmount * 0.1 - return fmt.Sprintf("%.2f", invoiceAmount-discount), nil - }) - - if discountErr == nil { - fmt.Sscanf(discountedAmount, "%f", &invoiceAmount) - } - } - - // Step 4: Generate PDF invoice - _, err = Step(ctx, "generate-pdf", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - if w.shouldFail("generate-pdf") { - w.metrics.StepsFailed.Add(1) - return "", 
fmt.Errorf("PDF generation failed") - } - - // Simulate PDF generation (slow operation) - time.Sleep(time.Duration(rand.Intn(200)+100) * time.Millisecond) - return fmt.Sprintf("https://invoices.example.com/%s.pdf", customerID), nil - }) - - if err != nil { - // Non-critical failure, continue - // PDF generation is optional - _ = err // Intentionally ignored - } - - // Step 5: Send invoice email - _, err = Step(ctx, "send-email", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - if w.shouldFail("send-email") { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("email sending failed") - } - - // Simulate email API call - time.Sleep(time.Duration(rand.Intn(60)+20) * time.Millisecond) - return fmt.Sprintf("email-sent-to-%s", customerID), nil - }) - - if err != nil { - // Retry email sending - w.metrics.StepsRetried.Add(1) - _, retryErr := Step(ctx, "send-email-retry", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - time.Sleep(time.Duration(rand.Intn(40)+20) * time.Millisecond) - return "email-sent-on-retry", nil - }) - - if retryErr != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("email sending failed after retry: %w", retryErr) - } - } - - // Step 6: Update billing status - _, err = Step(ctx, "update-billing-status", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - if w.shouldFail("update-billing-status") { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("status update failed") - } - - // Simulate database update - time.Sleep(time.Duration(rand.Intn(30)+10) * time.Millisecond) - return "status-updated", nil - }) - - if err != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("billing status update failed: %w", err) - } - - w.metrics.WorkflowsCompleted.Add(1) - return nil -} - -func (w *ComplexBillingWorkflow) shouldFail(stepName string) bool { - if !w.chaosEnabled { - return false - } - - // Introduce targeted chaos for 
specific steps - failureRates := map[string]float64{ - "validate-customer": w.failureRate * 0.5, // Less likely to fail - "calculate-invoice": w.failureRate, - "fetch-usage-data": w.failureRate * 1.2, // More likely to fail - "generate-pdf": w.failureRate * 2.0, // Much more likely to fail - "send-email": w.failureRate * 1.5, - "update-billing-status": w.failureRate * 0.8, - } - - rate, ok := failureRates[stepName] - if !ok { - rate = w.failureRate - } - - return rand.Float64() < rate -} - -func (w *ComplexBillingWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// ComplexDataPipelineWorkflow simulates a data processing pipeline with -// conditional branching, loops, and complex error handling -type ComplexDataPipelineWorkflow struct { - engine *Engine - name string - chaosEnabled bool - metrics *WorkflowMetrics -} - -func (w *ComplexDataPipelineWorkflow) Name() string { - return w.name -} - -func (w *ComplexDataPipelineWorkflow) Run(ctx WorkflowContext, req any) error { - // Step 1: Fetch data sources - sources, err := Step(ctx, "fetch-data-sources", func(stepCtx context.Context) ([]string, error) { - w.metrics.StepsExecuted.Add(1) - - // Simulate fetching multiple data sources - time.Sleep(time.Duration(rand.Intn(50)+20) * time.Millisecond) - - numSources := rand.Intn(5) + 3 - sources := make([]string, numSources) - for i := 0; i < numSources; i++ { - sources[i] = fmt.Sprintf("source-%d", i) - } - return sources, nil - }) - - if err != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("failed to fetch data sources: %w", err) - } - - // Step 2: Process each source (loop with error handling) - var processedCount int - for i, source := range sources { - stepName := fmt.Sprintf("process-source-%d", i) - - _, stepErr := Step(ctx, stepName, func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - // Simulate processing with variable duration - 
processingTime := time.Duration(rand.Intn(100)+50) * time.Millisecond - time.Sleep(processingTime) - - // Random failures - if w.chaosEnabled && rand.Float64() < 0.2 { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("processing failed for %s", source) - } - - return fmt.Sprintf("processed-%s", source), nil - }) - - if stepErr != nil { - // Continue processing other sources - continue - } - processedCount++ - } - - // Step 3: Validate processing results - if processedCount < len(sources)/2 { - // Too many failures, trigger cleanup - _, cleanupErr := Step(ctx, "cleanup-failed-processing", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - time.Sleep(50 * time.Millisecond) - return "cleanup-complete", nil - }) - - if cleanupErr != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("cleanup failed: %w", cleanupErr) - } - - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("too many source processing failures: %d/%d", processedCount, len(sources)) - } - - // Step 4: Aggregate results - _, err = Step(ctx, "aggregate-results", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - // Simulate complex aggregation - time.Sleep(time.Duration(rand.Intn(150)+100) * time.Millisecond) - - if w.chaosEnabled && rand.Float64() < 0.1 { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("aggregation failed") - } - - return fmt.Sprintf("aggregated-%d-results", processedCount), nil - }) - - if err != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("result aggregation failed: %w", err) - } - - // Step 5: Publish results (with circuit breaker pattern) - var publishAttempts int - for publishAttempts < 3 { - _, err = Step(ctx, fmt.Sprintf("publish-attempt-%d", publishAttempts), func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - publishAttempts++ - - // Simulate flaky external service - if w.chaosEnabled && rand.Float64() < 0.4 { - w.metrics.StepsFailed.Add(1) - 
return "", fmt.Errorf("publish service unavailable") - } - - time.Sleep(time.Duration(rand.Intn(80)+40) * time.Millisecond) - return "published-successfully", nil - }) - - if err == nil { - break - } - - // Exponential backoff - w.metrics.StepsRetried.Add(1) - time.Sleep(time.Duration(publishAttempts*100) * time.Millisecond) - } - - if err != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("failed to publish after %d attempts: %w", publishAttempts, err) - } - - w.metrics.WorkflowsCompleted.Add(1) - return nil -} - -func (w *ComplexDataPipelineWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// ComplexStateMachineWorkflow tests complex state transitions and decision points -type ComplexStateMachineWorkflow struct { - engine *Engine - name string - chaosEnabled bool - metrics *WorkflowMetrics -} - -func (w *ComplexStateMachineWorkflow) Name() string { - return w.name -} - -func (w *ComplexStateMachineWorkflow) Run(ctx WorkflowContext, req any) error { - // Initialize with random state - initialState := rand.Intn(3) - - // Step 1: Determine initial action based on state - action, err := Step(ctx, "determine-initial-action", func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - actions := []string{"process", "review", "escalate"} - return actions[initialState], nil - }) - - if err != nil { - w.metrics.WorkflowsFailed.Add(1) - return err - } - - // Step 2: Execute state machine transitions - currentState := action - transitions := 0 - maxTransitions := 10 - - for transitions < maxTransitions { - nextState, transitionErr := Step(ctx, fmt.Sprintf("transition-%d-from-%s", transitions, currentState), - func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - // Simulate state transition logic - time.Sleep(time.Duration(rand.Intn(50)+20) * time.Millisecond) - - // Random transition failures - if w.chaosEnabled && 
rand.Float64() < 0.15 { - w.metrics.StepsFailed.Add(1) - return "", fmt.Errorf("transition failed from %s", currentState) - } - - // State transition rules - switch currentState { - case "process": - if rand.Float64() < 0.7 { - return "review", nil - } - return "escalate", nil - case "review": - if rand.Float64() < 0.5 { - return "approve", nil - } else if rand.Float64() < 0.8 { - return "reject", nil - } - return "process", nil - case "escalate": - if rand.Float64() < 0.6 { - return "review", nil - } - return "terminate", nil - case "approve", "reject", "terminate": - return currentState, nil // Terminal states - default: - return "error", nil - } - }) - - if transitionErr != nil { - // Handle transition failure - _, recoveryErr := Step(ctx, fmt.Sprintf("recover-transition-%d", transitions), - func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - w.metrics.StepsRetried.Add(1) - time.Sleep(30 * time.Millisecond) - return "review", nil // Safe state - }) - - if recoveryErr != nil { - w.metrics.WorkflowsFailed.Add(1) - return fmt.Errorf("state machine recovery failed: %w", recoveryErr) - } - nextState = "review" - } - - currentState = nextState - transitions++ - - // Check for terminal states - if currentState == "approve" || currentState == "reject" || currentState == "terminate" { - break - } - } - - // Step 3: Finalize based on terminal state - _, err = Step(ctx, fmt.Sprintf("finalize-%s", currentState), func(stepCtx context.Context) (string, error) { - w.metrics.StepsExecuted.Add(1) - - switch currentState { - case "approve": - time.Sleep(80 * time.Millisecond) - return "approved-and-processed", nil - case "reject": - time.Sleep(40 * time.Millisecond) - return "rejected-and-notified", nil - case "terminate": - time.Sleep(20 * time.Millisecond) - return "terminated-with-cleanup", nil - default: - return "", fmt.Errorf("invalid terminal state: %s", currentState) - } - }) - - if err != nil { - w.metrics.WorkflowsFailed.Add(1) - return 
fmt.Errorf("finalization failed: %w", err) - } - - w.metrics.WorkflowsCompleted.Add(1) - return nil -} - -func (w *ComplexStateMachineWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/cron.go b/go/pkg/hydra/cron.go deleted file mode 100644 index 4477f55421..0000000000 --- a/go/pkg/hydra/cron.go +++ /dev/null @@ -1,229 +0,0 @@ -package hydra - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "strconv" - "strings" - "time" -) - -// CronHandler defines the function signature for cron job handlers -type CronHandler func(ctx context.Context, payload CronPayload) error - -type CronPayload struct { - CronJobID string `json:"cron_job_id"` - CronName string `json:"cron_name"` - ScheduledAt int64 `json:"scheduled_at"` // When this execution was scheduled - ActualRunAt int64 `json:"actual_run_at"` // When it actually ran - Namespace string `json:"namespace"` -} - -func (p CronPayload) Marshal() ([]byte, error) { - return json.Marshal(p) -} - -func (p *CronPayload) Unmarshal(data []byte) error { - return json.Unmarshal(data, p) -} - -func calculateNextRun(cronSpec string, from time.Time) int64 { - schedule, err := parseCronSpec(cronSpec) - if err != nil { - return from.Add(5 * time.Minute).UnixMilli() - } - - next := schedule.next(from) - return next.UnixMilli() -} - -type cronSchedule struct { - minute uint64 // bits 0-59 - hour uint64 // bits 0-23 - dom uint64 // bits 1-31, day of month - month uint64 // bits 1-12 - dow uint64 // bits 0-6, day of week (0=Sunday) -} - -func parseCronSpec(spec string) (*cronSchedule, error) { - fields := strings.Fields(spec) - if len(fields) != 5 { - return nil, errors.New("cron spec must have 5 fields") - } - - minute, err := parseField(fields[0], 0, 59) - if err != nil { - return nil, fmt.Errorf("invalid minute field: %w", err) - } - - hour, err := parseField(fields[1], 0, 23) - if err != nil { - return nil, fmt.Errorf("invalid 
hour field: %w", err) - } - - dom, err := parseField(fields[2], 1, 31) - if err != nil { - return nil, fmt.Errorf("invalid day of month field: %w", err) - } - - month, err := parseField(fields[3], 1, 12) - if err != nil { - return nil, fmt.Errorf("invalid month field: %w", err) - } - - dow, err := parseField(fields[4], 0, 6) - if err != nil { - return nil, fmt.Errorf("invalid day of week field: %w", err) - } - - return &cronSchedule{ - minute: minute, - hour: hour, - dom: dom, - month: month, - dow: dow, - }, nil -} - -func parseField(field string, minimum, maximum int) (uint64, error) { - if field == "*" { - var mask uint64 - for i := minimum; i <= maximum; i++ { - mask |= 1 << i - } - return mask, nil - } - - parts := strings.Split(field, ",") - var mask uint64 - - for _, part := range parts { - // nolint:nestif - if strings.Contains(part, "/") { - stepParts := strings.Split(part, "/") - if len(stepParts) != 2 { - return 0, errors.New("invalid step syntax") - } - - step, err := strconv.Atoi(stepParts[1]) - if err != nil || step <= 0 { - return 0, errors.New("invalid step value") - } - - rangeStart := minimum - rangeEnd := maximum - - if stepParts[0] != "*" { - if strings.Contains(stepParts[0], "-") { - rangeParts := strings.Split(stepParts[0], "-") - if len(rangeParts) != 2 { - return 0, errors.New("invalid range syntax") - } - rangeStart, err = strconv.Atoi(rangeParts[0]) - if err != nil || rangeStart < minimum || rangeStart > maximum { - return 0, errors.New("invalid range start") - } - rangeEnd, err = strconv.Atoi(rangeParts[1]) - if err != nil || rangeEnd < minimum || rangeEnd > maximum { - return 0, errors.New("invalid range end") - } - } else { - rangeStart, err = strconv.Atoi(stepParts[0]) - if err != nil || rangeStart < minimum || rangeStart > maximum { - return 0, errors.New("invalid step start value") - } - rangeEnd = rangeStart - } - } - - for i := rangeStart; i <= rangeEnd; i += step { - mask |= 1 << i - } - - } else if strings.Contains(part, "-") { - 
rangeParts := strings.Split(part, "-") - if len(rangeParts) != 2 { - return 0, errors.New("invalid range syntax") - } - - start, err := strconv.Atoi(rangeParts[0]) - if err != nil || start < minimum || start > maximum { - return 0, errors.New("invalid range start") - } - - end, err := strconv.Atoi(rangeParts[1]) - if err != nil || end < minimum || end > maximum { - return 0, errors.New("invalid range end") - } - - for i := start; i <= end; i++ { - mask |= 1 << i - } - - } else { - val, err := strconv.Atoi(part) - if err != nil || val < minimum || val > maximum { - return 0, errors.New("invalid single value") - } - mask |= 1 << val - } - } - - return mask, nil -} - -func (s *cronSchedule) next(t time.Time) time.Time { - next := t.Add(time.Minute).Truncate(time.Minute) - - end := t.Add(4 * 365 * 24 * time.Hour) // 4 years - - for next.Before(end) { - if s.matches(next) { - return next - } - - next = next.Add(time.Minute) - } - - return t.Add(365 * 24 * time.Hour) -} - -func (s *cronSchedule) matches(t time.Time) bool { - if s.minute&(1< 0 { - count++ - } - } - return count -} - -func (t *ConcurrentExecutionTracker) GetMissingWorkflows(allWorkflowIDs []string) []string { - t.mu.Lock() - defer t.mu.Unlock() - - var missing []string - for _, workflowID := range allWorkflowIDs { - if record, exists := t.executions[workflowID]; !exists || record.ExecutionCount == 0 { - missing = append(missing, workflowID) - } - } - return missing -} - -func (t *ConcurrentExecutionTracker) AnalyzeResults(testCtx *testing.T) ConsistencyResults { - t.mu.Lock() - defer t.mu.Unlock() - - results := ConsistencyResults{} - - for workflowID, record := range t.executions { - if record.ExecutionCount > 0 { - results.WorkflowsExecuted++ - } - - if record.ExecutionCount > 1 { - results.DuplicateExecutions++ - testCtx.Errorf("DUPLICATE EXECUTION: Workflow %s executed %d times by workers %v", - workflowID, record.ExecutionCount, record.WorkerIDs) - } - - if record.Failed { - results.FailedWorkflows++ 
- } - - // Detect race conditions (multiple workers starting execution within 100ms) - if len(record.Timestamps) > 1 { - for i := 1; i < len(record.Timestamps); i++ { - if record.Timestamps[i].Sub(record.Timestamps[i-1]) < 100*time.Millisecond { - results.RaceConditions++ - testCtx.Errorf("RACE CONDITION: Workflow %s had concurrent executions by %v", - workflowID, record.WorkerIDs) - break - } - } - } - } - - return results -} - -// consistencyTestWorkflow tracks executions to detect consistency violations -type consistencyTestWorkflow struct { - engine *Engine - name string - tracker *ConcurrentExecutionTracker -} - -func (w *consistencyTestWorkflow) Name() string { - return w.name -} - -func (w *consistencyTestWorkflow) Run(ctx WorkflowContext, req any) error { - workflowID := ctx.ExecutionID() - - // Record that this workflow started executing - w.tracker.RecordExecution(workflowID, "unknown-worker") // We could get worker ID from context - - // Simulate some work with a step - _, err := Step(ctx, "consistency-step", func(context.Context) (string, error) { - // Small delay to increase chance of race conditions - time.Sleep(10 * time.Millisecond) - return "consistent", nil - }) - - // Record completion - w.tracker.RecordCompletion(workflowID, err == nil) - - return err -} - -func (w *consistencyTestWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// raceConditionTestWorkflow creates multiple steps to test for race conditions -type raceConditionTestWorkflow struct { - engine *Engine - name string -} - -func (w *raceConditionTestWorkflow) Name() string { - return w.name -} - -func (w *raceConditionTestWorkflow) Run(ctx WorkflowContext, req any) error { - // Create multiple steps that might race with each other - const numSteps = 20 - - // Use a WaitGroup to ensure all steps complete - var wg sync.WaitGroup - var stepErrors atomic.Int64 - - for i := 0; i < numSteps; i++ { - wg.Add(1) - 
go func(stepIndex int) { - defer wg.Done() - - stepName := fmt.Sprintf("race-step-%d", stepIndex) - _, err := Step(ctx, stepName, func(context.Context) (string, error) { - return fmt.Sprintf("result-%d", stepIndex), nil - }) - - if err != nil { - stepErrors.Add(1) - } - }(i) - } - - wg.Wait() - - if stepErrors.Load() > 0 { - return fmt.Errorf("race condition test failed: %d step errors", stepErrors.Load()) - } - - return nil -} - -func (w *raceConditionTestWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// transactionTestWorkflow tests database transaction integrity -type transactionTestWorkflow struct { - engine *Engine - name string -} - -func (w *transactionTestWorkflow) Name() string { - return w.name -} - -func (w *transactionTestWorkflow) Run(ctx WorkflowContext, req any) error { - mode, ok := req.(string) - if !ok { - mode = "normal" - } - - // Create a step that tests transaction boundaries - _, err := Step(ctx, "transaction-step", func(stepCtx context.Context) (string, error) { - switch mode { - case "normal": - return "transaction-success", nil - case "error": - return "", fmt.Errorf("simulated step error") - default: - return "unknown-mode", nil - } - }) - - return err -} - -func (w *transactionTestWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/debug_test.go b/go/pkg/hydra/debug_test.go deleted file mode 100644 index 5d4f73df47..0000000000 --- a/go/pkg/hydra/debug_test.go +++ /dev/null @@ -1,74 +0,0 @@ -package hydra - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" -) - -// TestBasicWorkflowExecution tests the most basic workflow execution -func TestBasicWorkflowExecution(t *testing.T) { - realClock := clock.New() - engine := newTestEngineWithClock(t, realClock) - - // Create a very simple 
workflow - simpleWorkflow := &debugWorkflow{ - engine: engine, - name: "debug-workflow", - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Start worker - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: "debug-worker", - Concurrency: 1, - PollInterval: 100 * time.Millisecond, - HeartbeatInterval: 2 * time.Second, - ClaimTimeout: 10 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, simpleWorkflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - // Submit a single workflow - workflowID, err := simpleWorkflow.Start(ctx, "test-payload") - require.NoError(t, err) - require.NotEmpty(t, workflowID) - - // Wait for completion - finalWorkflow := waitForWorkflowCompletion(t, engine, workflowID, 8*time.Second) - require.NotNil(t, finalWorkflow, "Workflow should complete") - -} - -type debugWorkflow struct { - engine *Engine - name string -} - -func (w *debugWorkflow) Name() string { - return w.name -} - -func (w *debugWorkflow) Run(ctx WorkflowContext, req any) error { - // Very simple step - _, err := Step(ctx, "debug-step", func(context.Context) (string, error) { - time.Sleep(50 * time.Millisecond) - return "success", nil - }) - return err -} - -func (w *debugWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/doc.go b/go/pkg/hydra/doc.go deleted file mode 100644 index 2e165266a8..0000000000 --- a/go/pkg/hydra/doc.go +++ /dev/null @@ -1,252 +0,0 @@ -// Package hydra provides a distributed workflow orchestration engine designed -// for reliable execution of multi-step business processes at scale. -// -// Hydra implements the Temporal-style workflow pattern with durable execution, -// automatic retries, checkpointing, and distributed coordination. 
It supports -// both simple sequential workflows and complex long-running processes with -// sleep states, cron scheduling, and step-level fault tolerance. -// -// # Core Concepts -// -// Engine: The central orchestration component that manages workflow lifecycle, -// worker coordination, and persistence. Each engine instance operates within -// a specific namespace for tenant isolation. -// -// Workers: Distributed processing units that poll for pending workflows, -// acquire leases for exclusive execution, and run workflow logic. Workers -// support concurrent execution with configurable limits and automatic -// heartbeat management. -// -// Workflows: Business logic containers that define a series of steps to be -// executed. Workflows are stateless functions that can be suspended, resumed, -// and retried while maintaining exactly-once execution guarantees. -// -// Steps: Individual units of work within a workflow. Steps support automatic -// checkpointing, retry logic, and result caching to ensure idempotent execution -// even across worker failures or restarts. -// -// # Key Features -// -// Exactly-Once Execution: Workflows and steps execute exactly once, even in -// the presence of worker failures, network partitions, or duplicate deliveries. -// -// Durable State: All workflow state is persisted to a database, allowing -// workflows to survive process restarts and infrastructure failures. -// -// Distributed Coordination: Multiple workers can safely operate on the same -// workflow queue using lease-based coordination and circuit breaker protection. -// -// Comprehensive Observability: Built-in Prometheus metrics track workflow -// throughput, latency, error rates, and system health across all components. -// -// Flexible Scheduling: Support for immediate execution, cron-based scheduling, -// and workflow sleep states for time-based coordination. 
-// -// # Basic Usage -// -// Creating an engine and worker: -// -// // Create the engine with database DSN -// engine, err := hydra.NewEngine(hydra.Config{ -// DSN: "user:password@tcp(localhost:3306)/hydra", -// Namespace: "production", -// Logger: logger, -// }) -// if err != nil { -// return err -// } -// -// // Create and configure a worker -// worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{ -// WorkerID: "worker-1", -// Concurrency: 10, -// PollInterval: 100 * time.Millisecond, -// HeartbeatInterval: 30 * time.Second, -// ClaimTimeout: 5 * time.Minute, -// }) -// -// Defining a workflow: -// -// type OrderWorkflow struct { -// engine *hydra.Engine -// } -// -// func (w *OrderWorkflow) Name() string { -// return "order-processing" -// } -// -// func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error { -// // Step 1: Validate payment -// payment, err := hydra.Step(ctx, "validate-payment", func(stepCtx context.Context) (*Payment, error) { -// return validatePayment(stepCtx, req.PaymentID) -// }) -// if err != nil { -// return err -// } -// -// // Step 2: Reserve inventory -// reservation, err := hydra.Step(ctx, "reserve-inventory", func(stepCtx context.Context) (*Reservation, error) { -// return reserveInventory(stepCtx, req.Items) -// }) -// if err != nil { -// return err -// } -// -// // Step 3: Process order -// _, err = hydra.Step(ctx, "process-order", func(stepCtx context.Context) (*Order, error) { -// return processOrder(stepCtx, payment, reservation) -// }) -// -// return err -// } -// -// Starting workflows: -// -// // Register the workflow with the worker -// orderWorkflow := &OrderWorkflow{engine: engine} -// err = hydra.RegisterWorkflow(worker, orderWorkflow) -// if err != nil { -// return err -// } -// -// // Start the worker -// ctx := context.Background() -// err = worker.Start(ctx) -// if err != nil { -// return err -// } -// defer worker.Shutdown(ctx) -// -// // Submit a workflow for execution -// request := 
&OrderRequest{ -// CustomerID: "cust_123", -// Items: []Item{{SKU: "item_456", Quantity: 2}}, -// PaymentID: "pay_789", -// } -// -// executionID, err := engine.StartWorkflow(ctx, "order-processing", request) -// if err != nil { -// return err -// } -// -// fmt.Printf("Started workflow execution: %s\n", executionID) -// -// # Marshalling Options -// -// Hydra supports multiple marshalling formats for workflow payloads and step results: -// -// JSON Marshaller (Default): -// -// engine, err := hydra.NewEngine(hydra.Config{ -// Marshaller: hydra.NewJSONMarshaller(), // Default if not specified -// // ... other config -// }) -// -// # Advanced Features -// -// Sleep States: Workflows can suspend execution and resume after a specified -// duration, allowing for time-based coordination and human approval processes: -// -// // Sleep for 24 hours for manual approval -// return hydra.Sleep(ctx, 24*time.Hour) -// -// Cron Scheduling: Register workflows to run on a schedule: -// -// err = engine.RegisterCron("0 0 * * *", "daily-report", func(ctx context.Context) error { -// // Generate daily report -// return generateDailyReport(ctx) -// }) -// -// Error Handling and Retries: Configure retry behavior at the workflow level: -// -// executionID, err := engine.StartWorkflow(ctx, "order-processing", request, -// hydra.WithMaxAttempts(5), -// hydra.WithRetryBackoff(2*time.Second), -// hydra.WithTimeout(10*time.Minute), -// ) -// -// # Observability -// -// Hydra provides comprehensive Prometheus metrics out of the box: -// -// Workflow Metrics: -// - hydra_workflows_started_total: Total workflows started -// - hydra_workflows_completed_total: Total workflows completed/failed -// - hydra_workflow_duration_seconds: Workflow execution time -// - hydra_workflow_queue_time_seconds: Time spent waiting for execution -// - hydra_workflows_active: Currently running workflows per worker -// -// Step Metrics: -// - hydra_steps_executed_total: Total steps executed with status -// - 
hydra_step_duration_seconds: Individual step execution time -// - hydra_steps_cached_total: Steps served from checkpoint cache -// - hydra_steps_retried_total: Step retry attempts -// -// Worker Metrics: -// - hydra_worker_polls_total: Worker polling operations -// - hydra_worker_heartbeats_total: Worker heartbeat operations -// - hydra_lease_acquisitions_total: Workflow lease acquisitions -// - hydra_worker_concurrency_current: Current workflow concurrency per worker -// -// Error and Performance Metrics: -// - hydra_errors_total: Categorized error counts -// - hydra_payload_size_bytes: Workflow and step payload sizes -// - hydra_db_operations_total: Database operation counts and latency -// -// All metrics include rich labels for namespace, workflow names, worker IDs, -// and status information, enabling detailed monitoring and alerting. -// -// # Architecture -// -// Hydra uses a lease-based coordination model to ensure exactly-once execution: -// -// 1. Workers poll the database for pending workflows in their namespace -// 2. Workers attempt to acquire exclusive leases on available workflows -// 3. Successful lease holders execute the workflow logic -// 4. Workers send periodic heartbeats to maintain lease ownership -// 5. Completed workflows update their status and release the lease -// 6. Failed workers automatically lose their leases after timeout -// -// This design provides fault tolerance without requiring complex consensus -// protocols or external coordination services. -// -// # Database Schema -// -// Hydra requires the following database tables: -// -// - workflow_executions: Stores workflow state, status, and metadata -// - workflow_steps: Tracks individual step execution and results -// - leases: Manages worker coordination and exclusive access -// - cron_jobs: Stores scheduled workflow definitions -// -// The schema should be created using the provided schema.sql file. 
-// -// # Error Handling -// -// Hydra distinguishes between different types of errors: -// -// Transient Errors: Network timeouts, temporary database failures, etc. -// These trigger automatic retries based on the configured retry policy. -// -// Permanent Errors: Validation failures, business logic errors, etc. -// These immediately fail the workflow without retries. -// -// Workflow Suspension: Controlled suspension using Sleep() for time-based -// coordination or external event waiting. -// -// # Performance Considerations -// -// - Workers use circuit breakers to prevent cascading failures -// - Database queries are optimized with appropriate indexes -// - Lease timeouts prevent stuck workflows from blocking execution -// - Configurable concurrency limits prevent resource exhaustion -// - Built-in connection pooling and retry logic for database operations -// -// # Thread Safety -// -// All Hydra components are thread-safe and designed for concurrent access: -// - Multiple workers can safely operate on the same workflow queue -// - Step execution is atomic and isolated using database transactions -// - Workflow state updates use optimistic locking to prevent race conditions -// - Metrics collection is thread-safe and non-blocking -package hydra diff --git a/go/pkg/hydra/engine.go b/go/pkg/hydra/engine.go deleted file mode 100644 index 5130ec5e51..0000000000 --- a/go/pkg/hydra/engine.go +++ /dev/null @@ -1,318 +0,0 @@ -package hydra - -import ( - "context" - "database/sql" - "fmt" - "time" - - "github.com/unkeyed/unkey/go/pkg/assert" - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/hydra/metrics" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/otel/logging" - "github.com/unkeyed/unkey/go/pkg/otel/tracing" - "github.com/unkeyed/unkey/go/pkg/retry" - "github.com/unkeyed/unkey/go/pkg/uid" - "go.opentelemetry.io/otel/attribute" - - // MySQL driver - _ "github.com/go-sql-driver/mysql" -) - -// Config 
holds the configuration for creating a new Engine instance. -// -// All fields except Store are optional and will use sensible defaults -// if not provided. -type Config struct { - // DSN is the database connection string for MySQL. - // This field is required and cannot be empty. - // The engine will create an SQLC store from this connection. - DSN string - - // Namespace provides tenant isolation for workflows. All workflows - // created by this engine will be scoped to this namespace. - // Defaults to "default" if not specified. - Namespace string - - // Clock provides time-related operations for testing and scheduling. - // Defaults to a real clock implementation if not specified. - Clock clock.Clock - - // Logger handles structured logging for the engine operations. - // Defaults to a no-op logger if not specified. - Logger logging.Logger - - // Marshaller handles serialization of workflow payloads and step results. - // Defaults to JSON marshalling if not specified. - Marshaller Marshaller -} - -// NewConfig creates a default config with sensible defaults. -// -// The returned config uses: -// - "default" namespace -// - Real clock implementation -// - All other fields will be set to their defaults when passed to New() -func NewConfig() Config { - return Config{ - DSN: "", - Namespace: "default", - Clock: clock.New(), - Logger: nil, - Marshaller: nil, - } -} - -// Engine is the core workflow orchestration engine that manages workflow -// lifecycle, coordination, and execution. -// -// The engine is responsible for: -// - Starting new workflows and managing their state -// - Coordinating workflow execution across multiple workers -// - Handling cron-based scheduled workflows -// - Providing namespace isolation for multi-tenant deployments -// - Recording metrics and managing observability -// -// Engine instances are thread-safe and can be shared across multiple -// workers and goroutines. 
-type Engine struct { - db *sql.DB - namespace string - cronHandlers map[string]CronHandler - clock clock.Clock - logger logging.Logger - marshaller Marshaller -} - -// New creates a new Engine instance with the provided configuration. -// -// The engine will validate the configuration and apply defaults for -// any missing optional fields. The Store field is required and the -// function will panic if it is nil. -// -// Example: -// -// engine := hydra.New(hydra.Config{ -// Store: gormStore, -// Namespace: "production", -// Logger: logger, -// }) -func New(config Config) (*Engine, error) { - - err := assert.All( - assert.NotEmpty(config.DSN), - assert.NotNil(config.Clock), - assert.NotEmpty(config.Namespace), - assert.NotNil(config.Logger), - assert.NotNil(config.Marshaller), - ) - if err != nil { - return nil, err - } - - var db *sql.DB - err = retry.New( - retry.Attempts(10), - retry.Backoff(func(n int) time.Duration { - return time.Duration(n) * time.Second - }), - ).Do(func() error { - var openErr error - db, openErr = sql.Open("mysql", config.DSN) - if openErr != nil { - config.Logger.Info("mysql not ready yet, retrying...", "error", openErr.Error()) - } - return openErr - - }) - - if err != nil { - return nil, fmt.Errorf("hydra: failed to open database connection: %w", err) - } - - err = retry.New( - retry.Attempts(10), - retry.Backoff(func(n int) time.Duration { - return time.Duration(n) * time.Second - }), - ).Do(func() error { - return db.Ping() - }) - // Test the connection - if err != nil { - db.Close() - return nil, fmt.Errorf("hydra: failed to ping database: %v", err) - } - - return &Engine{ - db: db, - namespace: config.Namespace, - cronHandlers: make(map[string]CronHandler), - clock: config.Clock, - logger: config.Logger, - marshaller: config.Marshaller, - }, nil -} - -// GetNamespace returns the namespace for this engine instance. 
-// -// This method is primarily used by workers and internal components -// to scope database operations to the correct tenant namespace. -func (e *Engine) GetNamespace() string { - return e.namespace -} - -// GetDB returns the database connection for direct query usage -func (e *Engine) GetDB() *sql.DB { - return e.db -} - -// RegisterCron registers a cron job with the given schedule and handler. -// -// The cronSpec follows standard cron syntax (e.g., "0 0 * * *" for daily at midnight). -// The name must be unique within this engine's namespace. The handler will be -// called according to the schedule. -// -// Example: -// -// err := engine.RegisterCron("0 */6 * * *", "cleanup-task", func(ctx context.Context) error { -// return performCleanup(ctx) -// }) -// -// Returns an error if a cron job with the same name is already registered. -func (e *Engine) RegisterCron(cronSpec, name string, handler CronHandler) error { - if _, exists := e.cronHandlers[name]; exists { - return fmt.Errorf("cron %q is already registered", name) - } - - e.cronHandlers[name] = handler - - // Use new Query pattern instead of store abstraction - return store.Query.CreateCronJob(context.Background(), e.db, store.CreateCronJobParams{ - ID: uid.New(uid.CronJobPrefix), - Name: name, - CronSpec: cronSpec, - Namespace: e.namespace, - WorkflowName: sql.NullString{String: "", Valid: false}, // Empty since this uses a handler, not a workflow - Enabled: true, - CreatedAt: e.clock.Now().UnixMilli(), - UpdatedAt: e.clock.Now().UnixMilli(), - LastRunAt: sql.NullInt64{Int64: 0, Valid: false}, - NextRunAt: calculateNextRun(cronSpec, e.clock.Now()), - }) -} - -// StartWorkflow starts a new workflow execution with the given name and payload. -// -// This method creates a new workflow execution record in the database and makes -// it available for workers to pick up and execute. The workflow will be queued -// in a pending state until a worker acquires a lease and begins execution. 
-// -// Parameters: -// - ctx: Context for the operation, which may include cancellation and timeouts -// - workflowName: Must match the Name() method of a registered workflow type -// - payload: The input data for the workflow, which will be serialized and stored -// - opts: Optional configuration for retry behavior, timeouts, and trigger metadata -// -// Returns: -// - executionID: A unique identifier for this workflow execution -// - error: Any error that occurred during workflow creation -// -// The payload will be marshalled using the engine's configured marshaller (JSON by default) -// and must be serializable. The workflow will be executed with the configured retry -// policy and timeout settings. -// -// Example: -// -// executionID, err := engine.StartWorkflow(ctx, "order-processing", &OrderRequest{ -// CustomerID: "cust_123", -// Items: []Item{{SKU: "item_456", Quantity: 2}}, -// }, hydra.WithMaxAttempts(5), hydra.WithTimeout(30*time.Minute)) -// -// Metrics recorded: -// - hydra_workflows_started_total (counter) -// - hydra_workflows_queued (gauge) -// - hydra_payload_size_bytes (histogram) -func (e *Engine) StartWorkflow(ctx context.Context, workflowName string, payload any, opts ...WorkflowOption) (string, error) { - // Start tracing span for workflow creation - ctx, span := tracing.Start(ctx, "hydra.engine.StartWorkflow") - defer span.End() - - executionID := uid.New(uid.WorkflowPrefix) - - span.SetAttributes( - attribute.String("hydra.workflow.name", workflowName), - attribute.String("hydra.execution.id", executionID), - attribute.String("hydra.namespace", e.namespace), - ) - - config := &WorkflowConfig{ - MaxAttempts: 3, // Default to 3 attempts total (1 initial + 2 retries) - TimeoutDuration: 1 * time.Hour, - RetryBackoff: 1 * time.Second, - TriggerType: store.WorkflowExecutionsTriggerTypeApi, // Default trigger type - TriggerSource: nil, - } - for _, opt := range opts { - opt(config) - } - - span.SetAttributes( - 
attribute.String("hydra.trigger.type", string(config.TriggerType)), - ) - - data, err := e.marshaller.Marshal(payload) - if err != nil { - metrics.SerializationErrorsTotal.WithLabelValues(e.namespace, workflowName, "input").Inc() - tracing.RecordError(span, err) - return "", fmt.Errorf("failed to marshal payload: %w", err) - } - - // Record payload size - metrics.RecordPayloadSize(e.namespace, workflowName, "input", len(data)) - - // Extract trace ID and span ID from span context for workflow correlation - traceID := "" - spanID := "" - if spanContext := span.SpanContext(); spanContext.IsValid() { - traceID = spanContext.TraceID().String() - spanID = spanContext.SpanID().String() - } - - // Use new Query pattern instead of store abstraction - err = store.Query.CreateWorkflow(ctx, e.db, store.CreateWorkflowParams{ - ID: executionID, - WorkflowName: workflowName, - Status: store.WorkflowExecutionsStatusPending, - InputData: data, - OutputData: []byte{}, - ErrorMessage: sql.NullString{String: "", Valid: false}, - CreatedAt: e.clock.Now().UnixMilli(), - StartedAt: sql.NullInt64{Int64: 0, Valid: false}, - CompletedAt: sql.NullInt64{Int64: 0, Valid: false}, - MaxAttempts: config.MaxAttempts, - RemainingAttempts: config.MaxAttempts, // Start with full attempts available - NextRetryAt: sql.NullInt64{Int64: 0, Valid: false}, - Namespace: e.namespace, - TriggerType: store.NullWorkflowExecutionsTriggerType{WorkflowExecutionsTriggerType: store.WorkflowExecutionsTriggerTypeApi, Valid: false}, // Trigger type conversion not implemented - TriggerSource: sql.NullString{String: "", Valid: false}, - SleepUntil: sql.NullInt64{Int64: 0, Valid: false}, - TraceID: sql.NullString{String: traceID, Valid: traceID != ""}, - SpanID: sql.NullString{String: spanID, Valid: spanID != ""}, - }) - if err != nil { - metrics.RecordError(e.namespace, "engine", "workflow_creation_failed") - tracing.RecordError(span, err) - return "", fmt.Errorf("failed to create workflow: %w", err) - } - - // Record 
workflow started - triggerTypeStr := string(config.TriggerType) - metrics.WorkflowsStartedTotal.WithLabelValues(e.namespace, workflowName, triggerTypeStr).Inc() - metrics.WorkflowsQueued.WithLabelValues(e.namespace, "pending").Inc() - - span.SetAttributes(attribute.String("hydra.workflow.status", "created")) - - return executionID, nil -} diff --git a/go/pkg/hydra/engine_test.go b/go/pkg/hydra/engine_test.go deleted file mode 100644 index d898e53dc9..0000000000 --- a/go/pkg/hydra/engine_test.go +++ /dev/null @@ -1,164 +0,0 @@ -package hydra - -import ( - "context" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/hydra/testharness" -) - -// Test workflow that counts executions and emits events -type CountingWorkflow struct { - counter *int64 - events *testharness.EventCollector -} - -func (w *CountingWorkflow) Name() string { - return "counting-workflow" -} - -func (w *CountingWorkflow) Run(ctx WorkflowContext, req struct{}) error { - w.events.Emit(ctx, testharness.WorkflowStarted, "Starting counting workflow") - - _, err := Step(ctx, "increment", func(stepCtx context.Context) (string, error) { - w.events.Emit(ctx, testharness.StepExecuting, "Executing increment step", "step_name", "increment") - - // This should only execute exactly once - atomic.AddInt64(w.counter, 1) - - w.events.Emit(ctx, testharness.StepExecuted, "Completed increment step", "step_name", "increment", "result", "incremented") - - return "incremented", nil - }) - - if err != nil { - w.events.Emit(ctx, testharness.WorkflowFailed, "Workflow failed", "error", err.Error()) - } else { - w.events.Emit(ctx, testharness.WorkflowCompleted, "Workflow completed successfully") - } - - return err -} - -// CRITICAL CORRECTNESS TESTS - -func TestBasicWorkflowRegistration(t *testing.T) { - // Given: An engine instance and 
workflow - e := newTestEngine(t) - events := testharness.NewEventCollector() - workflow := &CountingWorkflow{ - counter: new(int64), - events: events, - } - - // When: Creating worker and registering workflow - worker, err := NewWorker(e, WorkerConfig{ - Concurrency: 1, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, workflow) - require.NoError(t, err) - - err = worker.Start(context.Background()) - - // Then: Worker should start without error - require.NoError(t, err) - require.NotNil(t, worker) - defer worker.Shutdown(context.Background()) - - // And: We should be able to start a workflow - executionID, err := e.StartWorkflow(context.Background(), workflow.Name(), struct{}{}) - require.NoError(t, err) - require.NotEmpty(t, executionID) -} - -func TestStepExecutesExactlyOnce(t *testing.T) { - // Given: A workflow with a step that increments a counter and emits events - testClock := clock.NewTestClock() - e := newTestEngineWithClock(t, testClock) - events := testharness.NewEventCollector() - counter := int64(0) - workflow := &CountingWorkflow{ - counter: &counter, - events: events, - } - - // When: Creating worker, registering workflow, and starting - worker, err := NewWorker(e, WorkerConfig{ - Concurrency: 1, - PollInterval: 100 * time.Millisecond, // Fast polling for test - }) - require.NoError(t, err) - defer worker.Shutdown(context.Background()) - - err = RegisterWorkflow(worker, workflow) - require.NoError(t, err) - - err = worker.Start(context.Background()) - require.NoError(t, err) - - // Give worker time to start polling - time.Sleep(50 * time.Millisecond) - - // Start workflow execution - executionID, err := e.StartWorkflow(context.Background(), workflow.Name(), struct{}{}) - require.NoError(t, err) - require.NotEmpty(t, executionID) - - // Trigger worker polling with test clock - for i := 0; i < 10; i++ { - testClock.Tick(200 * time.Millisecond) - time.Sleep(10 * time.Millisecond) - - // Check if workflow has been picked up - 
currentStatus, err := store.Query.GetWorkflow(context.Background(), e.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: e.GetNamespace(), - }) - require.NoError(t, err) - if currentStatus.Status != store.WorkflowExecutionsStatusPending { - break - } - } - - // Wait for completion - completedWorkflow := waitForWorkflowCompletion(t, e, executionID, 3*time.Second) - require.NotNil(t, completedWorkflow) - - // Then: Assert using both counter and events - finalCount := atomic.LoadInt64(&counter) - - // Check events for detailed analysis - stepExecutions := events.FilterWithData(testharness.StepExecuting, "step_name", "increment") - stepCompletions := events.FilterWithData(testharness.StepExecuted, "step_name", "increment") - workflowCompletions := events.Filter(testharness.WorkflowCompleted) - - // The critical assertion: step should execute exactly once - assert.Equal(t, int64(1), finalCount, "Counter should be incremented exactly once") - assert.Equal(t, 1, len(stepExecutions), "Step should be executed exactly once") - assert.Equal(t, 1, len(stepCompletions), "Step should complete exactly once") - assert.Equal(t, 1, len(workflowCompletions), "Workflow should complete exactly once") -} - -func TestStepCheckpointingPreventsReExecution(t *testing.T) { - t.Skip("TODO: Implement checkpointing test") -} - -func TestWorkflowTerminatesEventually(t *testing.T) { - t.Skip("TODO: Implement retry limit testing") -} - -func TestWorkerCrashRecovery(t *testing.T) { - t.Skip("TODO: Implement worker crash recovery testing") -} - -func TestNoDuplicateStepExecution(t *testing.T) { - t.Skip("TODO: Implement concurrency safety testing") -} diff --git a/go/pkg/hydra/marshaller.go b/go/pkg/hydra/marshaller.go deleted file mode 100644 index 38eeb94c6e..0000000000 --- a/go/pkg/hydra/marshaller.go +++ /dev/null @@ -1,58 +0,0 @@ -package hydra - -import ( - "encoding/json" -) - -// Marshaller defines the interface for serializing workflow payloads and step results. 
-// -// The marshaller is responsible for converting Go values to and from byte arrays -// for storage in the database. Custom marshallers can be implemented to support -// different serialization formats like Protocol Buffers, MessagePack, or custom -// binary formats. -// -// Implementations must ensure that: -// - Marshal and Unmarshal are inverse operations -// - The same input always produces the same output (deterministic) -// - All workflow payload types are supported -// - Error handling is consistent and informative -type Marshaller interface { - // Marshal converts a Go value to bytes for storage. - // The value may be any type used in workflow payloads or step results. - Marshal(v any) ([]byte, error) - - // Unmarshal converts stored bytes back to a Go value. - // The target value should be a pointer to the desired type. - Unmarshal(data []byte, v any) error -} - -// JSONMarshaller implements Marshaller using standard Go JSON encoding. -// -// This is the default marshaller used by Hydra engines. It provides -// good compatibility with most Go types and is human-readable for -// debugging purposes. -// -// Limitations: -// - Cannot handle circular references -// - Maps with non-string keys are not supported -// - Precision may be lost for large integers -// - Custom types need JSON tags for proper serialization -type JSONMarshaller struct{} - -// NewJSONMarshaller creates a new JSON-based marshaller. -// -// This is the default marshaller used when no custom marshaller -// is provided to the engine configuration. -func NewJSONMarshaller() Marshaller { - return &JSONMarshaller{} -} - -// Marshal implements Marshaller.Marshal using encoding/json. -func (j *JSONMarshaller) Marshal(v any) ([]byte, error) { - return json.Marshal(v) -} - -// Unmarshal implements Marshaller.Unmarshal using encoding/json. 
-func (j *JSONMarshaller) Unmarshal(data []byte, v any) error { - return json.Unmarshal(data, v) -} diff --git a/go/pkg/hydra/marshaller_test.go b/go/pkg/hydra/marshaller_test.go deleted file mode 100644 index 5d5a621c28..0000000000 --- a/go/pkg/hydra/marshaller_test.go +++ /dev/null @@ -1,64 +0,0 @@ -package hydra - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestJSONMarshaller(t *testing.T) { - marshaller := NewJSONMarshaller() - - t.Run("MarshalUnmarshalStruct", func(t *testing.T) { - type TestStruct struct { - Name string `json:"name"` - Value int `json:"value"` - } - - original := TestStruct{Name: "test", Value: 42} - - // Marshal - data, err := marshaller.Marshal(original) - require.NoError(t, err) - assert.Contains(t, string(data), "test") - assert.Contains(t, string(data), "42") - - // Unmarshal - var result TestStruct - err = marshaller.Unmarshal(data, &result) - require.NoError(t, err) - assert.Equal(t, original, result) - }) - - t.Run("MarshalUnmarshalPrimitive", func(t *testing.T) { - original := "hello world" - - data, err := marshaller.Marshal(original) - require.NoError(t, err) - - var result string - err = marshaller.Unmarshal(data, &result) - require.NoError(t, err) - assert.Equal(t, original, result) - }) - - t.Run("MarshalUnmarshalMap", func(t *testing.T) { - original := map[string]interface{}{ - "key1": "value1", - "key2": 123, - "key3": true, - } - - data, err := marshaller.Marshal(original) - require.NoError(t, err) - - var result map[string]interface{} - err = marshaller.Unmarshal(data, &result) - require.NoError(t, err) - assert.Equal(t, "value1", result["key1"]) - // Note: JSON unmarshaling converts numbers to float64 - assert.Equal(t, float64(123), result["key2"]) - assert.Equal(t, true, result["key3"]) - }) -} diff --git a/go/pkg/hydra/metrics/example_usage.go b/go/pkg/hydra/metrics/example_usage.go deleted file mode 100644 index 775b7b6594..0000000000 --- 
a/go/pkg/hydra/metrics/example_usage.go +++ /dev/null @@ -1,89 +0,0 @@ -package metrics - -import ( - "time" -) - -const exampleNamespace = "production" - -func ExampleWorkflowMetrics() { - namespace := exampleNamespace - workflowName := "user-onboarding" - - WorkflowsStartedTotal.WithLabelValues(namespace, workflowName, "manual").Inc() - - WorkflowsQueued.WithLabelValues(namespace, "pending").Set(42) - WorkflowsActive.WithLabelValues(namespace, "worker-1").Set(5) - - start := time.Now() - ObserveWorkflowDuration(namespace, workflowName, "completed", start) - WorkflowsCompletedTotal.WithLabelValues(namespace, workflowName, "completed").Inc() -} - -func ExampleStepMetrics() { - namespace := exampleNamespace - workflowName := "order-processing" - stepName := "charge-payment" - - start := time.Now() - ObserveStepDuration(namespace, workflowName, stepName, "completed", start) - StepsExecutedTotal.WithLabelValues(namespace, workflowName, stepName, "completed").Inc() - - StepsCachedTotal.WithLabelValues(namespace, workflowName, stepName).Inc() -} - -func ExampleDatabaseMetrics() { - start := time.Now() - ObserveDbOperation("select", "workflow_executions", "success", start) - - DbConnectionsActive.WithLabelValues("worker-1").Set(15) -} - -func ExampleSleepMetrics() { - namespace := exampleNamespace - workflowName := "user-onboarding" - - SleepsStartedTotal.WithLabelValues(namespace, workflowName).Inc() - SleepsResumedTotal.WithLabelValues(namespace, workflowName).Inc() - - actualSleepDuration := 25 * time.Minute // actual time slept - SleepDurationSeconds.WithLabelValues(namespace, workflowName).Observe(actualSleepDuration.Seconds()) - - CronTriggersTotal.WithLabelValues(namespace, "daily-report", "success").Inc() -} - -func ExampleErrorMetrics() { - namespace := exampleNamespace - - RecordError(namespace, "step", "timeout") - RecordError(namespace, "client", "serialization") - RecordError(namespace, "store", "connection") - - PanicsTotal.WithLabelValues("worker-1", 
"step_execution").Inc() - - TimeoutsTotal.WithLabelValues(namespace, "workflow_execution").Inc() -} - -func ExamplePayloadMetrics() { - namespace := exampleNamespace - workflowName := "image-processing" - - inputSize := 1024 * 50 // 50KB input - outputSize := 1024 * 5 // 5KB output - - RecordPayloadSize(namespace, workflowName, "input", inputSize) - RecordPayloadSize(namespace, workflowName, "output", outputSize) - - SerializationErrorsTotal.WithLabelValues(namespace, workflowName, "input").Inc() -} - -func ExampleWorkerMetrics() { - workerID := "worker-1" - namespace := exampleNamespace - - WorkerHeartbeatsTotal.WithLabelValues(workerID, namespace, "success").Inc() - WorkerPollsTotal.WithLabelValues(workerID, namespace, "found_work").Inc() - LeaseAcquisitionsTotal.WithLabelValues(workerID, "workflow", "success").Inc() - - WorkerConcurrencyCurrent.WithLabelValues(workerID, namespace).Set(8) -} diff --git a/go/pkg/hydra/metrics/metrics.go b/go/pkg/hydra/metrics/metrics.go deleted file mode 100644 index f90aabd6d0..0000000000 --- a/go/pkg/hydra/metrics/metrics.go +++ /dev/null @@ -1,351 +0,0 @@ -package metrics - -import ( - "os" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/unkeyed/unkey/go/pkg/version" -) - -var constLabels = prometheus.Labels{ - "region": os.Getenv("UNKEY_REGION"), - "version": version.Version, -} - -var workflowLatencyBuckets = []float64{ - 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, - 10.0, 30.0, 60.0, 120.0, 300.0, 600.0, -} - -var stepLatencyBuckets = []float64{ - 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, - 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, -} - -var dbLatencyBuckets = []float64{ - 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, - 0.2, 0.5, 1.0, 2.0, 5.0, -} - -var WorkflowsStartedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "workflows_started_total", - Help: "Total number of workflows started", - 
ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name", "trigger_type"}, -) - -var WorkflowsCompletedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "workflows_completed_total", - Help: "Total number of workflows completed", - ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name", "status"}, -) - -var WorkflowsRetriedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "workflows_retried_total", - Help: "Total number of workflow retry attempts", - ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name", "attempt"}, -) - -var WorkflowDurationSeconds = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: "hydra", - Name: "workflow_duration_seconds", - Help: "Time taken to complete workflows", - ConstLabels: constLabels, - Buckets: workflowLatencyBuckets, - }, - []string{"namespace", "workflow_name", "status"}, -) - -var WorkflowQueueTimeSeconds = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: "hydra", - Name: "workflow_queue_time_seconds", - Help: "Time workflow spent queued before execution", - ConstLabels: constLabels, - Buckets: workflowLatencyBuckets, - }, - []string{"namespace", "workflow_name"}, -) - -var WorkflowsActive = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: "hydra", - Name: "workflows_active", - Help: "Currently running workflows", - ConstLabels: constLabels, - }, - []string{"namespace", "worker_id"}, -) - -var WorkflowsQueued = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: "hydra", - Name: "workflows_queued", - Help: "Workflows waiting to be processed", - ConstLabels: constLabels, - }, - []string{"namespace", "status"}, -) - -var StepsExecutedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "steps_executed_total", - Help: "Total number of workflow steps executed", - ConstLabels: constLabels, - }, - []string{"namespace", 
"workflow_name", "step_name", "status"}, -) - -var StepsRetriedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "steps_retried_total", - Help: "Total number of step retry attempts", - ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name", "step_name"}, -) - -var StepsCachedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "steps_cached_total", - Help: "Steps skipped due to checkpointing", - ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name", "step_name"}, -) - -var StepDurationSeconds = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: "hydra", - Name: "step_duration_seconds", - Help: "Time taken to execute workflow steps", - ConstLabels: constLabels, - Buckets: stepLatencyBuckets, - }, - []string{"namespace", "workflow_name", "step_name", "status"}, -) - -var WorkerHeartbeatsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "worker_heartbeats_total", - Help: "Total number of worker heartbeat operations", - ConstLabels: constLabels, - }, - []string{"worker_id", "namespace", "status"}, -) - -var LeaseAcquisitionsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "lease_acquisitions_total", - Help: "Total number of lease acquisition attempts", - ConstLabels: constLabels, - }, - []string{"worker_id", "resource_type", "status"}, -) - -var WorkerPollsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "worker_polls_total", - Help: "Total number of worker polling operations", - ConstLabels: constLabels, - }, - []string{"worker_id", "namespace", "status"}, -) - -var WorkerConcurrencyCurrent = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: "hydra", - Name: "worker_concurrency_current", - Help: "Current workflow concurrency per worker", - ConstLabels: constLabels, - }, - []string{"worker_id", "namespace"}, -) - -var 
DbOperationsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "db_operations_total", - Help: "Total number of database operations", - ConstLabels: constLabels, - }, - []string{"operation", "table", "status"}, -) - -var DbOperationDurationSeconds = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: "hydra", - Name: "db_operation_duration_seconds", - Help: "Time taken for database operations", - ConstLabels: constLabels, - Buckets: dbLatencyBuckets, - }, - []string{"operation", "table", "status"}, -) - -var DbConnectionsActive = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: "hydra", - Name: "db_connections_active", - Help: "Active database connections", - ConstLabels: constLabels, - }, - []string{"worker_id"}, -) - -var SleepsStartedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "sleeps_started_total", - Help: "Total number of sleep operations initiated", - ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name"}, -) - -var SleepsResumedTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "sleeps_resumed_total", - Help: "Total number of workflows resumed from sleep", - ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name"}, -) - -var CronTriggersTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "cron_triggers_total", - Help: "Total number of cron-triggered workflows", - ConstLabels: constLabels, - }, - []string{"namespace", "cron_name", "status"}, -) - -var SleepDurationSeconds = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: "hydra", - Name: "sleep_duration_seconds", - Help: "Actual sleep durations", - ConstLabels: constLabels, - Buckets: workflowLatencyBuckets, - }, - []string{"namespace", "workflow_name"}, -) - -var CronExecutionLatencySeconds = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: "hydra", - Name: 
"cron_execution_latency_seconds", - Help: "Delay between scheduled and actual cron execution", - ConstLabels: constLabels, - Buckets: []float64{0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0, 300.0}, - }, - []string{"namespace", "cron_name"}, -) - -var WorkflowsSleeping = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: "hydra", - Name: "workflows_sleeping", - Help: "Currently sleeping workflows", - ConstLabels: constLabels, - }, - []string{"namespace"}, -) - -var ErrorsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "errors_total", - Help: "Total number of errors across all components", - ConstLabels: constLabels, - }, - []string{"namespace", "component", "error_type"}, -) - -var PanicsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "panics_total", - Help: "Total number of panic recoveries", - ConstLabels: constLabels, - }, - []string{"worker_id", "component"}, -) - -var TimeoutsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "timeouts_total", - Help: "Total number of operation timeouts", - ConstLabels: constLabels, - }, - []string{"namespace", "operation_type"}, -) - -var PayloadSizeBytes = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: "hydra", - Name: "payload_size_bytes", - Help: "Size of workflow and step payloads", - ConstLabels: constLabels, - Buckets: []float64{100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000}, - }, - []string{"namespace", "workflow_name", "direction"}, -) - -var SerializationErrorsTotal = promauto.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "hydra", - Name: "serialization_errors_total", - Help: "Total number of payload serialization errors", - ConstLabels: constLabels, - }, - []string{"namespace", "workflow_name", "direction"}, -) - -func ObserveWorkflowDuration(namespace, workflowName, status string, start time.Time) { - duration := time.Since(start) - 
WorkflowDurationSeconds.WithLabelValues(namespace, workflowName, status).Observe(duration.Seconds()) -} - -func ObserveStepDuration(namespace, workflowName, stepName, status string, start time.Time) { - duration := time.Since(start) - StepDurationSeconds.WithLabelValues(namespace, workflowName, stepName, status).Observe(duration.Seconds()) -} - -func ObserveDbOperation(operation, table, status string, start time.Time) { - duration := time.Since(start) - DbOperationsTotal.WithLabelValues(operation, table, status).Inc() - DbOperationDurationSeconds.WithLabelValues(operation, table, status).Observe(duration.Seconds()) -} - -func RecordError(namespace, component, errorType string) { - ErrorsTotal.WithLabelValues(namespace, component, errorType).Inc() -} - -func RecordPayloadSize(namespace, workflowName, direction string, size int) { - PayloadSizeBytes.WithLabelValues(namespace, workflowName, direction).Observe(float64(size)) -} diff --git a/go/pkg/hydra/simple_consistency_test.go b/go/pkg/hydra/simple_consistency_test.go deleted file mode 100644 index e49778bb56..0000000000 --- a/go/pkg/hydra/simple_consistency_test.go +++ /dev/null @@ -1,270 +0,0 @@ -package hydra - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/hydra/testharness" -) - -// TestSimpleDataConsistency tests basic data consistency using event collection -func TestSimpleDataConsistency(t *testing.T) { - realClock := clock.New() - engine := newTestEngineWithClock(t, realClock) - - const numWorkflows = 10 - - // Create event collector - eventCollector := testharness.NewEventCollector() - - // Create event-aware workflow - workflow := &eventTrackingWorkflow{ - engine: engine, - name: "simple-consistency-workflow", - collector: eventCollector, - } - - ctx, cancel := context.WithTimeout(context.Background(), 12*time.Second) - defer cancel() - 
- // Start a single worker - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: "simple-consistency-worker", - Concurrency: 2, - PollInterval: 100 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, - ClaimTimeout: 5 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, workflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - // Submit workflows - workflowIDs := make([]string, numWorkflows) - for i := 0; i < numWorkflows; i++ { - workflowID, startErr := workflow.Start(ctx, fmt.Sprintf("payload-%d", i)) - require.NoError(t, startErr) - workflowIDs[i] = workflowID - } - - // Wait for all workflows to finish using event collection - require.Eventually(t, func() bool { - completedEvents := eventCollector.Count(testharness.WorkflowCompleted) - failedEvents := eventCollector.Count(testharness.WorkflowFailed) - totalFinished := completedEvents + failedEvents - - return totalFinished == numWorkflows - }, 10*time.Second, 200*time.Millisecond, "All workflows should finish") - - // Verify exactly-once execution using events - for _, workflowID := range workflowIDs { - // Verify exactly one started event - startedEvents := eventCollector.FilterWithData(testharness.WorkflowStarted, "execution_id", workflowID) - require.Len(t, startedEvents, 1, "Workflow %s should start exactly once", workflowID) - - // Verify exactly one completion event - completedEvents := eventCollector.FilterWithData(testharness.WorkflowCompleted, "execution_id", workflowID) - failedEvents := eventCollector.FilterWithData(testharness.WorkflowFailed, "execution_id", workflowID) - totalCompletions := len(completedEvents) + len(failedEvents) - require.Equal(t, 1, totalCompletions, "Workflow %s should complete exactly once", workflowID) - - // Verify exactly one step execution - stepExecutingEvents := eventCollector.FilterWithData(testharness.StepExecuting, "execution_id", workflowID) - require.Len(t, 
stepExecutingEvents, 1, "Workflow %s should have exactly one step execution", workflowID) - } - - // Verify database consistency - // GetAllWorkflows was removed - check completed workflows indirectly - // Since we know we created specific workflows, verify them individually - completedInDB := 0 - for _, id := range workflowIDs { - wf, err := store.Query.GetWorkflow(context.Background(), engine.GetDB(), store.GetWorkflowParams{ - ID: id, - Namespace: engine.GetNamespace(), - }) - if err == nil && wf.Status == store.WorkflowExecutionsStatusCompleted { - completedInDB++ - } - } - - completedEventsCount := eventCollector.Count(testharness.WorkflowCompleted) - require.Equal(t, completedEventsCount, completedInDB, - "Database completed count should match completed events") - -} - -// TestConcurrentWorkerConsistency tests consistency with multiple workers using events -func TestConcurrentWorkerConsistency(t *testing.T) { - realClock := clock.New() - engine := newTestEngineWithClock(t, realClock) - - const ( - numWorkers = 3 - numWorkflows = 15 - ) - - // Create event collector - eventCollector := testharness.NewEventCollector() - - // Create event-aware workflow - workflow := &eventTrackingWorkflow{ - engine: engine, - name: "concurrent-consistency-workflow", - collector: eventCollector, - } - - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - - // Start multiple workers - workers := make([]Worker, numWorkers) - for i := 0; i < numWorkers; i++ { - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: fmt.Sprintf("concurrent-worker-%d", i), - Concurrency: 2, - PollInterval: 50 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, - ClaimTimeout: 5 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, workflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - workers[i] = worker - } - - // Submit workflows - workflowIDs := 
make([]string, numWorkflows) - for i := 0; i < numWorkflows; i++ { - workflowID, err := workflow.Start(ctx, fmt.Sprintf("concurrent-payload-%d", i)) - require.NoError(t, err) - workflowIDs[i] = workflowID - } - - // Wait for workflows to finish using event collection - require.Eventually(t, func() bool { - completedEvents := eventCollector.Count(testharness.WorkflowCompleted) - failedEvents := eventCollector.Count(testharness.WorkflowFailed) - totalFinished := completedEvents + failedEvents - - return totalFinished == numWorkflows - }, 12*time.Second, 300*time.Millisecond, "All concurrent workflows should finish") - - // Verify exactly-once execution for each workflow - duplicateExecutions := 0 - duplicateCompletions := 0 - - for _, workflowID := range workflowIDs { - // Check for duplicate workflow executions - startedEvents := eventCollector.FilterWithData(testharness.WorkflowStarted, "execution_id", workflowID) - if len(startedEvents) > 1 { - duplicateExecutions++ - t.Errorf("DUPLICATE EXECUTION: Workflow %s started %d times", workflowID, len(startedEvents)) - } - require.Len(t, startedEvents, 1, "Workflow %s should start exactly once", workflowID) - - // Check for duplicate completions - completedEvents := eventCollector.FilterWithData(testharness.WorkflowCompleted, "execution_id", workflowID) - failedEvents := eventCollector.FilterWithData(testharness.WorkflowFailed, "execution_id", workflowID) - totalCompletions := len(completedEvents) + len(failedEvents) - - if totalCompletions > 1 { - duplicateCompletions++ - t.Errorf("DUPLICATE COMPLETION: Workflow %s completed %d times (%d completed + %d failed)", - workflowID, totalCompletions, len(completedEvents), len(failedEvents)) - } - require.Equal(t, 1, totalCompletions, "Workflow %s should complete exactly once", workflowID) - - // Verify exactly one step execution - stepExecutingEvents := eventCollector.FilterWithData(testharness.StepExecuting, "execution_id", workflowID) - require.Len(t, stepExecutingEvents, 1, 
"Workflow %s should have exactly one step execution", workflowID) - } - - // Assert no duplicates were found - require.Equal(t, 0, duplicateExecutions, "Should have zero duplicate workflow executions") - require.Equal(t, 0, duplicateCompletions, "Should have zero duplicate workflow completions") - - // Verify database consistency - // GetAllWorkflows was removed - check completed workflows indirectly - // Since we know we created specific workflows, verify them individually - completedInDB := 0 - for _, id := range workflowIDs { - wf, err := store.Query.GetWorkflow(context.Background(), engine.GetDB(), store.GetWorkflowParams{ - ID: id, - Namespace: engine.GetNamespace(), - }) - if err == nil && wf.Status == store.WorkflowExecutionsStatusCompleted { - completedInDB++ - } - } - - completedEventsCount := eventCollector.Count(testharness.WorkflowCompleted) - require.Equal(t, completedEventsCount, completedInDB, - "Database completed count should match completed events") - -} - -// eventTrackingWorkflow emits events during execution for testing -type eventTrackingWorkflow struct { - engine *Engine - name string - collector *testharness.EventCollector -} - -func (w *eventTrackingWorkflow) Name() string { - return w.name -} - -func (w *eventTrackingWorkflow) Run(ctx WorkflowContext, req any) error { - // Emit workflow started event - w.collector.Emit(ctx, testharness.WorkflowStarted, "Workflow execution started") - - // Emit step executing event - w.collector.Emit(ctx, testharness.StepExecuting, "Step execution started", "step_name", "consistency-step") - - // Execute the step - result, err := Step(ctx, "consistency-step", func(stepCtx context.Context) (string, error) { - // Simulate some work - time.Sleep(20 * time.Millisecond) - return "step-completed", nil - }) - - if err != nil { - // Emit step failed event - w.collector.Emit(ctx, testharness.StepFailed, "Step execution failed", - "step_name", "consistency-step", "error", err.Error()) - - // Emit workflow failed 
event - w.collector.Emit(ctx, testharness.WorkflowFailed, "Workflow execution failed", "error", err.Error()) - - return err - } - - // Emit step executed event - w.collector.Emit(ctx, testharness.StepExecuted, "Step execution completed", - "step_name", "consistency-step", "result", result) - - // Emit workflow completed event - w.collector.Emit(ctx, testharness.WorkflowCompleted, "Workflow execution completed") - - return nil -} - -func (w *eventTrackingWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/sleep.go b/go/pkg/hydra/sleep.go deleted file mode 100644 index b824a8844e..0000000000 --- a/go/pkg/hydra/sleep.go +++ /dev/null @@ -1,133 +0,0 @@ -package hydra - -import ( - "database/sql" - "fmt" - "time" - - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/uid" -) - -// Sleep suspends workflow execution for the specified duration. -// -// This function allows workflows to pause execution and resume after -// a specified time period. The workflow will be marked as sleeping -// and workers will not attempt to execute it until the sleep duration -// has elapsed. -// -// Sleep is useful for: -// - Time-based coordination (e.g., waiting for settlement periods) -// - Human approval workflows (e.g., waiting for manual intervention) -// - Rate limiting and backoff strategies -// - Scheduled processing windows -// -// The sleep duration is durable - if the worker crashes or restarts -// during the sleep period, the workflow will still resume at the -// correct time. 
-// -// Example usage: -// -// // Sleep for 24 hours for manual approval -// err = hydra.Sleep(ctx, 24*time.Hour) -// if err != nil { -// return err -// } -// -// // Continue with post-approval processing -// result, err := hydra.Step(ctx, "post-approval", func(stepCtx context.Context) (string, error) { -// return processApprovedRequest(stepCtx) -// }) -// -// Note: Sleep creates an internal step to track the sleep state. -// The step name is generated automatically based on the duration. -// -// Metrics recorded: -// - hydra_sleeps_started_total (counter) -// - hydra_workflows_sleeping (gauge) -func Sleep(ctx WorkflowContext, duration time.Duration) error { - wctx, ok := ctx.(*workflowContext) - if !ok { - return fmt.Errorf("invalid workflow context") - } - - stepName := fmt.Sprintf("sleep-%d", duration.Milliseconds()) - - _, err := store.Query.GetCompletedStep(wctx.ctx, wctx.db, store.GetCompletedStepParams{ - Namespace: wctx.namespace, - ExecutionID: wctx.ExecutionID(), - StepName: stepName, - }) - if err == nil { - return nil - } - - now := time.Now().UnixMilli() - existingStep, err := store.Query.GetStep(wctx.ctx, wctx.db, store.GetStepParams{ - Namespace: wctx.namespace, - ExecutionID: wctx.ExecutionID(), - StepName: stepName, - }) - if err == nil && existingStep.StartedAt.Valid { - sleepUntil := existingStep.StartedAt.Int64 + duration.Milliseconds() - - if sleepUntil <= now { - return wctx.markStepCompleted(existingStep.ID, []byte("{}")) - } - return store.Query.SleepWorkflow(wctx.ctx, wctx.db, store.SleepWorkflowParams{ - SleepUntil: sql.NullInt64{Int64: sleepUntil, Valid: true}, - ID: wctx.ExecutionID(), - Namespace: wctx.namespace, - }) - } - - sleepUntil := now + duration.Milliseconds() - - // Create sleep step with lease validation - only if worker holds valid lease - stepID := uid.New(uid.StepPrefix) - result, err := wctx.db.ExecContext(wctx.ctx, ` - INSERT INTO workflow_steps ( - id, execution_id, step_name, status, output_data, error_message, - 
started_at, completed_at, max_attempts, remaining_attempts, namespace - ) - SELECT ?, ?, ?, ?, ?, ?, - ?, ?, ?, ?, ? - WHERE EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - )`, - stepID, - wctx.ExecutionID(), - stepName, - store.WorkflowStepsStatusRunning, - []byte{}, - sql.NullString{String: "", Valid: false}, - sql.NullInt64{Int64: now, Valid: true}, - sql.NullInt64{Int64: 0, Valid: false}, - 1, // Sleep doesn't need retries - 1, - wctx.namespace, - wctx.ExecutionID(), // resource_id for lease check - wctx.workerID, // worker_id for lease check - now, // expires_at check - ) - if err != nil { - return fmt.Errorf("failed to create sleep step: %w", err) - } - - // Check if the step creation actually happened (lease validation) - rowsAffected, err := result.RowsAffected() - if err != nil { - return fmt.Errorf("failed to check step creation result: %w", err) - } - if rowsAffected == 0 { - return fmt.Errorf("sleep step creation failed: lease expired or invalid") - } - - return store.Query.SleepWorkflow(wctx.ctx, wctx.db, store.SleepWorkflowParams{ - SleepUntil: sql.NullInt64{Int64: sleepUntil, Valid: true}, - ID: wctx.ExecutionID(), - Namespace: wctx.namespace, - }) -} diff --git a/go/pkg/hydra/step.go b/go/pkg/hydra/step.go deleted file mode 100644 index 2dc8c209ad..0000000000 --- a/go/pkg/hydra/step.go +++ /dev/null @@ -1,311 +0,0 @@ -package hydra - -import ( - "context" - "database/sql" - "fmt" - "reflect" - "time" - - "github.com/unkeyed/unkey/go/pkg/hydra/metrics" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/otel/tracing" - "github.com/unkeyed/unkey/go/pkg/uid" - "go.opentelemetry.io/otel/attribute" -) - -// Step executes a named step within a workflow with automatic checkpointing and retry logic. -// -// Steps are the fundamental units of work in Hydra workflows. 
They provide: -// - Exactly-once execution guarantees -// - Automatic result caching (checkpointing) -// - Built-in retry logic for transient failures -// - Comprehensive metrics and observability -// -// Parameters: -// - ctx: The workflow context from the workflow's Run() method -// - stepName: A unique name for this step within the workflow -// - fn: The function to execute, which should be idempotent -// -// The stepName must be unique within the workflow and should remain stable -// across deployments. If a step has already completed successfully, its -// cached result will be returned without re-executing the function. -// -// The function fn receives a standard Go context and should: -// - Be idempotent (safe to run multiple times) -// - Handle context cancellation gracefully -// - Return consistent results for the same inputs -// - Use the provided context for any I/O operations -// -// Example usage: -// -// // Simple step with string result -// result, err := hydra.Step(ctx, "fetch-user", func(stepCtx context.Context) (string, error) { -// user, err := userService.GetUser(stepCtx, userID) -// if err != nil { -// return "", err -// } -// return user.Name, nil -// }) -// -// // Step with complex result type -// order, err := hydra.Step(ctx, "create-order", func(stepCtx context.Context) (*Order, error) { -// return orderService.CreateOrder(stepCtx, &CreateOrderRequest{ -// CustomerID: customerID, -// Items: items, -// }) -// }) -// -// Metrics recorded: -// - hydra_steps_executed_total (counter with status) -// - hydra_step_duration_seconds (histogram) -// - hydra_steps_cached_total (counter for cache hits) -// - hydra_steps_retried_total (counter for retry attempts) -// -// Returns the result of the function execution or the cached result if the -// step has already completed successfully. 
-func Step[TResponse any](ctx WorkflowContext, stepName string, fn func(context.Context) (TResponse, error)) (TResponse, error) { - var zero TResponse - - wctx, ok := ctx.(*workflowContext) - if !ok { - return zero, fmt.Errorf("invalid workflow context") - } - - // Start tracing span for this step - stepCtx, span := tracing.Start(wctx.ctx, fmt.Sprintf("hydra.step.%s", stepName)) - defer span.End() - - span.SetAttributes( - attribute.String("hydra.step.name", stepName), - attribute.String("hydra.workflow.name", wctx.workflowName), - attribute.String("hydra.execution.id", wctx.executionID), - attribute.String("hydra.namespace", wctx.namespace), - ) - - existing, err := store.Query.GetCompletedStep(wctx.ctx, wctx.db, store.GetCompletedStepParams{ - Namespace: wctx.namespace, - ExecutionID: wctx.ExecutionID(), - StepName: stepName, - }) - if err == nil { - // Record cached step hit - metrics.StepsCachedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName).Inc() - span.SetAttributes(attribute.Bool("hydra.step.cached", true)) - - responseType := reflect.TypeOf((*TResponse)(nil)).Elem() - var response TResponse - - if responseType.Kind() == reflect.Ptr { - responseValue := reflect.New(responseType.Elem()) - var ok bool - response, ok = responseValue.Interface().(TResponse) - if !ok { - conversionErr := fmt.Errorf("failed to convert response to expected type") - tracing.RecordError(span, conversionErr) - return zero, conversionErr - } - } - - if len(existing.OutputData) > 0 { - err = wctx.marshaller.Unmarshal(existing.OutputData, &response) - if err != nil { - metrics.RecordError(wctx.namespace, "step", "unmarshal_cached_result_failed") - tracing.RecordError(span, err) - return zero, fmt.Errorf("failed to unmarshal cached step result: %w", err) - } - } - - return response, nil - } - - span.SetAttributes(attribute.Bool("hydra.step.cached", false)) - - _, err = store.Query.GetStep(wctx.ctx, wctx.db, store.GetStepParams{ - Namespace: wctx.namespace, - 
ExecutionID: wctx.ExecutionID(), - StepName: stepName, - }) - var stepToUse *store.WorkflowStep - shouldCreateNewStep := err != nil // sql.ErrNoRows means step doesn't exist, so create new one - - stepStartTime := time.Now() - - if shouldCreateNewStep { - stepID := uid.New(uid.StepPrefix) - - // Create step with lease validation - only if worker holds valid lease - now := time.Now().UnixMilli() - createResult, createErr := wctx.db.ExecContext(wctx.ctx, ` - INSERT INTO workflow_steps ( - id, execution_id, step_name, status, output_data, error_message, - started_at, completed_at, max_attempts, remaining_attempts, namespace - ) - SELECT ?, ?, ?, ?, ?, ?, - ?, ?, ?, ?, ? - WHERE EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - )`, - stepID, - wctx.ExecutionID(), - stepName, - store.WorkflowStepsStatusRunning, - []byte{}, - sql.NullString{String: "", Valid: false}, - sql.NullInt64{Int64: stepStartTime.UnixMilli(), Valid: true}, - sql.NullInt64{Int64: 0, Valid: false}, - wctx.stepMaxAttempts, - wctx.stepMaxAttempts, - wctx.namespace, - wctx.ExecutionID(), // resource_id for lease check - wctx.workerID, // worker_id for lease check - now, // expires_at check - ) - if createErr != nil { - return zero, fmt.Errorf("failed to create step: %w", createErr) - } - - // Check if the step creation actually happened (lease validation) - rowsAffected, rowsErr := createResult.RowsAffected() - if rowsErr != nil { - return zero, fmt.Errorf("failed to check step creation result: %w", rowsErr) - } - if rowsAffected == 0 { - return zero, fmt.Errorf("step creation failed: lease expired or invalid") - } - - // Step created successfully - span.SetAttributes(attribute.Bool("hydra.step.new", true)) - } else { - // Update existing step to running status with lease validation - now := time.Now().UnixMilli() - updateResult, updateErr := wctx.db.ExecContext(wctx.ctx, ` - UPDATE workflow_steps - SET status = ?, completed_at = ?, 
output_data = ?, error_message = ? - WHERE workflow_steps.namespace = ? AND execution_id = ? AND step_name = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - )`, - store.WorkflowStepsStatusRunning, - sql.NullInt64{Int64: 0, Valid: false}, - []byte{}, - sql.NullString{String: "", Valid: false}, - wctx.namespace, - wctx.ExecutionID(), - stepName, - wctx.ExecutionID(), // resource_id for lease check - wctx.workerID, // worker_id for lease check - now, // expires_at check - ) - if updateErr != nil { - return zero, fmt.Errorf("failed to update step status: %w", updateErr) - } - - // Check if the update actually happened (lease validation) - rowsAffected, rowsErr := updateResult.RowsAffected() - if rowsErr != nil { - return zero, fmt.Errorf("failed to check step update result: %w", rowsErr) - } - if rowsAffected == 0 { - return zero, fmt.Errorf("step update failed: lease expired or invalid") - } - - // Get the step after successful update - stepResult, getErr := store.Query.GetStep(wctx.ctx, wctx.db, store.GetStepParams{ - Namespace: wctx.namespace, - ExecutionID: wctx.ExecutionID(), - StepName: stepName, - }) - stepToUse = &stepResult - if getErr != nil { - return zero, fmt.Errorf("failed to retrieve updated step: %w", getErr) - } - - // Record step retry - if stepToUse.RemainingAttempts < stepToUse.MaxAttempts { - metrics.StepsRetriedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName).Inc() - span.SetAttributes(attribute.Bool("hydra.step.retry", true)) - } - span.SetAttributes(attribute.Bool("hydra.step.new", false)) - } - - response, err := fn(stepCtx) - if err != nil { - tracing.RecordError(span, err) - wctx.logger.Error("step execution failed", "error", err.Error()) - span.SetAttributes(attribute.String("hydra.step.status", "failed")) - - if markErr := wctx.markStepFailed(stepName, err.Error()); markErr != nil { - metrics.RecordError(wctx.namespace, "step", 
"mark_step_failed_error") - } - metrics.ObserveStepDuration(wctx.namespace, wctx.workflowName, stepName, "failed", stepStartTime) - metrics.StepsExecutedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName, "failed").Inc() - return zero, fmt.Errorf("step execution failed: %w", err) - } - - respData, err := wctx.marshaller.Marshal(response) - if err != nil { - tracing.RecordError(span, err) - span.SetAttributes(attribute.String("hydra.step.status", "failed")) - - if markErr := wctx.markStepFailed(stepName, err.Error()); markErr != nil { - metrics.RecordError(wctx.namespace, "step", "mark_step_failed_error") - } - metrics.ObserveStepDuration(wctx.namespace, wctx.workflowName, stepName, "failed", stepStartTime) - metrics.StepsExecutedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName, "failed").Inc() - metrics.RecordError(wctx.namespace, "step", "marshal_response_failed") - return zero, fmt.Errorf("failed to marshal response: %w", err) - } - - err = wctx.markStepCompleted(stepName, respData) - if err != nil { - tracing.RecordError(span, err) - span.SetAttributes(attribute.String("hydra.step.status", "failed")) - metrics.RecordError(wctx.namespace, "step", "mark_completed_failed") - return zero, fmt.Errorf("failed to mark step completed: %w", err) - } - - span.SetAttributes(attribute.String("hydra.step.status", "completed")) - metrics.ObserveStepDuration(wctx.namespace, wctx.workflowName, stepName, "completed", stepStartTime) - metrics.StepsExecutedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName, "completed").Inc() - - return response, nil -} - -// StepVoid executes a named step within a workflow that performs side effects but doesn't return a value. -// -// This is a convenience wrapper around Step for functions that only return an error. -// It's perfect for steps that perform database updates, send notifications, or other -// side effects where the result itself isn't needed by subsequent steps. 
-// -// Parameters: -// - ctx: The workflow context from the workflow's Run() method -// - stepName: A unique name for this step within the workflow -// - fn: The function to execute, which should be idempotent and only return an error -// -// Example usage: -// -// // Database update step -// err := hydra.StepVoid(ctx, "update-user-status", func(stepCtx context.Context) error { -// return userService.UpdateStatus(stepCtx, userID, "active") -// }) -// -// // Notification step -// err := hydra.StepVoid(ctx, "send-email", func(stepCtx context.Context) error { -// return emailService.SendWelcomeEmail(stepCtx, userEmail) -// }) -// -// Returns only an error if the step execution fails. -func StepVoid(ctx WorkflowContext, stepName string, fn func(context.Context) error) error { - _, err := Step(ctx, stepName, func(stepCtx context.Context) (*struct{}, error) { - if err := fn(stepCtx); err != nil { - return nil, err - } - return &struct{}{}, nil - }) - return err -} diff --git a/go/pkg/hydra/step_atomicity_test.go b/go/pkg/hydra/step_atomicity_test.go deleted file mode 100644 index e76b68634c..0000000000 --- a/go/pkg/hydra/step_atomicity_test.go +++ /dev/null @@ -1,217 +0,0 @@ -package hydra - -import ( - "context" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/uid" -) - -// TestStepExecutionAtomicity ensures that step execution is atomic: -// either a step fully completes (executes + status update) or it doesn't execute at all. -// This prevents duplicate side effects when status updates fail after step execution. 
-func TestStepExecutionAtomicity(t *testing.T) { - // Arrange: Create engine with test clock and a workflow that tracks execution attempts - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - var stepExecutionCount int64 - var sideEffectsCount int64 // Track side effects that should only happen once - - // Create a workflow with a step that has side effects - workflow := &atomicityTestWorkflow{ - engine: engine, - name: "atomicity-test-workflow", - stepFunc: func(ctx context.Context) (string, error) { - // This represents the step execution with side effects - _ = atomic.AddInt64(&stepExecutionCount, 1) - - // Simulate important side effects (e.g., sending email, charging payment, etc.) - atomic.AddInt64(&sideEffectsCount, 1) - - return "step-result", nil - }, - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Act: Start workflow - executionID, err := workflow.Start(ctx, struct{}{}) - require.NoError(t, err) - - // Start worker - worker, err := NewWorker(engine, WorkerConfig{ - Concurrency: 1, - PollInterval: 100 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, - ClaimTimeout: 5 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, workflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - // Trigger workflow execution - require.Eventually(t, func() bool { - testClock.Tick(200 * time.Millisecond) - time.Sleep(10 * time.Millisecond) - - // Check if workflow completed - currentStatus, getErr := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - if getErr != nil { - return false - } - return currentStatus.Status == store.WorkflowExecutionsStatusCompleted - }, 5*time.Second, 50*time.Millisecond, "Workflow should complete") - - // Assert: Step should execute exactly once despite any potential failures - 
finalExecutionCount := atomic.LoadInt64(&stepExecutionCount) - finalSideEffectsCount := atomic.LoadInt64(&sideEffectsCount) - - require.Equal(t, int64(1), finalExecutionCount, - "ATOMICITY VIOLATION: Step executed %d times instead of 1. "+ - "This indicates non-atomic step execution where the step ran multiple times.", finalExecutionCount) - - require.Equal(t, int64(1), finalSideEffectsCount, - "SIDE EFFECT DUPLICATION: Side effects occurred %d times instead of 1. "+ - "This could mean duplicate emails sent, multiple payments charged, etc.", finalSideEffectsCount) - - // Verify the workflow completed successfully - finalWorkflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - require.NoError(t, err) - require.Equal(t, store.WorkflowExecutionsStatusCompleted, finalWorkflow.Status, - "Workflow should complete successfully") - -} - -// TestStepExecutionAtomicityWithFailures tests atomicity when database operations fail -func TestStepExecutionAtomicityWithFailures(t *testing.T) { - // This test would be more complex and would require mocking the store - // to simulate failures during status updates after step execution. - // For now, we'll focus on the basic atomicity test above. 
- t.Skip("TODO: Implement test with simulated database failures during status updates") -} - -// TestConcurrentStepExecution tests that multiple workers don't execute the same step -func TestConcurrentStepExecution(t *testing.T) { - // Arrange: Create engine with test clock - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - var stepExecutionCount int64 - - // Create a workflow with a step that takes time to execute - workflow := &atomicityTestWorkflow{ - engine: engine, - name: "concurrent-test-workflow", - stepFunc: func(ctx context.Context) (string, error) { - _ = atomic.AddInt64(&stepExecutionCount, 1) - - // Simulate some work time - time.Sleep(100 * time.Millisecond) - - return "concurrent-result", nil - }, - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Start workflow - executionID, err := workflow.Start(ctx, struct{}{}) - require.NoError(t, err) - - // Start multiple workers that might try to process the same workflow - worker1ID := uid.New(uid.WorkerPrefix) - worker2ID := uid.New(uid.WorkerPrefix) - - worker1, err := NewWorker(engine, WorkerConfig{ - WorkerID: worker1ID, - Concurrency: 1, - PollInterval: 50 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, - ClaimTimeout: 5 * time.Second, - }) - require.NoError(t, err) - - worker2, err := NewWorker(engine, WorkerConfig{ - WorkerID: worker2ID, - Concurrency: 1, - PollInterval: 50 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, - ClaimTimeout: 5 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker1, workflow) - require.NoError(t, err) - err = RegisterWorkflow(worker2, workflow) - require.NoError(t, err) - - err = worker1.Start(ctx) - require.NoError(t, err) - defer worker1.Shutdown(ctx) - - err = worker2.Start(ctx) - require.NoError(t, err) - defer worker2.Shutdown(ctx) - - // Trigger both workers to poll simultaneously - require.Eventually(t, func() bool { - 
testClock.Tick(100 * time.Millisecond) - time.Sleep(20 * time.Millisecond) - - // Check if workflow completed - currentStatus, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - if err != nil { - return false - } - return currentStatus.Status == store.WorkflowExecutionsStatusCompleted - }, 5*time.Second, 50*time.Millisecond, "Workflow should complete with concurrent workers") - - // Assert: Step should execute exactly once even with multiple workers - finalExecutionCount := atomic.LoadInt64(&stepExecutionCount) - require.Equal(t, int64(1), finalExecutionCount, - "CONCURRENCY VIOLATION: Step executed %d times instead of 1. "+ - "Multiple workers executed the same step, violating exactly-once guarantees.", finalExecutionCount) - -} - -// atomicityTestWorkflow is a test workflow for testing step execution atomicity -type atomicityTestWorkflow struct { - engine *Engine - name string - stepFunc func(ctx context.Context) (string, error) -} - -func (w *atomicityTestWorkflow) Name() string { - return w.name -} - -func (w *atomicityTestWorkflow) Run(ctx WorkflowContext, req any) error { - _, err := Step(ctx, "atomic-step", w.stepFunc) - return err -} - -func (w *atomicityTestWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/step_idempotency_test.go b/go/pkg/hydra/step_idempotency_test.go deleted file mode 100644 index e58113d926..0000000000 --- a/go/pkg/hydra/step_idempotency_test.go +++ /dev/null @@ -1,168 +0,0 @@ -package hydra - -import ( - "context" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/hydra/store" -) - -// TestStepIdempotencyDuringWorkerFailure guarantees that workflow steps are idempotent -// and execute exactly once, even when workers fail and other workers 
resume the workflow. -// -// This prevents duplicate side effects like sending emails twice, processing payments -// multiple times, or creating duplicate database records during worker handoffs. -func TestStepIdempotencyDuringWorkerFailure(t *testing.T) { - // Arrange: Create engine with test clock for deterministic timing - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - var stepExecutionCount int64 - - // Create a workflow with one step that takes time to execute - workflow := &testWorkflow{ - engine: engine, - name: "idempotency-test-workflow", - stepFunc: func(ctx context.Context) (string, error) { - // This should only execute once, but the bug causes it to execute multiple times - atomic.AddInt64(&stepExecutionCount, 1) - - // Step executes instantly - we'll control timing via test clock and worker coordination - return "step-completed", nil - }, - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Act: Start workflow using the preferred pattern - executionID, err := workflow.Start(ctx, struct{}{}) - require.NoError(t, err) - - // Start first worker to begin processing - worker1, err := NewWorker(engine, WorkerConfig{ - Concurrency: 1, - PollInterval: 50 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, // Short heartbeat for faster cleanup - ClaimTimeout: 2 * time.Second, // Short timeout to simulate crash - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker1, workflow) - require.NoError(t, err) - - err = worker1.Start(ctx) - require.NoError(t, err) - - // Let worker1 start processing - advance test clock to trigger polling - testClock.Tick(100 * time.Millisecond) // Trigger initial poll - - // Give a brief moment for worker1 to process (this is unavoidable for goroutine coordination) - time.Sleep(50 * time.Millisecond) - - // Keep triggering polls until workflow is picked up - for i := 0; i < 10; i++ { - testClock.Tick(100 * time.Millisecond) - 
time.Sleep(10 * time.Millisecond) - - // Check if workflow has been picked up - currentStatus, getErr := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - require.NoError(t, getErr) - if currentStatus.Status != store.WorkflowExecutionsStatusPending { - break - } - } - - // Check that workflow is being processed - _, err = store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - require.NoError(t, err) - - // Simulate worker1 crash by shutting it down - err = worker1.Shutdown(context.Background()) - require.NoError(t, err) - - // Advance time to expire the lease and trigger cleanup - // Leases expire after ClaimTimeout (2 seconds), cleanup runs every HeartbeatInterval * 2 (2 seconds) - testClock.Tick(3 * time.Second) // Advance past lease expiration + cleanup interval - - // Start worker2 to take over the workflow - worker2, err := NewWorker(engine, WorkerConfig{ - Concurrency: 1, - PollInterval: 50 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, // Short heartbeat for faster cleanup - ClaimTimeout: 5 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker2, workflow) - require.NoError(t, err) - - err = worker2.Start(ctx) - require.NoError(t, err) - defer worker2.Shutdown(ctx) - - // Advance time to trigger worker2 polling and cleanup detection - testClock.Tick(200 * time.Millisecond) // Trigger worker2 polling - - // Keep triggering polls and cleanup until workflow is picked up by worker2 - for i := 0; i < 20; i++ { - testClock.Tick(200 * time.Millisecond) // Trigger polling and cleanup - time.Sleep(10 * time.Millisecond) - - // Check if workflow has been picked up - currentStatus, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - require.NoError(t, err) - if currentStatus.Status != 
store.WorkflowExecutionsStatusPending { - break - } - } - - // Wait for worker2 to complete the workflow - finalResult := waitForWorkflowCompletion(t, engine, executionID, 3*time.Second) - // Check final step execution count - finalCount := atomic.LoadInt64(&stepExecutionCount) - - // Assert: Step idempotency - should execute exactly once despite worker failure - require.Equal(t, int64(1), finalCount, - "STEP IDEMPOTENCY VIOLATION: Step executed %d times instead of 1. "+ - "This could cause duplicate side effects like sending emails twice, "+ - "processing payments multiple times, or creating duplicate records.", finalCount) - - // Verify workflow completed successfully - require.Equal(t, store.WorkflowExecutionsStatusCompleted, finalResult.Status, "Workflow should complete successfully despite worker crash") -} - -// testWorkflow is a minimal workflow for testing step idempotency -type testWorkflow struct { - engine *Engine - name string - stepFunc func(ctx context.Context) (string, error) -} - -func (w *testWorkflow) Name() string { - return w.name -} - -func (w *testWorkflow) Run(ctx WorkflowContext, req any) error { - _, err := Step(ctx, "test-step", w.stepFunc) - return err -} - -// Start is a convenience method that starts this workflow using the embedded engine -// This encourages a cleaner API pattern: workflow.Start() instead of engine.StartWorkflow() -func (w *testWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/store/db.go b/go/pkg/hydra/store/db.go deleted file mode 100644 index a52622d0b3..0000000000 --- a/go/pkg/hydra/store/db.go +++ /dev/null @@ -1,27 +0,0 @@ -package store - -import ( - "context" - "database/sql" -) - -type DBTX interface { - ExecContext(context.Context, string, ...interface{}) (sql.Result, error) - PrepareContext(context.Context, string) (*sql.Stmt, error) - QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error) 
- QueryRowContext(context.Context, string, ...interface{}) *sql.Row -} - -func New(db DBTX) *Queries { - return &Queries{db: db} -} - -type Queries struct { - db DBTX -} - -func (q *Queries) WithTx(tx *sql.Tx) *Queries { - return &Queries{ - db: tx, - } -} diff --git a/go/pkg/hydra/store/generate.go b/go/pkg/hydra/store/generate.go deleted file mode 100644 index 712573026b..0000000000 --- a/go/pkg/hydra/store/generate.go +++ /dev/null @@ -1,6 +0,0 @@ -package store - -//go:generate sqlc generate -f sqlc.json -// we copy all of the relevant bits into queries.go and don't want the default -// exports that get generated -//go:generate rm delete_me.go diff --git a/go/pkg/hydra/store/models.go b/go/pkg/hydra/store/models.go deleted file mode 100644 index e948928a3d..0000000000 --- a/go/pkg/hydra/store/models.go +++ /dev/null @@ -1,245 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.29.0 - -package store - -import ( - "database/sql" - "database/sql/driver" - "fmt" -) - -type LeasesKind string - -const ( - LeasesKindWorkflow LeasesKind = "workflow" - LeasesKindStep LeasesKind = "step" - LeasesKindCronJob LeasesKind = "cron_job" -) - -func (e *LeasesKind) Scan(src interface{}) error { - switch s := src.(type) { - case []byte: - *e = LeasesKind(s) - case string: - *e = LeasesKind(s) - default: - return fmt.Errorf("unsupported scan type for LeasesKind: %T", src) - } - return nil -} - -type NullLeasesKind struct { - LeasesKind LeasesKind `json:"leases_kind"` - Valid bool `json:"valid"` // Valid is true if LeasesKind is not NULL -} - -// Scan implements the Scanner interface. -func (ns *NullLeasesKind) Scan(value interface{}) error { - if value == nil { - ns.LeasesKind, ns.Valid = "", false - return nil - } - ns.Valid = true - return ns.LeasesKind.Scan(value) -} - -// Value implements the driver Valuer interface. 
-func (ns NullLeasesKind) Value() (driver.Value, error) { - if !ns.Valid { - return nil, nil - } - return string(ns.LeasesKind), nil -} - -type WorkflowExecutionsStatus string - -const ( - WorkflowExecutionsStatusPending WorkflowExecutionsStatus = "pending" - WorkflowExecutionsStatusRunning WorkflowExecutionsStatus = "running" - WorkflowExecutionsStatusSleeping WorkflowExecutionsStatus = "sleeping" - WorkflowExecutionsStatusCompleted WorkflowExecutionsStatus = "completed" - WorkflowExecutionsStatusFailed WorkflowExecutionsStatus = "failed" -) - -func (e *WorkflowExecutionsStatus) Scan(src interface{}) error { - switch s := src.(type) { - case []byte: - *e = WorkflowExecutionsStatus(s) - case string: - *e = WorkflowExecutionsStatus(s) - default: - return fmt.Errorf("unsupported scan type for WorkflowExecutionsStatus: %T", src) - } - return nil -} - -type NullWorkflowExecutionsStatus struct { - WorkflowExecutionsStatus WorkflowExecutionsStatus `json:"workflow_executions_status"` - Valid bool `json:"valid"` // Valid is true if WorkflowExecutionsStatus is not NULL -} - -// Scan implements the Scanner interface. -func (ns *NullWorkflowExecutionsStatus) Scan(value interface{}) error { - if value == nil { - ns.WorkflowExecutionsStatus, ns.Valid = "", false - return nil - } - ns.Valid = true - return ns.WorkflowExecutionsStatus.Scan(value) -} - -// Value implements the driver Valuer interface. 
-func (ns NullWorkflowExecutionsStatus) Value() (driver.Value, error) { - if !ns.Valid { - return nil, nil - } - return string(ns.WorkflowExecutionsStatus), nil -} - -type WorkflowExecutionsTriggerType string - -const ( - WorkflowExecutionsTriggerTypeManual WorkflowExecutionsTriggerType = "manual" - WorkflowExecutionsTriggerTypeCron WorkflowExecutionsTriggerType = "cron" - WorkflowExecutionsTriggerTypeEvent WorkflowExecutionsTriggerType = "event" - WorkflowExecutionsTriggerTypeApi WorkflowExecutionsTriggerType = "api" -) - -func (e *WorkflowExecutionsTriggerType) Scan(src interface{}) error { - switch s := src.(type) { - case []byte: - *e = WorkflowExecutionsTriggerType(s) - case string: - *e = WorkflowExecutionsTriggerType(s) - default: - return fmt.Errorf("unsupported scan type for WorkflowExecutionsTriggerType: %T", src) - } - return nil -} - -type NullWorkflowExecutionsTriggerType struct { - WorkflowExecutionsTriggerType WorkflowExecutionsTriggerType `json:"workflow_executions_trigger_type"` - Valid bool `json:"valid"` // Valid is true if WorkflowExecutionsTriggerType is not NULL -} - -// Scan implements the Scanner interface. -func (ns *NullWorkflowExecutionsTriggerType) Scan(value interface{}) error { - if value == nil { - ns.WorkflowExecutionsTriggerType, ns.Valid = "", false - return nil - } - ns.Valid = true - return ns.WorkflowExecutionsTriggerType.Scan(value) -} - -// Value implements the driver Valuer interface. 
-func (ns NullWorkflowExecutionsTriggerType) Value() (driver.Value, error) { - if !ns.Valid { - return nil, nil - } - return string(ns.WorkflowExecutionsTriggerType), nil -} - -type WorkflowStepsStatus string - -const ( - WorkflowStepsStatusPending WorkflowStepsStatus = "pending" - WorkflowStepsStatusRunning WorkflowStepsStatus = "running" - WorkflowStepsStatusCompleted WorkflowStepsStatus = "completed" - WorkflowStepsStatusFailed WorkflowStepsStatus = "failed" -) - -func (e *WorkflowStepsStatus) Scan(src interface{}) error { - switch s := src.(type) { - case []byte: - *e = WorkflowStepsStatus(s) - case string: - *e = WorkflowStepsStatus(s) - default: - return fmt.Errorf("unsupported scan type for WorkflowStepsStatus: %T", src) - } - return nil -} - -type NullWorkflowStepsStatus struct { - WorkflowStepsStatus WorkflowStepsStatus `json:"workflow_steps_status"` - Valid bool `json:"valid"` // Valid is true if WorkflowStepsStatus is not NULL -} - -// Scan implements the Scanner interface. -func (ns *NullWorkflowStepsStatus) Scan(value interface{}) error { - if value == nil { - ns.WorkflowStepsStatus, ns.Valid = "", false - return nil - } - ns.Valid = true - return ns.WorkflowStepsStatus.Scan(value) -} - -// Value implements the driver Valuer interface. 
-func (ns NullWorkflowStepsStatus) Value() (driver.Value, error) { - if !ns.Valid { - return nil, nil - } - return string(ns.WorkflowStepsStatus), nil -} - -type CronJob struct { - ID string `db:"id" json:"id"` - Name string `db:"name" json:"name"` - CronSpec string `db:"cron_spec" json:"cron_spec"` - Namespace string `db:"namespace" json:"namespace"` - WorkflowName sql.NullString `db:"workflow_name" json:"workflow_name"` - Enabled bool `db:"enabled" json:"enabled"` - CreatedAt int64 `db:"created_at" json:"created_at"` - UpdatedAt int64 `db:"updated_at" json:"updated_at"` - LastRunAt sql.NullInt64 `db:"last_run_at" json:"last_run_at"` - NextRunAt int64 `db:"next_run_at" json:"next_run_at"` -} - -type Lease struct { - ResourceID string `db:"resource_id" json:"resource_id"` - Kind LeasesKind `db:"kind" json:"kind"` - Namespace string `db:"namespace" json:"namespace"` - WorkerID string `db:"worker_id" json:"worker_id"` - AcquiredAt int64 `db:"acquired_at" json:"acquired_at"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` - HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"` -} - -type WorkflowExecution struct { - ID string `db:"id" json:"id"` - WorkflowName string `db:"workflow_name" json:"workflow_name"` - Status WorkflowExecutionsStatus `db:"status" json:"status"` - InputData []byte `db:"input_data" json:"input_data"` - OutputData []byte `db:"output_data" json:"output_data"` - ErrorMessage sql.NullString `db:"error_message" json:"error_message"` - CreatedAt int64 `db:"created_at" json:"created_at"` - StartedAt sql.NullInt64 `db:"started_at" json:"started_at"` - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - MaxAttempts int32 `db:"max_attempts" json:"max_attempts"` - RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"` - NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"` - Namespace string `db:"namespace" json:"namespace"` - TriggerType NullWorkflowExecutionsTriggerType `db:"trigger_type" 
json:"trigger_type"` - TriggerSource sql.NullString `db:"trigger_source" json:"trigger_source"` - SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"` - TraceID sql.NullString `db:"trace_id" json:"trace_id"` - SpanID sql.NullString `db:"span_id" json:"span_id"` -} - -type WorkflowStep struct { - ID string `db:"id" json:"id"` - ExecutionID string `db:"execution_id" json:"execution_id"` - StepName string `db:"step_name" json:"step_name"` - Status WorkflowStepsStatus `db:"status" json:"status"` - OutputData []byte `db:"output_data" json:"output_data"` - ErrorMessage sql.NullString `db:"error_message" json:"error_message"` - StartedAt sql.NullInt64 `db:"started_at" json:"started_at"` - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - MaxAttempts int32 `db:"max_attempts" json:"max_attempts"` - RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"` - Namespace string `db:"namespace" json:"namespace"` -} diff --git a/go/pkg/hydra/store/querier.go b/go/pkg/hydra/store/querier.go deleted file mode 100644 index a9a208732a..0000000000 --- a/go/pkg/hydra/store/querier.go +++ /dev/null @@ -1,41 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. 
-// versions: -// sqlc v1.29.0 - -package store - -import ( - "context" -) - -type Querier interface { - CleanupExpiredLeases(ctx context.Context, db DBTX, arg CleanupExpiredLeasesParams) error - CompleteWorkflow(ctx context.Context, db DBTX, arg CompleteWorkflowParams) error - CreateCronJob(ctx context.Context, db DBTX, arg CreateCronJobParams) error - CreateLease(ctx context.Context, db DBTX, arg CreateLeaseParams) error - CreateStep(ctx context.Context, db DBTX, arg CreateStepParams) error - CreateWorkflow(ctx context.Context, db DBTX, arg CreateWorkflowParams) error - GetCompletedStep(ctx context.Context, db DBTX, arg GetCompletedStepParams) (WorkflowStep, error) - GetCronJob(ctx context.Context, db DBTX, arg GetCronJobParams) (CronJob, error) - GetCronJobs(ctx context.Context, db DBTX, namespace string) ([]CronJob, error) - GetDueCronJobs(ctx context.Context, db DBTX, arg GetDueCronJobsParams) ([]CronJob, error) - GetLease(ctx context.Context, db DBTX, arg GetLeaseParams) (Lease, error) - GetPendingWorkflows(ctx context.Context, db DBTX, arg GetPendingWorkflowsParams) ([]WorkflowExecution, error) - GetPendingWorkflowsFiltered(ctx context.Context, db DBTX, arg GetPendingWorkflowsFilteredParams) ([]WorkflowExecution, error) - GetSleepingWorkflows(ctx context.Context, db DBTX, arg GetSleepingWorkflowsParams) ([]WorkflowExecution, error) - GetStep(ctx context.Context, db DBTX, arg GetStepParams) (WorkflowStep, error) - GetWorkflow(ctx context.Context, db DBTX, arg GetWorkflowParams) (WorkflowExecution, error) - HeartbeatLease(ctx context.Context, db DBTX, arg HeartbeatLeaseParams) error - ReleaseLease(ctx context.Context, db DBTX, arg ReleaseLeaseParams) error - ResetOrphanedWorkflows(ctx context.Context, db DBTX, arg ResetOrphanedWorkflowsParams) error - SleepWorkflow(ctx context.Context, db DBTX, arg SleepWorkflowParams) error - UpdateCronJob(ctx context.Context, db DBTX, arg UpdateCronJobParams) error - UpdateCronJobLastRun(ctx context.Context, db DBTX, arg 
UpdateCronJobLastRunParams) error - UpdateLease(ctx context.Context, db DBTX, arg UpdateLeaseParams) error - UpdateStepStatus(ctx context.Context, db DBTX, arg UpdateStepStatusParams) error - UpdateStepStatusWithLease(ctx context.Context, db DBTX, arg UpdateStepStatusWithLeaseParams) error - UpdateWorkflowFields(ctx context.Context, db DBTX, arg UpdateWorkflowFieldsParams) error - UpdateWorkflowToRunning(ctx context.Context, db DBTX, arg UpdateWorkflowToRunningParams) error -} - -var _ Querier = (*Queries)(nil) diff --git a/go/pkg/hydra/store/queries.go b/go/pkg/hydra/store/queries.go deleted file mode 100644 index f7208632a4..0000000000 --- a/go/pkg/hydra/store/queries.go +++ /dev/null @@ -1,22 +0,0 @@ -package store - -// Query provides access to the generated database queries defined in the SQL files -// -// Example usage: -// -// import ( -// "context" -// "database/sql" -// "github.com/unkeyed/unkey/go/pkg/hydra/store" -// ) -// -// func GetWorkflow(ctx context.Context, db *sql.DB, namespace, id string) (store.WorkflowExecution, error) { -// return store.Query.GetWorkflow(ctx, db, store.GetWorkflowParams{ -// ID: id, -// Namespace: namespace, -// }) -// } -// -// The Query object contains all the database operations defined in the SQL files -// and automatically generated by sqlc. -var Query Querier = &Queries{db: nil} diff --git a/go/pkg/hydra/store/queries/workflows.sql b/go/pkg/hydra/store/queries/workflows.sql deleted file mode 100644 index dde1870bfa..0000000000 --- a/go/pkg/hydra/store/queries/workflows.sql +++ /dev/null @@ -1,189 +0,0 @@ --- name: GetWorkflow :one -SELECT * FROM workflow_executions -WHERE id = ? 
AND namespace = ?; - --- name: CreateWorkflow :exec -INSERT INTO workflow_executions ( - id, workflow_name, status, input_data, output_data, error_message, - created_at, started_at, completed_at, max_attempts, remaining_attempts, - next_retry_at, namespace, trigger_type, trigger_source, sleep_until, - trace_id, span_id -) VALUES ( - ?, ?, ?, ?, ?, ?, - ?, ?, ?, ?, ?, - ?, ?, ?, ?, ?, - ?, ? -); - --- name: GetPendingWorkflows :many -SELECT * FROM workflow_executions -WHERE namespace = ? - AND ( - status = 'pending' - OR (status = 'failed' AND next_retry_at <= ?) - OR (status = 'sleeping' AND sleep_until <= ?) - ) -ORDER BY created_at ASC -LIMIT ?; - --- name: GetPendingWorkflowsFiltered :many -SELECT * FROM workflow_executions -WHERE namespace = ? - AND ( - status = 'pending' - OR (status = 'failed' AND next_retry_at <= ?) - OR (status = 'sleeping' AND sleep_until <= ?) - ) - AND workflow_name IN (/*SLICE:workflow_names*/?) -ORDER BY created_at ASC -LIMIT ?; - --- name: UpdateWorkflowFields :exec -UPDATE workflow_executions -SET - status = COALESCE(?, status), - error_message = COALESCE(?, error_message), - completed_at = COALESCE(?, completed_at), - started_at = COALESCE(?, started_at), - output_data = COALESCE(?, output_data), - remaining_attempts = COALESCE(?, remaining_attempts), - next_retry_at = COALESCE(?, next_retry_at), - sleep_until = COALESCE(?, sleep_until) -WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - ); - --- name: UpdateStepStatus :exec -UPDATE workflow_steps -SET status = ?, completed_at = ?, output_data = ?, error_message = ? -WHERE namespace = ? AND execution_id = ? AND step_name = ?; - --- name: SleepWorkflow :exec -UPDATE workflow_executions -SET status = 'sleeping', sleep_until = ? -WHERE id = ? 
AND namespace = ?; - --- name: CreateStep :exec -INSERT INTO workflow_steps ( - id, execution_id, step_name, status, output_data, error_message, - started_at, completed_at, max_attempts, remaining_attempts, namespace -) VALUES ( - ?, ?, ?, ?, ?, ?, - ?, ?, ?, ?, ? -); - --- name: GetStep :one -SELECT * FROM workflow_steps -WHERE namespace = ? AND execution_id = ? AND step_name = ?; - --- name: GetCompletedStep :one -SELECT * FROM workflow_steps -WHERE namespace = ? AND execution_id = ? AND step_name = ? AND status = 'completed'; - --- name: UpdateStepStatusWithLease :exec -UPDATE workflow_steps -SET status = ?, completed_at = ?, output_data = ?, error_message = ? -WHERE workflow_steps.namespace = ? AND execution_id = ? AND step_name = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - ); - --- name: GetLease :one -SELECT * FROM leases -WHERE resource_id = ? AND kind = ?; - --- name: CreateLease :exec -INSERT INTO leases ( - resource_id, kind, namespace, worker_id, acquired_at, expires_at, heartbeat_at -) VALUES ( - ?, ?, ?, ?, ?, ?, ? -); - --- name: UpdateLease :exec -UPDATE leases -SET worker_id = ?, acquired_at = ?, expires_at = ?, heartbeat_at = ? -WHERE resource_id = ? AND kind = ? AND expires_at < ?; - --- name: UpdateWorkflowToRunning :exec -UPDATE workflow_executions -SET status = 'running', - started_at = CASE WHEN started_at IS NULL THEN ? ELSE started_at END, - sleep_until = NULL -WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - ); - --- name: CompleteWorkflow :exec -UPDATE workflow_executions -SET status = 'completed', completed_at = ?, output_data = ? -WHERE id = ? AND namespace = ?; - --- name: HeartbeatLease :exec -UPDATE leases -SET heartbeat_at = ?, expires_at = ? -WHERE resource_id = ? 
AND worker_id = ?; - --- name: ReleaseLease :exec -DELETE FROM leases -WHERE resource_id = ? AND worker_id = ?; - --- name: GetSleepingWorkflows :many -SELECT * FROM workflow_executions -WHERE namespace = ? AND status = 'sleeping' AND sleep_until <= ? -ORDER BY sleep_until ASC; - --- name: GetCronJob :one -SELECT * FROM cron_jobs -WHERE namespace = ? AND name = ?; - --- name: GetCronJobs :many -SELECT * FROM cron_jobs -WHERE namespace = ? AND enabled = true; - --- name: GetDueCronJobs :many -SELECT * FROM cron_jobs -WHERE namespace = ? AND enabled = true AND next_run_at <= ?; - --- name: CreateCronJob :exec -INSERT INTO cron_jobs ( - id, name, cron_spec, namespace, workflow_name, enabled, - created_at, updated_at, last_run_at, next_run_at -) VALUES ( - ?, ?, ?, ?, ?, ?, ?, ?, ?, ? -) ON DUPLICATE KEY UPDATE - cron_spec = sqlc.arg('cron_spec'), enabled = sqlc.arg('enabled'), updated_at = sqlc.arg('updated_at'), next_run_at = sqlc.arg('next_run_at'), last_run_at = sqlc.arg('last_run_at'), next_run_at = sqlc.arg('next_run_at'); - --- name: UpdateCronJob :exec -UPDATE cron_jobs -SET cron_spec = ?, workflow_name = ?, enabled = ?, updated_at = ?, next_run_at = ? -WHERE id = ? AND namespace = ?; - --- name: UpdateCronJobLastRun :exec -UPDATE cron_jobs -SET last_run_at = ?, next_run_at = ?, updated_at = ? -WHERE id = ? AND namespace = ?; - --- name: CleanupExpiredLeases :exec -DELETE FROM leases -WHERE namespace = ? AND expires_at < ?; - - --- name: ResetOrphanedWorkflows :exec -UPDATE workflow_executions -SET status = 'pending' -WHERE workflow_executions.namespace = ? - AND workflow_executions.status = 'running' - AND workflow_executions.id NOT IN ( - SELECT resource_id - FROM leases - WHERE kind = 'workflow' AND leases.namespace = ? 
- ); - diff --git a/go/pkg/hydra/store/schema.sql b/go/pkg/hydra/store/schema.sql deleted file mode 100644 index ff2c054eb1..0000000000 --- a/go/pkg/hydra/store/schema.sql +++ /dev/null @@ -1,72 +0,0 @@ -CREATE DATABASE IF NOT EXISTS `hydra`; -USE `hydra`; - -CREATE TABLE IF NOT EXISTS workflow_executions ( - id VARCHAR(255) PRIMARY KEY, - workflow_name VARCHAR(255) NOT NULL, - status ENUM('pending', 'running', 'sleeping', 'completed', 'failed') NOT NULL, - input_data LONGBLOB, -- Large binary data for workflow inputs - output_data MEDIUMBLOB, -- Medium binary data for workflow outputs - error_message TEXT, - - created_at BIGINT NOT NULL, - started_at BIGINT, - completed_at BIGINT, - max_attempts INT NOT NULL, - remaining_attempts INT NOT NULL, - next_retry_at BIGINT, - - namespace VARCHAR(255) NOT NULL, - - trigger_type ENUM('manual', 'cron', 'event', 'api'), - trigger_source VARCHAR(255), - - sleep_until BIGINT, - - trace_id VARCHAR(255), - span_id VARCHAR(255) -); - -CREATE TABLE IF NOT EXISTS workflow_steps ( - id VARCHAR(255) PRIMARY KEY, - execution_id VARCHAR(255) NOT NULL, - step_name VARCHAR(255) NOT NULL, - status ENUM('pending', 'running', 'completed', 'failed') NOT NULL, - output_data LONGBLOB, - error_message TEXT, - - started_at BIGINT, - completed_at BIGINT, - - max_attempts INT NOT NULL, - remaining_attempts INT NOT NULL, - - namespace VARCHAR(255) NOT NULL -); - --- Cron Jobs Table -CREATE TABLE IF NOT EXISTS `cron_jobs` ( - `id` varchar(255) NOT NULL, - `name` varchar(255) NOT NULL, - `cron_spec` varchar(255) NOT NULL, - `namespace` varchar(255) NOT NULL, - `workflow_name` varchar(255) DEFAULT NULL, - `enabled` tinyint(1) NOT NULL DEFAULT '1', - `created_at` bigint NOT NULL, - `updated_at` bigint NOT NULL, - `last_run_at` bigint DEFAULT NULL, - `next_run_at` bigint NOT NULL, - PRIMARY KEY (`id`), - UNIQUE KEY `cron_jobs_name_namespace_idx` (`name`,`namespace`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; - --- Leases Table 
(step kind included for GORM compatibility, though unused) -CREATE TABLE IF NOT EXISTS leases ( - resource_id VARCHAR(255) PRIMARY KEY, - kind ENUM('workflow', 'step', 'cron_job') NOT NULL, - namespace VARCHAR(255) NOT NULL, - worker_id VARCHAR(255) NOT NULL, - acquired_at BIGINT NOT NULL, - expires_at BIGINT NOT NULL, - heartbeat_at BIGINT NOT NULL -); diff --git a/go/pkg/hydra/store/sqlc.json b/go/pkg/hydra/store/sqlc.json deleted file mode 100644 index 7d861e85f8..0000000000 --- a/go/pkg/hydra/store/sqlc.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "version": "2", - "sql": [ - { - "engine": "mysql", - "queries": "queries/", - "schema": "schema.sql", - "gen": { - "go": { - "package": "store", - "out": ".", - "emit_json_tags": true, - "emit_db_tags": true, - "emit_prepared_queries": false, - "emit_interface": true, - "emit_exact_table_names": false, - "emit_empty_slices": true, - "emit_methods_with_db_argument": true, - "output_db_file_name": "delete_me", - "overrides": [ - { - "column": "workflow_executions.input_data", - "go_type": { - "type": "[]byte" - } - }, - { - "column": "workflow_executions.output_data", - "go_type": { - "type": "[]byte" - } - }, - { - "column": "workflow_steps.output_data", - "go_type": { - "type": "[]byte" - } - } - ] - } - } - } - ] -} diff --git a/go/pkg/hydra/store/workflows.sql.go b/go/pkg/hydra/store/workflows.sql.go deleted file mode 100644 index 40bbe1dfed..0000000000 --- a/go/pkg/hydra/store/workflows.sql.go +++ /dev/null @@ -1,962 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.29.0 -// source: workflows.sql - -package store - -import ( - "context" - "database/sql" -) - -const cleanupExpiredLeases = `-- name: CleanupExpiredLeases :exec -DELETE FROM leases -WHERE namespace = ? AND expires_at < ? 
-` - -type CleanupExpiredLeasesParams struct { - Namespace string `db:"namespace" json:"namespace"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` -} - -func (q *Queries) CleanupExpiredLeases(ctx context.Context, db DBTX, arg CleanupExpiredLeasesParams) error { - _, err := db.ExecContext(ctx, cleanupExpiredLeases, arg.Namespace, arg.ExpiresAt) - return err -} - -const completeWorkflow = `-- name: CompleteWorkflow :exec -UPDATE workflow_executions -SET status = 'completed', completed_at = ?, output_data = ? -WHERE id = ? AND namespace = ? -` - -type CompleteWorkflowParams struct { - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - OutputData []byte `db:"output_data" json:"output_data"` - ID string `db:"id" json:"id"` - Namespace string `db:"namespace" json:"namespace"` -} - -func (q *Queries) CompleteWorkflow(ctx context.Context, db DBTX, arg CompleteWorkflowParams) error { - _, err := db.ExecContext(ctx, completeWorkflow, - arg.CompletedAt, - arg.OutputData, - arg.ID, - arg.Namespace, - ) - return err -} - -const createCronJob = `-- name: CreateCronJob :exec -INSERT INTO cron_jobs ( - id, name, cron_spec, namespace, workflow_name, enabled, - created_at, updated_at, last_run_at, next_run_at -) VALUES ( - ?, ?, ?, ?, ?, ?, ?, ?, ?, ? -) ON DUPLICATE KEY UPDATE - cron_spec = ?, enabled = ?, updated_at = ?, next_run_at = ?, last_run_at = ?, next_run_at = ? 
-` - -type CreateCronJobParams struct { - ID string `db:"id" json:"id"` - Name string `db:"name" json:"name"` - CronSpec string `db:"cron_spec" json:"cron_spec"` - Namespace string `db:"namespace" json:"namespace"` - WorkflowName sql.NullString `db:"workflow_name" json:"workflow_name"` - Enabled bool `db:"enabled" json:"enabled"` - CreatedAt int64 `db:"created_at" json:"created_at"` - UpdatedAt int64 `db:"updated_at" json:"updated_at"` - LastRunAt sql.NullInt64 `db:"last_run_at" json:"last_run_at"` - NextRunAt int64 `db:"next_run_at" json:"next_run_at"` -} - -func (q *Queries) CreateCronJob(ctx context.Context, db DBTX, arg CreateCronJobParams) error { - _, err := db.ExecContext(ctx, createCronJob, - arg.ID, - arg.Name, - arg.CronSpec, - arg.Namespace, - arg.WorkflowName, - arg.Enabled, - arg.CreatedAt, - arg.UpdatedAt, - arg.LastRunAt, - arg.NextRunAt, - arg.CronSpec, - arg.Enabled, - arg.UpdatedAt, - arg.NextRunAt, - arg.LastRunAt, - arg.NextRunAt, - ) - return err -} - -const createLease = `-- name: CreateLease :exec -INSERT INTO leases ( - resource_id, kind, namespace, worker_id, acquired_at, expires_at, heartbeat_at -) VALUES ( - ?, ?, ?, ?, ?, ?, ? 
-) -` - -type CreateLeaseParams struct { - ResourceID string `db:"resource_id" json:"resource_id"` - Kind LeasesKind `db:"kind" json:"kind"` - Namespace string `db:"namespace" json:"namespace"` - WorkerID string `db:"worker_id" json:"worker_id"` - AcquiredAt int64 `db:"acquired_at" json:"acquired_at"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` - HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"` -} - -func (q *Queries) CreateLease(ctx context.Context, db DBTX, arg CreateLeaseParams) error { - _, err := db.ExecContext(ctx, createLease, - arg.ResourceID, - arg.Kind, - arg.Namespace, - arg.WorkerID, - arg.AcquiredAt, - arg.ExpiresAt, - arg.HeartbeatAt, - ) - return err -} - -const createStep = `-- name: CreateStep :exec -INSERT INTO workflow_steps ( - id, execution_id, step_name, status, output_data, error_message, - started_at, completed_at, max_attempts, remaining_attempts, namespace -) VALUES ( - ?, ?, ?, ?, ?, ?, - ?, ?, ?, ?, ? -) -` - -type CreateStepParams struct { - ID string `db:"id" json:"id"` - ExecutionID string `db:"execution_id" json:"execution_id"` - StepName string `db:"step_name" json:"step_name"` - Status WorkflowStepsStatus `db:"status" json:"status"` - OutputData []byte `db:"output_data" json:"output_data"` - ErrorMessage sql.NullString `db:"error_message" json:"error_message"` - StartedAt sql.NullInt64 `db:"started_at" json:"started_at"` - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - MaxAttempts int32 `db:"max_attempts" json:"max_attempts"` - RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"` - Namespace string `db:"namespace" json:"namespace"` -} - -func (q *Queries) CreateStep(ctx context.Context, db DBTX, arg CreateStepParams) error { - _, err := db.ExecContext(ctx, createStep, - arg.ID, - arg.ExecutionID, - arg.StepName, - arg.Status, - arg.OutputData, - arg.ErrorMessage, - arg.StartedAt, - arg.CompletedAt, - arg.MaxAttempts, - arg.RemainingAttempts, - arg.Namespace, - ) - 
return err -} - -const createWorkflow = `-- name: CreateWorkflow :exec -INSERT INTO workflow_executions ( - id, workflow_name, status, input_data, output_data, error_message, - created_at, started_at, completed_at, max_attempts, remaining_attempts, - next_retry_at, namespace, trigger_type, trigger_source, sleep_until, - trace_id, span_id -) VALUES ( - ?, ?, ?, ?, ?, ?, - ?, ?, ?, ?, ?, - ?, ?, ?, ?, ?, - ?, ? -) -` - -type CreateWorkflowParams struct { - ID string `db:"id" json:"id"` - WorkflowName string `db:"workflow_name" json:"workflow_name"` - Status WorkflowExecutionsStatus `db:"status" json:"status"` - InputData []byte `db:"input_data" json:"input_data"` - OutputData []byte `db:"output_data" json:"output_data"` - ErrorMessage sql.NullString `db:"error_message" json:"error_message"` - CreatedAt int64 `db:"created_at" json:"created_at"` - StartedAt sql.NullInt64 `db:"started_at" json:"started_at"` - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - MaxAttempts int32 `db:"max_attempts" json:"max_attempts"` - RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"` - NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"` - Namespace string `db:"namespace" json:"namespace"` - TriggerType NullWorkflowExecutionsTriggerType `db:"trigger_type" json:"trigger_type"` - TriggerSource sql.NullString `db:"trigger_source" json:"trigger_source"` - SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"` - TraceID sql.NullString `db:"trace_id" json:"trace_id"` - SpanID sql.NullString `db:"span_id" json:"span_id"` -} - -func (q *Queries) CreateWorkflow(ctx context.Context, db DBTX, arg CreateWorkflowParams) error { - _, err := db.ExecContext(ctx, createWorkflow, - arg.ID, - arg.WorkflowName, - arg.Status, - arg.InputData, - arg.OutputData, - arg.ErrorMessage, - arg.CreatedAt, - arg.StartedAt, - arg.CompletedAt, - arg.MaxAttempts, - arg.RemainingAttempts, - arg.NextRetryAt, - arg.Namespace, - arg.TriggerType, - 
arg.TriggerSource, - arg.SleepUntil, - arg.TraceID, - arg.SpanID, - ) - return err -} - -const getCompletedStep = `-- name: GetCompletedStep :one -SELECT id, execution_id, step_name, status, output_data, error_message, started_at, completed_at, max_attempts, remaining_attempts, namespace FROM workflow_steps -WHERE namespace = ? AND execution_id = ? AND step_name = ? AND status = 'completed' -` - -type GetCompletedStepParams struct { - Namespace string `db:"namespace" json:"namespace"` - ExecutionID string `db:"execution_id" json:"execution_id"` - StepName string `db:"step_name" json:"step_name"` -} - -func (q *Queries) GetCompletedStep(ctx context.Context, db DBTX, arg GetCompletedStepParams) (WorkflowStep, error) { - row := db.QueryRowContext(ctx, getCompletedStep, arg.Namespace, arg.ExecutionID, arg.StepName) - var i WorkflowStep - err := row.Scan( - &i.ID, - &i.ExecutionID, - &i.StepName, - &i.Status, - &i.OutputData, - &i.ErrorMessage, - &i.StartedAt, - &i.CompletedAt, - &i.MaxAttempts, - &i.RemainingAttempts, - &i.Namespace, - ) - return i, err -} - -const getCronJob = `-- name: GetCronJob :one -SELECT id, name, cron_spec, namespace, workflow_name, enabled, created_at, updated_at, last_run_at, next_run_at FROM cron_jobs -WHERE namespace = ? AND name = ? -` - -type GetCronJobParams struct { - Namespace string `db:"namespace" json:"namespace"` - Name string `db:"name" json:"name"` -} - -func (q *Queries) GetCronJob(ctx context.Context, db DBTX, arg GetCronJobParams) (CronJob, error) { - row := db.QueryRowContext(ctx, getCronJob, arg.Namespace, arg.Name) - var i CronJob - err := row.Scan( - &i.ID, - &i.Name, - &i.CronSpec, - &i.Namespace, - &i.WorkflowName, - &i.Enabled, - &i.CreatedAt, - &i.UpdatedAt, - &i.LastRunAt, - &i.NextRunAt, - ) - return i, err -} - -const getCronJobs = `-- name: GetCronJobs :many -SELECT id, name, cron_spec, namespace, workflow_name, enabled, created_at, updated_at, last_run_at, next_run_at FROM cron_jobs -WHERE namespace = ? 
AND enabled = true -` - -func (q *Queries) GetCronJobs(ctx context.Context, db DBTX, namespace string) ([]CronJob, error) { - rows, err := db.QueryContext(ctx, getCronJobs, namespace) - if err != nil { - return nil, err - } - defer rows.Close() - items := []CronJob{} - for rows.Next() { - var i CronJob - if err := rows.Scan( - &i.ID, - &i.Name, - &i.CronSpec, - &i.Namespace, - &i.WorkflowName, - &i.Enabled, - &i.CreatedAt, - &i.UpdatedAt, - &i.LastRunAt, - &i.NextRunAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const getDueCronJobs = `-- name: GetDueCronJobs :many -SELECT id, name, cron_spec, namespace, workflow_name, enabled, created_at, updated_at, last_run_at, next_run_at FROM cron_jobs -WHERE namespace = ? AND enabled = true AND next_run_at <= ? -` - -type GetDueCronJobsParams struct { - Namespace string `db:"namespace" json:"namespace"` - NextRunAt int64 `db:"next_run_at" json:"next_run_at"` -} - -func (q *Queries) GetDueCronJobs(ctx context.Context, db DBTX, arg GetDueCronJobsParams) ([]CronJob, error) { - rows, err := db.QueryContext(ctx, getDueCronJobs, arg.Namespace, arg.NextRunAt) - if err != nil { - return nil, err - } - defer rows.Close() - items := []CronJob{} - for rows.Next() { - var i CronJob - if err := rows.Scan( - &i.ID, - &i.Name, - &i.CronSpec, - &i.Namespace, - &i.WorkflowName, - &i.Enabled, - &i.CreatedAt, - &i.UpdatedAt, - &i.LastRunAt, - &i.NextRunAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const getLease = `-- name: GetLease :one -SELECT resource_id, kind, namespace, worker_id, acquired_at, expires_at, heartbeat_at FROM leases -WHERE resource_id = ? AND kind = ? 
-` - -type GetLeaseParams struct { - ResourceID string `db:"resource_id" json:"resource_id"` - Kind LeasesKind `db:"kind" json:"kind"` -} - -func (q *Queries) GetLease(ctx context.Context, db DBTX, arg GetLeaseParams) (Lease, error) { - row := db.QueryRowContext(ctx, getLease, arg.ResourceID, arg.Kind) - var i Lease - err := row.Scan( - &i.ResourceID, - &i.Kind, - &i.Namespace, - &i.WorkerID, - &i.AcquiredAt, - &i.ExpiresAt, - &i.HeartbeatAt, - ) - return i, err -} - -const getPendingWorkflows = `-- name: GetPendingWorkflows :many -SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions -WHERE namespace = ? - AND ( - status = 'pending' - OR (status = 'failed' AND next_retry_at <= ?) - OR (status = 'sleeping' AND sleep_until <= ?) - ) -ORDER BY created_at ASC -LIMIT ? -` - -type GetPendingWorkflowsParams struct { - Namespace string `db:"namespace" json:"namespace"` - NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"` - SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"` - Limit int32 `db:"limit" json:"limit"` -} - -func (q *Queries) GetPendingWorkflows(ctx context.Context, db DBTX, arg GetPendingWorkflowsParams) ([]WorkflowExecution, error) { - rows, err := db.QueryContext(ctx, getPendingWorkflows, - arg.Namespace, - arg.NextRetryAt, - arg.SleepUntil, - arg.Limit, - ) - if err != nil { - return nil, err - } - defer rows.Close() - items := []WorkflowExecution{} - for rows.Next() { - var i WorkflowExecution - if err := rows.Scan( - &i.ID, - &i.WorkflowName, - &i.Status, - &i.InputData, - &i.OutputData, - &i.ErrorMessage, - &i.CreatedAt, - &i.StartedAt, - &i.CompletedAt, - &i.MaxAttempts, - &i.RemainingAttempts, - &i.NextRetryAt, - &i.Namespace, - &i.TriggerType, - &i.TriggerSource, - &i.SleepUntil, - &i.TraceID, - &i.SpanID, - ); err != nil { 
- return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const getPendingWorkflowsFiltered = `-- name: GetPendingWorkflowsFiltered :many -SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions -WHERE namespace = ? - AND ( - status = 'pending' - OR (status = 'failed' AND next_retry_at <= ?) - OR (status = 'sleeping' AND sleep_until <= ?) - ) - AND workflow_name IN (/*SLICE:workflow_names*/?) -ORDER BY created_at ASC -LIMIT ? -` - -type GetPendingWorkflowsFilteredParams struct { - Namespace string `db:"namespace" json:"namespace"` - NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"` - SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"` - WorkflowName string `db:"workflow_name" json:"workflow_name"` - Limit int32 `db:"limit" json:"limit"` -} - -func (q *Queries) GetPendingWorkflowsFiltered(ctx context.Context, db DBTX, arg GetPendingWorkflowsFilteredParams) ([]WorkflowExecution, error) { - rows, err := db.QueryContext(ctx, getPendingWorkflowsFiltered, - arg.Namespace, - arg.NextRetryAt, - arg.SleepUntil, - arg.WorkflowName, - arg.Limit, - ) - if err != nil { - return nil, err - } - defer rows.Close() - items := []WorkflowExecution{} - for rows.Next() { - var i WorkflowExecution - if err := rows.Scan( - &i.ID, - &i.WorkflowName, - &i.Status, - &i.InputData, - &i.OutputData, - &i.ErrorMessage, - &i.CreatedAt, - &i.StartedAt, - &i.CompletedAt, - &i.MaxAttempts, - &i.RemainingAttempts, - &i.NextRetryAt, - &i.Namespace, - &i.TriggerType, - &i.TriggerSource, - &i.SleepUntil, - &i.TraceID, - &i.SpanID, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil 
{ - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const getSleepingWorkflows = `-- name: GetSleepingWorkflows :many -SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions -WHERE namespace = ? AND status = 'sleeping' AND sleep_until <= ? -ORDER BY sleep_until ASC -` - -type GetSleepingWorkflowsParams struct { - Namespace string `db:"namespace" json:"namespace"` - SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"` -} - -func (q *Queries) GetSleepingWorkflows(ctx context.Context, db DBTX, arg GetSleepingWorkflowsParams) ([]WorkflowExecution, error) { - rows, err := db.QueryContext(ctx, getSleepingWorkflows, arg.Namespace, arg.SleepUntil) - if err != nil { - return nil, err - } - defer rows.Close() - items := []WorkflowExecution{} - for rows.Next() { - var i WorkflowExecution - if err := rows.Scan( - &i.ID, - &i.WorkflowName, - &i.Status, - &i.InputData, - &i.OutputData, - &i.ErrorMessage, - &i.CreatedAt, - &i.StartedAt, - &i.CompletedAt, - &i.MaxAttempts, - &i.RemainingAttempts, - &i.NextRetryAt, - &i.Namespace, - &i.TriggerType, - &i.TriggerSource, - &i.SleepUntil, - &i.TraceID, - &i.SpanID, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const getStep = `-- name: GetStep :one -SELECT id, execution_id, step_name, status, output_data, error_message, started_at, completed_at, max_attempts, remaining_attempts, namespace FROM workflow_steps -WHERE namespace = ? AND execution_id = ? AND step_name = ? 
-` - -type GetStepParams struct { - Namespace string `db:"namespace" json:"namespace"` - ExecutionID string `db:"execution_id" json:"execution_id"` - StepName string `db:"step_name" json:"step_name"` -} - -func (q *Queries) GetStep(ctx context.Context, db DBTX, arg GetStepParams) (WorkflowStep, error) { - row := db.QueryRowContext(ctx, getStep, arg.Namespace, arg.ExecutionID, arg.StepName) - var i WorkflowStep - err := row.Scan( - &i.ID, - &i.ExecutionID, - &i.StepName, - &i.Status, - &i.OutputData, - &i.ErrorMessage, - &i.StartedAt, - &i.CompletedAt, - &i.MaxAttempts, - &i.RemainingAttempts, - &i.Namespace, - ) - return i, err -} - -const getWorkflow = `-- name: GetWorkflow :one -SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions -WHERE id = ? AND namespace = ? -` - -type GetWorkflowParams struct { - ID string `db:"id" json:"id"` - Namespace string `db:"namespace" json:"namespace"` -} - -func (q *Queries) GetWorkflow(ctx context.Context, db DBTX, arg GetWorkflowParams) (WorkflowExecution, error) { - row := db.QueryRowContext(ctx, getWorkflow, arg.ID, arg.Namespace) - var i WorkflowExecution - err := row.Scan( - &i.ID, - &i.WorkflowName, - &i.Status, - &i.InputData, - &i.OutputData, - &i.ErrorMessage, - &i.CreatedAt, - &i.StartedAt, - &i.CompletedAt, - &i.MaxAttempts, - &i.RemainingAttempts, - &i.NextRetryAt, - &i.Namespace, - &i.TriggerType, - &i.TriggerSource, - &i.SleepUntil, - &i.TraceID, - &i.SpanID, - ) - return i, err -} - -const heartbeatLease = `-- name: HeartbeatLease :exec -UPDATE leases -SET heartbeat_at = ?, expires_at = ? -WHERE resource_id = ? AND worker_id = ? 
-` - -type HeartbeatLeaseParams struct { - HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` - ResourceID string `db:"resource_id" json:"resource_id"` - WorkerID string `db:"worker_id" json:"worker_id"` -} - -func (q *Queries) HeartbeatLease(ctx context.Context, db DBTX, arg HeartbeatLeaseParams) error { - _, err := db.ExecContext(ctx, heartbeatLease, - arg.HeartbeatAt, - arg.ExpiresAt, - arg.ResourceID, - arg.WorkerID, - ) - return err -} - -const releaseLease = `-- name: ReleaseLease :exec -DELETE FROM leases -WHERE resource_id = ? AND worker_id = ? -` - -type ReleaseLeaseParams struct { - ResourceID string `db:"resource_id" json:"resource_id"` - WorkerID string `db:"worker_id" json:"worker_id"` -} - -func (q *Queries) ReleaseLease(ctx context.Context, db DBTX, arg ReleaseLeaseParams) error { - _, err := db.ExecContext(ctx, releaseLease, arg.ResourceID, arg.WorkerID) - return err -} - -const resetOrphanedWorkflows = `-- name: ResetOrphanedWorkflows :exec -UPDATE workflow_executions -SET status = 'pending' -WHERE workflow_executions.namespace = ? - AND workflow_executions.status = 'running' - AND workflow_executions.id NOT IN ( - SELECT resource_id - FROM leases - WHERE kind = 'workflow' AND leases.namespace = ? - ) -` - -type ResetOrphanedWorkflowsParams struct { - Namespace string `db:"namespace" json:"namespace"` - Namespace_2 string `db:"namespace_2" json:"namespace_2"` -} - -func (q *Queries) ResetOrphanedWorkflows(ctx context.Context, db DBTX, arg ResetOrphanedWorkflowsParams) error { - _, err := db.ExecContext(ctx, resetOrphanedWorkflows, arg.Namespace, arg.Namespace_2) - return err -} - -const sleepWorkflow = `-- name: SleepWorkflow :exec -UPDATE workflow_executions -SET status = 'sleeping', sleep_until = ? -WHERE id = ? AND namespace = ? 
-` - -type SleepWorkflowParams struct { - SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"` - ID string `db:"id" json:"id"` - Namespace string `db:"namespace" json:"namespace"` -} - -func (q *Queries) SleepWorkflow(ctx context.Context, db DBTX, arg SleepWorkflowParams) error { - _, err := db.ExecContext(ctx, sleepWorkflow, arg.SleepUntil, arg.ID, arg.Namespace) - return err -} - -const updateCronJob = `-- name: UpdateCronJob :exec -UPDATE cron_jobs -SET cron_spec = ?, workflow_name = ?, enabled = ?, updated_at = ?, next_run_at = ? -WHERE id = ? AND namespace = ? -` - -type UpdateCronJobParams struct { - CronSpec string `db:"cron_spec" json:"cron_spec"` - WorkflowName sql.NullString `db:"workflow_name" json:"workflow_name"` - Enabled bool `db:"enabled" json:"enabled"` - UpdatedAt int64 `db:"updated_at" json:"updated_at"` - NextRunAt int64 `db:"next_run_at" json:"next_run_at"` - ID string `db:"id" json:"id"` - Namespace string `db:"namespace" json:"namespace"` -} - -func (q *Queries) UpdateCronJob(ctx context.Context, db DBTX, arg UpdateCronJobParams) error { - _, err := db.ExecContext(ctx, updateCronJob, - arg.CronSpec, - arg.WorkflowName, - arg.Enabled, - arg.UpdatedAt, - arg.NextRunAt, - arg.ID, - arg.Namespace, - ) - return err -} - -const updateCronJobLastRun = `-- name: UpdateCronJobLastRun :exec -UPDATE cron_jobs -SET last_run_at = ?, next_run_at = ?, updated_at = ? -WHERE id = ? AND namespace = ? 
-` - -type UpdateCronJobLastRunParams struct { - LastRunAt sql.NullInt64 `db:"last_run_at" json:"last_run_at"` - NextRunAt int64 `db:"next_run_at" json:"next_run_at"` - UpdatedAt int64 `db:"updated_at" json:"updated_at"` - ID string `db:"id" json:"id"` - Namespace string `db:"namespace" json:"namespace"` -} - -func (q *Queries) UpdateCronJobLastRun(ctx context.Context, db DBTX, arg UpdateCronJobLastRunParams) error { - _, err := db.ExecContext(ctx, updateCronJobLastRun, - arg.LastRunAt, - arg.NextRunAt, - arg.UpdatedAt, - arg.ID, - arg.Namespace, - ) - return err -} - -const updateLease = `-- name: UpdateLease :exec -UPDATE leases -SET worker_id = ?, acquired_at = ?, expires_at = ?, heartbeat_at = ? -WHERE resource_id = ? AND kind = ? AND expires_at < ? -` - -type UpdateLeaseParams struct { - WorkerID string `db:"worker_id" json:"worker_id"` - AcquiredAt int64 `db:"acquired_at" json:"acquired_at"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` - HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"` - ResourceID string `db:"resource_id" json:"resource_id"` - Kind LeasesKind `db:"kind" json:"kind"` - ExpiresAt_2 int64 `db:"expires_at_2" json:"expires_at_2"` -} - -func (q *Queries) UpdateLease(ctx context.Context, db DBTX, arg UpdateLeaseParams) error { - _, err := db.ExecContext(ctx, updateLease, - arg.WorkerID, - arg.AcquiredAt, - arg.ExpiresAt, - arg.HeartbeatAt, - arg.ResourceID, - arg.Kind, - arg.ExpiresAt_2, - ) - return err -} - -const updateStepStatus = `-- name: UpdateStepStatus :exec -UPDATE workflow_steps -SET status = ?, completed_at = ?, output_data = ?, error_message = ? -WHERE namespace = ? AND execution_id = ? AND step_name = ? 
-` - -type UpdateStepStatusParams struct { - Status WorkflowStepsStatus `db:"status" json:"status"` - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - OutputData []byte `db:"output_data" json:"output_data"` - ErrorMessage sql.NullString `db:"error_message" json:"error_message"` - Namespace string `db:"namespace" json:"namespace"` - ExecutionID string `db:"execution_id" json:"execution_id"` - StepName string `db:"step_name" json:"step_name"` -} - -func (q *Queries) UpdateStepStatus(ctx context.Context, db DBTX, arg UpdateStepStatusParams) error { - _, err := db.ExecContext(ctx, updateStepStatus, - arg.Status, - arg.CompletedAt, - arg.OutputData, - arg.ErrorMessage, - arg.Namespace, - arg.ExecutionID, - arg.StepName, - ) - return err -} - -const updateStepStatusWithLease = `-- name: UpdateStepStatusWithLease :exec -UPDATE workflow_steps -SET status = ?, completed_at = ?, output_data = ?, error_message = ? -WHERE workflow_steps.namespace = ? AND execution_id = ? AND step_name = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? 
- ) -` - -type UpdateStepStatusWithLeaseParams struct { - Status WorkflowStepsStatus `db:"status" json:"status"` - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - OutputData []byte `db:"output_data" json:"output_data"` - ErrorMessage sql.NullString `db:"error_message" json:"error_message"` - Namespace string `db:"namespace" json:"namespace"` - ExecutionID string `db:"execution_id" json:"execution_id"` - StepName string `db:"step_name" json:"step_name"` - ResourceID string `db:"resource_id" json:"resource_id"` - WorkerID string `db:"worker_id" json:"worker_id"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` -} - -func (q *Queries) UpdateStepStatusWithLease(ctx context.Context, db DBTX, arg UpdateStepStatusWithLeaseParams) error { - _, err := db.ExecContext(ctx, updateStepStatusWithLease, - arg.Status, - arg.CompletedAt, - arg.OutputData, - arg.ErrorMessage, - arg.Namespace, - arg.ExecutionID, - arg.StepName, - arg.ResourceID, - arg.WorkerID, - arg.ExpiresAt, - ) - return err -} - -const updateWorkflowFields = `-- name: UpdateWorkflowFields :exec -UPDATE workflow_executions -SET - status = COALESCE(?, status), - error_message = COALESCE(?, error_message), - completed_at = COALESCE(?, completed_at), - started_at = COALESCE(?, started_at), - output_data = COALESCE(?, output_data), - remaining_attempts = COALESCE(?, remaining_attempts), - next_retry_at = COALESCE(?, next_retry_at), - sleep_until = COALESCE(?, sleep_until) -WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? 
- ) -` - -type UpdateWorkflowFieldsParams struct { - Status WorkflowExecutionsStatus `db:"status" json:"status"` - ErrorMessage sql.NullString `db:"error_message" json:"error_message"` - CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"` - StartedAt sql.NullInt64 `db:"started_at" json:"started_at"` - OutputData []byte `db:"output_data" json:"output_data"` - RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"` - NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"` - SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"` - ID string `db:"id" json:"id"` - Namespace string `db:"namespace" json:"namespace"` - ResourceID string `db:"resource_id" json:"resource_id"` - WorkerID string `db:"worker_id" json:"worker_id"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` -} - -func (q *Queries) UpdateWorkflowFields(ctx context.Context, db DBTX, arg UpdateWorkflowFieldsParams) error { - _, err := db.ExecContext(ctx, updateWorkflowFields, - arg.Status, - arg.ErrorMessage, - arg.CompletedAt, - arg.StartedAt, - arg.OutputData, - arg.RemainingAttempts, - arg.NextRetryAt, - arg.SleepUntil, - arg.ID, - arg.Namespace, - arg.ResourceID, - arg.WorkerID, - arg.ExpiresAt, - ) - return err -} - -const updateWorkflowToRunning = `-- name: UpdateWorkflowToRunning :exec -UPDATE workflow_executions -SET status = 'running', - started_at = CASE WHEN started_at IS NULL THEN ? ELSE started_at END, - sleep_until = NULL -WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? 
- ) -` - -type UpdateWorkflowToRunningParams struct { - StartedAt sql.NullInt64 `db:"started_at" json:"started_at"` - ID string `db:"id" json:"id"` - Namespace string `db:"namespace" json:"namespace"` - ResourceID string `db:"resource_id" json:"resource_id"` - WorkerID string `db:"worker_id" json:"worker_id"` - ExpiresAt int64 `db:"expires_at" json:"expires_at"` -} - -func (q *Queries) UpdateWorkflowToRunning(ctx context.Context, db DBTX, arg UpdateWorkflowToRunningParams) error { - _, err := db.ExecContext(ctx, updateWorkflowToRunning, - arg.StartedAt, - arg.ID, - arg.Namespace, - arg.ResourceID, - arg.WorkerID, - arg.ExpiresAt, - ) - return err -} diff --git a/go/pkg/hydra/store_coverage_test.go b/go/pkg/hydra/store_coverage_test.go deleted file mode 100644 index e53a8181c2..0000000000 --- a/go/pkg/hydra/store_coverage_test.go +++ /dev/null @@ -1,184 +0,0 @@ -package hydra - -import ( - "context" - "database/sql" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/uid" -) - -func TestSQLCQueryCoverage(t *testing.T) { - // Test that basic SQLC Query operations work - engine := newTestEngine(t) - ctx := context.Background() - namespace := engine.GetNamespace() - - t.Run("WorkflowOperations", func(t *testing.T) { - // Test CreateWorkflow using Query pattern - workflowID := uid.New(uid.WorkflowPrefix) - err := store.Query.CreateWorkflow(ctx, engine.GetDB(), store.CreateWorkflowParams{ - ID: workflowID, - WorkflowName: "test-workflow", - Status: store.WorkflowExecutionsStatusPending, - InputData: []byte(`{"test": "data"}`), - OutputData: []byte{}, - ErrorMessage: sql.NullString{Valid: false}, - CreatedAt: time.Now().UnixMilli(), - StartedAt: sql.NullInt64{Valid: false}, - CompletedAt: sql.NullInt64{Valid: false}, - MaxAttempts: 3, - RemainingAttempts: 3, - NextRetryAt: sql.NullInt64{Valid: false}, - Namespace: namespace, - TriggerType: 
store.NullWorkflowExecutionsTriggerType{Valid: false}, - TriggerSource: sql.NullString{Valid: false}, - SleepUntil: sql.NullInt64{Valid: false}, - TraceID: sql.NullString{Valid: false}, - SpanID: sql.NullString{Valid: false}, - }) - require.NoError(t, err, "CreateWorkflow should work") - - // Test GetWorkflow using Query pattern - workflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: workflowID, - Namespace: namespace, - }) - require.NoError(t, err, "GetWorkflow should work") - require.Equal(t, workflowID, workflow.ID) - require.Equal(t, "test-workflow", workflow.WorkflowName) - require.Equal(t, store.WorkflowExecutionsStatusPending, workflow.Status) - - // GetAllWorkflows was removed - test individual workflow retrieval instead - retrievedWorkflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: workflowID, - Namespace: namespace, - }) - require.NoError(t, err, "GetWorkflow should work") - require.Equal(t, workflowID, retrievedWorkflow.ID) - - // Test GetPendingWorkflows using Query pattern - pendingWorkflows, err := store.Query.GetPendingWorkflows(ctx, engine.GetDB(), store.GetPendingWorkflowsParams{ - Namespace: namespace, - NextRetryAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - SleepUntil: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - Limit: 10, - }) - require.NoError(t, err, "GetPendingWorkflows should work") - require.Len(t, pendingWorkflows, 1) - - // Test UpdateWorkflowFields (replacement for UpdateWorkflowStatus) - // Note: This will fail due to lease validation, which is expected in tests - now := time.Now().UnixMilli() - _ = store.Query.UpdateWorkflowFields(ctx, engine.GetDB(), store.UpdateWorkflowFieldsParams{ - Status: store.WorkflowExecutionsStatusRunning, - ErrorMessage: sql.NullString{Valid: false}, - CompletedAt: sql.NullInt64{Valid: false}, - StartedAt: sql.NullInt64{Valid: false}, - OutputData: nil, - RemainingAttempts: 0, - NextRetryAt: 
sql.NullInt64{Valid: false}, - SleepUntil: sql.NullInt64{Valid: false}, - ID: workflowID, - Namespace: namespace, - ResourceID: workflowID, - WorkerID: "test-worker", - ExpiresAt: now, - }) - // Ignore error due to missing lease - - // Test CompleteWorkflow using restored simple query - err = store.Query.CompleteWorkflow(ctx, engine.GetDB(), store.CompleteWorkflowParams{ - CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - OutputData: []byte(`{"result": "success"}`), - ID: workflowID, - Namespace: namespace, - }) - require.NoError(t, err, "CompleteWorkflow should work") - - // Verify final state - finalWorkflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: workflowID, - Namespace: namespace, - }) - require.NoError(t, err) - require.Equal(t, store.WorkflowExecutionsStatusCompleted, finalWorkflow.Status) - }) - - t.Run("StepOperations", func(t *testing.T) { - // Create a workflow first - workflowID := uid.New(uid.WorkflowPrefix) - err := store.Query.CreateWorkflow(ctx, engine.GetDB(), store.CreateWorkflowParams{ - ID: workflowID, - WorkflowName: "test-workflow-with-steps", - Status: store.WorkflowExecutionsStatusRunning, - InputData: []byte(`{"test": "data"}`), - OutputData: []byte{}, - ErrorMessage: sql.NullString{Valid: false}, - CreatedAt: time.Now().UnixMilli(), - StartedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - CompletedAt: sql.NullInt64{Valid: false}, - MaxAttempts: 3, - RemainingAttempts: 3, - NextRetryAt: sql.NullInt64{Valid: false}, - Namespace: namespace, - TriggerType: store.NullWorkflowExecutionsTriggerType{Valid: false}, - TriggerSource: sql.NullString{Valid: false}, - SleepUntil: sql.NullInt64{Valid: false}, - TraceID: sql.NullString{Valid: false}, - SpanID: sql.NullString{Valid: false}, - }) - require.NoError(t, err) - - // Test CreateStep using Query pattern - stepID := uid.New(uid.StepPrefix) - err = store.Query.CreateStep(ctx, engine.GetDB(), store.CreateStepParams{ 
- ID: stepID, - ExecutionID: workflowID, - StepName: "test-step", - Status: store.WorkflowStepsStatusRunning, - OutputData: []byte{}, - ErrorMessage: sql.NullString{Valid: false}, - StartedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - CompletedAt: sql.NullInt64{Valid: false}, - MaxAttempts: 3, - RemainingAttempts: 3, - Namespace: namespace, - }) - require.NoError(t, err, "CreateStep should work") - - // Test GetStep using Query pattern - step, err := store.Query.GetStep(ctx, engine.GetDB(), store.GetStepParams{ - Namespace: namespace, - ExecutionID: workflowID, - StepName: "test-step", - }) - require.NoError(t, err, "GetStep should work") - require.Equal(t, stepID, step.ID) - require.Equal(t, "test-step", step.StepName) - - // Test UpdateStepStatus using restored simple query - err = store.Query.UpdateStepStatus(ctx, engine.GetDB(), store.UpdateStepStatusParams{ - Status: store.WorkflowStepsStatusCompleted, - CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - OutputData: []byte(`{"step_result": "success"}`), - ErrorMessage: sql.NullString{Valid: false}, - Namespace: namespace, - ExecutionID: workflowID, - StepName: "test-step", - }) - require.NoError(t, err, "UpdateStepStatus should work") - - // GetAllSteps was removed - test individual step retrieval instead - retrievedStep, err := store.Query.GetStep(ctx, engine.GetDB(), store.GetStepParams{ - Namespace: namespace, - ExecutionID: workflowID, - StepName: "test-step", - }) - require.NoError(t, err, "GetStep should work") - require.Equal(t, "test-step", retrievedStep.StepName) - }) -} diff --git a/go/pkg/hydra/test_helpers.go b/go/pkg/hydra/test_helpers.go deleted file mode 100644 index 183fec0f89..0000000000 --- a/go/pkg/hydra/test_helpers.go +++ /dev/null @@ -1,76 +0,0 @@ -package hydra - -import ( - "context" - "database/sql" - "fmt" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" - 
"github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/otel/logging" - "github.com/unkeyed/unkey/go/pkg/testutil/containers" - "github.com/unkeyed/unkey/go/pkg/uid" -) - -// newTestEngineWithClock creates a test engine with the specified clock -func newTestEngineWithClock(t *testing.T, clk clock.Clock) *Engine { - t.Helper() - - // Use testcontainers for MySQL - mysqlCfg := containers.MySQL(t) - mysqlCfg.DBName = "hydra" - hydraDsn := mysqlCfg.FormatDSN() - - // Load the hydra schema into the database - db, err := sql.Open("mysql", hydraDsn) - require.NoError(t, err) - defer db.Close() - - // Create a unique namespace for this test to avoid data pollution - testNamespace := fmt.Sprintf("test_%s_%s", t.Name(), uid.New(uid.Prefix("test"))) - - // Create the engine with the properly configured database - engine, err := New(Config{ - DSN: hydraDsn, - Namespace: testNamespace, - Clock: clk, - Logger: logging.NewNoop(), - Marshaller: NewJSONMarshaller(), - }) - if err != nil { - t.Fatalf("Failed to create test engine: %v", err) - } - - return engine -} - -// newTestEngine creates a test engine with default clock -func newTestEngine(t *testing.T) *Engine { - return newTestEngineWithClock(t, clock.New()) -} - -// waitForWorkflowCompletion waits for a workflow to complete and returns the final workflow state -func waitForWorkflowCompletion(t *testing.T, engine *Engine, workflowID string, timeout time.Duration) *store.WorkflowExecution { - t.Helper() - - var workflow store.WorkflowExecution - var err error - - require.Eventually(t, func() bool { - workflow, err = store.Query.GetWorkflow(context.Background(), engine.GetDB(), store.GetWorkflowParams{ - ID: workflowID, - Namespace: engine.GetNamespace(), - }) - if err != nil { - return false - } - return workflow.Status == store.WorkflowExecutionsStatusCompleted || - workflow.Status == store.WorkflowExecutionsStatusFailed - }, timeout, 100*time.Millisecond, "Workflow should complete within timeout") - 
- require.NoError(t, err) - return &workflow -} diff --git a/go/pkg/hydra/testharness/events.go b/go/pkg/hydra/testharness/events.go deleted file mode 100644 index 922286d6af..0000000000 --- a/go/pkg/hydra/testharness/events.go +++ /dev/null @@ -1,179 +0,0 @@ -package testharness - -import ( - "sync" - "time" -) - -// WorkflowContext interface for extracting metadata (avoid import cycle) -type WorkflowContext interface { - ExecutionID() string - WorkflowName() string -} - -// EventType represents the type of event that occurred -type EventType string - -const ( - WorkflowStarted EventType = "workflow_started" - WorkflowCompleted EventType = "workflow_completed" - WorkflowFailed EventType = "workflow_failed" - StepExecuting EventType = "step_executing" - StepExecuted EventType = "step_executed" - StepFailed EventType = "step_failed" -) - -// EventRecord represents something that happened during test execution -type EventRecord struct { - Type EventType `json:"type"` - Message string `json:"message"` - Timestamp time.Time `json:"timestamp"` - Data map[string]interface{} `json:"data"` -} - -// EventCollector captures events during test execution -type EventCollector struct { - mu sync.RWMutex - events []EventRecord -} - -// NewEventCollector creates a new event collector -func NewEventCollector() *EventCollector { - return &EventCollector{ - mu: sync.RWMutex{}, - events: make([]EventRecord, 0), - } -} - -// Emit records an event with workflow context metadata automatically included -func (e *EventCollector) Emit(ctx WorkflowContext, eventType EventType, message string, extraData ...interface{}) { - e.mu.Lock() - defer e.mu.Unlock() - - // Start with context metadata - data := map[string]interface{}{ - "execution_id": ctx.ExecutionID(), - "workflow_name": ctx.WorkflowName(), - } - - // Add extra data as key-value pairs - for i := 0; i < len(extraData); i += 2 { - if i+1 < len(extraData) { - if key, ok := extraData[i].(string); ok { - data[key] = extraData[i+1] - } - } 
- } - - event := EventRecord{ - Type: eventType, - Message: message, - Timestamp: time.Now(), - Data: data, - } - - e.events = append(e.events, event) -} - -// Events returns all collected events -func (e *EventCollector) Events() []EventRecord { - e.mu.RLock() - defer e.mu.RUnlock() - - // Return a copy to prevent race conditions - events := make([]EventRecord, len(e.events)) - copy(events, e.events) - return events -} - -// Filter returns events that match the given criteria -func (e *EventCollector) Filter(eventType EventType) []EventRecord { - e.mu.RLock() - defer e.mu.RUnlock() - - var filtered []EventRecord - for _, event := range e.events { - if event.Type == eventType { - filtered = append(filtered, event) - } - } - return filtered -} - -// FilterWithData returns events that match the type and have specific data values -func (e *EventCollector) FilterWithData(eventType EventType, key string, value interface{}) []EventRecord { - e.mu.RLock() - defer e.mu.RUnlock() - - var filtered []EventRecord - for _, event := range e.events { - if event.Type == eventType { - if eventValue, exists := event.Data[key]; exists && eventValue == value { - filtered = append(filtered, event) - } - } - } - return filtered -} - -// Count returns the number of events of a specific type -func (e *EventCollector) Count(eventType EventType) int { - return len(e.Filter(eventType)) -} - -// CountWithData returns the number of events that match type and data criteria -func (e *EventCollector) CountWithData(eventType EventType, key string, value interface{}) int { - return len(e.FilterWithData(eventType, key, value)) -} - -// Clear removes all collected events -func (e *EventCollector) Clear() { - e.mu.Lock() - defer e.mu.Unlock() - e.events = e.events[:0] -} - -// GetLatest returns the most recent event of a given type, or nil if none found -func (e *EventCollector) GetLatest(eventType EventType) *EventRecord { - events := e.Filter(eventType) - if len(events) == 0 { - return nil - } - 
return &events[len(events)-1] -} - -// GetFirst returns the first event of a given type, or nil if none found -func (e *EventCollector) GetFirst(eventType EventType) *EventRecord { - events := e.Filter(eventType) - if len(events) == 0 { - return nil - } - return &events[0] -} - -// EventsBetween returns events that occurred between start and end times (inclusive) -func (e *EventCollector) EventsBetween(start, end time.Time) []EventRecord { - e.mu.RLock() - defer e.mu.RUnlock() - - var filtered []EventRecord - for _, event := range e.events { - if (event.Timestamp.Equal(start) || event.Timestamp.After(start)) && - (event.Timestamp.Equal(end) || event.Timestamp.Before(end)) { - filtered = append(filtered, event) - } - } - return filtered -} - -// Summary returns a summary of all event types and their counts -func (e *EventCollector) Summary() map[string]int { - e.mu.RLock() - defer e.mu.RUnlock() - - summary := make(map[string]int) - for _, event := range e.events { - summary[string(event.Type)]++ - } - return summary -} diff --git a/go/pkg/hydra/worker.go b/go/pkg/hydra/worker.go deleted file mode 100644 index b0650d446d..0000000000 --- a/go/pkg/hydra/worker.go +++ /dev/null @@ -1,1003 +0,0 @@ -package hydra - -import ( - "context" - "database/sql" - "fmt" - "strconv" - "sync" - "time" - - "github.com/unkeyed/unkey/go/pkg/circuitbreaker" - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/db" - "github.com/unkeyed/unkey/go/pkg/hydra/metrics" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/otel/tracing" - "github.com/unkeyed/unkey/go/pkg/uid" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/trace" -) - -// Worker represents a workflow worker that can start, run, and shutdown. 
-// -// Workers are responsible for: -// - Polling the database for pending workflows -// - Acquiring exclusive leases on workflows to prevent duplicate execution -// - Executing workflow logic by calling registered workflow handlers -// - Sending periodic heartbeats to maintain lease ownership -// - Processing scheduled cron jobs -// - Recording metrics for observability -// -// Workers are designed to be run as long-lived processes and can safely -// handle network failures, database outages, and graceful shutdowns. -type Worker interface { - // Start begins the worker's main execution loop. - // This method blocks until the context is cancelled or an error occurs. - Start(ctx context.Context) error - - // Shutdown gracefully stops the worker and waits for active workflows to complete. - // This method should be called during application shutdown to ensure clean termination. - Shutdown(ctx context.Context) error -} - -// WorkerConfig holds the configuration for a worker instance. -// -// All fields are optional and will use sensible defaults if not specified. -type WorkerConfig struct { - // WorkerID uniquely identifies this worker instance. - // If not provided, a random ID will be generated. - WorkerID string - - // Concurrency controls how many workflows can execute simultaneously. - // Defaults to 10 if not specified. - Concurrency int - - // PollInterval controls how frequently the worker checks for new workflows. - // Shorter intervals provide lower latency but increase database load. - // Defaults to 5 seconds if not specified. - PollInterval time.Duration - - // HeartbeatInterval controls how frequently the worker sends lease heartbeats. - // This should be significantly shorter than ClaimTimeout to prevent lease expiration. - // Defaults to 30 seconds if not specified. - HeartbeatInterval time.Duration - - // ClaimTimeout controls how long a worker can hold a workflow lease. 
- // Expired leases are automatically released, allowing other workers to take over. - // Defaults to 5 minutes if not specified. - ClaimTimeout time.Duration - - // CronInterval controls how frequently the worker checks for due cron jobs. - // Defaults to 1 minute if not specified. - CronInterval time.Duration -} - -type worker struct { - engine *Engine - config WorkerConfig - workflows map[string]Workflow[any] - clock clock.Clock - shutdownC chan struct{} - doneC chan struct{} - wg sync.WaitGroup - activeLeases map[string]bool // Track workflow IDs we have leases for - activeLeasesM sync.RWMutex // Protect the activeLeases map - queryCircuitBreaker circuitbreaker.CircuitBreaker[[]store.WorkflowExecution] // Protect query operations - leaseCircuitBreaker circuitbreaker.CircuitBreaker[any] // Protect lease operations - workflowQueue chan store.WorkflowExecution // Queue of workflows to process -} - -// NewWorker creates a new worker instance with the provided configuration. -// -// The worker will be associated with the given engine and inherit its -// namespace and storage configuration. Missing configuration values -// will be populated with sensible defaults. -// -// The worker must have workflows registered using RegisterWorkflow() -// before calling Start(). -// -// Example: -// -// worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{ -// WorkerID: "worker-1", -// Concurrency: 20, -// PollInterval: 100 * time.Millisecond, -// HeartbeatInterval: 30 * time.Second, -// ClaimTimeout: 5 * time.Minute, -// }) -// if err != nil { -// return err -// } -// -// The worker includes built-in circuit breakers to protect against -// database overload and automatic retry logic for transient failures. 
-func NewWorker(e *Engine, config WorkerConfig) (Worker, error) { - if config.WorkerID == "" { - config.WorkerID = uid.New(uid.WorkerPrefix) - } - if config.Concurrency <= 0 { - config.Concurrency = 10 - } - if config.PollInterval <= 0 { - config.PollInterval = 5 * time.Second - } - if config.HeartbeatInterval <= 0 { - config.HeartbeatInterval = 30 * time.Second - } - if config.ClaimTimeout <= 0 { - config.ClaimTimeout = 5 * time.Minute - } - if config.CronInterval <= 0 { - config.CronInterval = 1 * time.Minute - } - - // Initialize circuit breakers for different database operations - queryCircuitBreaker := circuitbreaker.New[[]store.WorkflowExecution]("hydra-query") - leaseCircuitBreaker := circuitbreaker.New[any]("hydra-lease") - - // Create workflow queue with capacity based on concurrency - queueSize := config.Concurrency * 10 - if queueSize < 50 { - queueSize = 50 // Minimum queue size - } - - worker := &worker{ - engine: e, - config: config, - workflows: make(map[string]Workflow[any]), - clock: e.clock, - shutdownC: make(chan struct{}), - doneC: make(chan struct{}), - wg: sync.WaitGroup{}, - activeLeases: make(map[string]bool), - activeLeasesM: sync.RWMutex{}, - queryCircuitBreaker: queryCircuitBreaker, - leaseCircuitBreaker: leaseCircuitBreaker, - workflowQueue: make(chan store.WorkflowExecution, queueSize), - } - - return worker, nil -} - -func (w *worker) run(ctx context.Context) { - defer close(w.doneC) - - // Start workflow processors - for i := 0; i < w.config.Concurrency; i++ { - w.wg.Add(1) - go w.processWorkflows(ctx) - } - - w.wg.Add(4) - go w.pollForWorkflows(ctx) - go w.sendHeartbeats(ctx) - go w.cleanupExpiredLeases(ctx) - go w.processCronJobs(ctx) - - select { - case <-w.shutdownC: - case <-ctx.Done(): - } - - // Don't close the queue immediately - let processors drain it first - w.wg.Wait() -} - -func (w *worker) pollForWorkflows(ctx context.Context) { - defer w.wg.Done() - - ticker := w.clock.NewTicker(w.config.PollInterval) - defer 
ticker.Stop() - tickerC := ticker.C() - - for { - select { - case <-tickerC: - w.pollOnce(ctx) - - case <-w.shutdownC: - return - - case <-ctx.Done(): - return - } - } -} - -func (w *worker) pollOnce(ctx context.Context) { - workflowNames := make([]string, 0, len(w.workflows)) - for name := range w.workflows { - workflowNames = append(workflowNames, name) - } - - // Use a more conservative fetch limit to reduce contention - fetchLimit := w.config.Concurrency * 2 // Fetch less to reduce contention - if fetchLimit < 10 { - fetchLimit = 10 // Minimum fetch size - } - if fetchLimit > 1000 { - fetchLimit = 1000 // Maximum reasonable fetch size - } - - // Convert to int32 safely for gosec - using string conversion to avoid overflow warning - fetchLimit32, _ := strconv.ParseInt(strconv.Itoa(fetchLimit), 10, 32) - - workflows, err := w.queryCircuitBreaker.Do(ctx, func(ctx context.Context) ([]store.WorkflowExecution, error) { - // Use new Query pattern - now := time.Now().UnixMilli() - var workflows []store.WorkflowExecution - var err error - - if len(workflowNames) > 0 { - // Use filtered query - for now just use the first workflow name - // Multiple workflow names support requires SQLC query enhancement - workflows, err = store.Query.GetPendingWorkflowsFiltered(ctx, w.engine.GetDB(), store.GetPendingWorkflowsFilteredParams{ - Namespace: w.engine.namespace, - NextRetryAt: sql.NullInt64{Int64: now, Valid: true}, - SleepUntil: sql.NullInt64{Int64: now, Valid: true}, - WorkflowName: workflowNames[0], - Limit: int32(fetchLimit32), //nolint:gosec // G115: fetchLimit is bounded to [10, 1000] - }) - } else { - workflows, err = store.Query.GetPendingWorkflows(ctx, w.engine.GetDB(), store.GetPendingWorkflowsParams{ - Namespace: w.engine.namespace, - NextRetryAt: sql.NullInt64{Int64: now, Valid: true}, - SleepUntil: sql.NullInt64{Int64: now, Valid: true}, - Limit: int32(fetchLimit32), //nolint:gosec // G115: fetchLimit is bounded to [10, 1000] - }) - } - - if err != nil { - return 
nil, err - } - - // Return store types directly (no conversion needed) - return workflows, nil - }) - - // Record polling metrics - if err != nil { - metrics.WorkerPollsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, "error").Inc() - return - } - - // Record successful poll with found work status - status := "no_work" - if len(workflows) > 0 { - status = "found_work" - } - metrics.WorkerPollsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, status).Inc() - - // Queue workflows - let polling goroutine block if needed - for _, workflow := range workflows { - w.workflowQueue <- workflow - } -} - -func (w *worker) processWorkflows(ctx context.Context) { - defer w.wg.Done() - - for { - select { - case workflow := <-w.workflowQueue: - // Try to acquire lease using new Query pattern with transaction - err := w.acquireWorkflowLease(ctx, workflow.ID, w.config.WorkerID) - if err != nil { - // Another worker got it or error, skip this workflow - metrics.LeaseAcquisitionsTotal.WithLabelValues(w.config.WorkerID, "workflow", "failed").Inc() - continue - } - - // Record successful lease acquisition - metrics.LeaseAcquisitionsTotal.WithLabelValues(w.config.WorkerID, "workflow", "success").Inc() - - // Track this lease for heartbeats - w.addActiveLease(workflow.ID) - - // Update active workflows gauge - metrics.WorkflowsActive.WithLabelValues(w.engine.namespace, w.config.WorkerID).Inc() - - // Execute the workflow - w.executeWorkflow(ctx, &workflow) - - // Release the lease and stop tracking it - // Use new Query pattern - if err := store.Query.ReleaseLease(ctx, w.engine.GetDB(), store.ReleaseLeaseParams{ - ResourceID: workflow.ID, - WorkerID: w.config.WorkerID, - }); err != nil { - w.engine.logger.Error("Failed to release workflow lease", - "workflow_id", workflow.ID, - "worker_id", w.config.WorkerID, - "error", err.Error(), - ) - } - w.removeActiveLease(workflow.ID) - - // Update active workflows gauge - 
metrics.WorkflowsActive.WithLabelValues(w.engine.namespace, w.config.WorkerID).Dec() - - case <-w.shutdownC: - return - case <-ctx.Done(): - return - } - } -} - -func (w *worker) executeWorkflow(ctx context.Context, e *store.WorkflowExecution) { - startTime := w.clock.Now() - - // Start tracing span for workflow execution - var span trace.Span - - if e.TraceID.Valid && e.SpanID.Valid && e.TraceID.String != "" && e.SpanID.String != "" { - // Reconstruct the exact trace context from stored trace ID and span ID - traceID, traceErr := trace.TraceIDFromHex(e.TraceID.String) - spanID, spanErr := trace.SpanIDFromHex(e.SpanID.String) - - if traceErr == nil && spanErr == nil { - // Create the exact span context from the original workflow creation - originalSpanCtx := trace.NewSpanContext(trace.SpanContextConfig{ - TraceID: traceID, - SpanID: spanID, - TraceFlags: trace.FlagsSampled, - TraceState: trace.TraceState{}, - Remote: false, - }) - - // Set this context as the parent for the execution span - ctx = trace.ContextWithSpanContext(ctx, originalSpanCtx) - } - } - - ctx, span = tracing.Start(ctx, fmt.Sprintf("hydra.worker.executeWorkflow.%s", e.WorkflowName)) - defer span.End() - - spanAttributes := []attribute.KeyValue{ - attribute.String("hydra.workflow.name", e.WorkflowName), - attribute.String("hydra.execution.id", e.ID), - attribute.String("hydra.namespace", e.Namespace), - attribute.String("hydra.worker.id", w.config.WorkerID), - } - - if e.TraceID.Valid && e.TraceID.String != "" { - spanAttributes = append(spanAttributes, attribute.String("hydra.original_trace_id", e.TraceID.String)) - } - if e.SpanID.Valid && e.SpanID.String != "" { - spanAttributes = append(spanAttributes, attribute.String("hydra.original_span_id", e.SpanID.String)) - } - - span.SetAttributes(spanAttributes...) 
- - // Calculate queue time (time from creation to execution start) - queueTime := time.Duration(startTime.UnixMilli()-e.CreatedAt) * time.Millisecond - metrics.WorkflowQueueTimeSeconds.WithLabelValues(e.Namespace, e.WorkflowName).Observe(queueTime.Seconds()) - - // Update workflow to running status with lease validation - now := time.Now().UnixMilli() - err := store.Query.UpdateWorkflowToRunning(ctx, w.engine.GetDB(), store.UpdateWorkflowToRunningParams{ - StartedAt: sql.NullInt64{Int64: startTime.UnixMilli(), Valid: true}, - ID: e.ID, - Namespace: e.Namespace, - ResourceID: e.ID, - WorkerID: w.config.WorkerID, - ExpiresAt: now, - }) - if err != nil { - metrics.RecordError(e.Namespace, "worker", "status_update_failed") - tracing.RecordError(span, err) - span.SetAttributes(attribute.String("hydra.workflow.status", "failed")) - return - } - - wf, exists := w.workflows[e.WorkflowName] - if !exists { - noHandlerErr := fmt.Errorf("no handler registered for workflow %s", e.WorkflowName) - tracing.RecordError(span, noHandlerErr) - span.SetAttributes(attribute.String("hydra.workflow.status", "failed")) - - // Use lease-validated failure to ensure correctness - failureTime := w.clock.Now().UnixMilli() - result, failErr := w.engine.GetDB().ExecContext(ctx, ` - UPDATE workflow_executions - SET status = 'failed', error_message = ?, remaining_attempts = remaining_attempts - 1, completed_at = ?, next_retry_at = NULL - WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? 
- )`, - sql.NullString{String: noHandlerErr.Error(), Valid: true}, - sql.NullInt64{Int64: failureTime, Valid: true}, - e.ID, - e.Namespace, - e.ID, // resource_id for lease check - w.config.WorkerID, // worker_id for lease check - failureTime, // expires_at check - ) - if failErr != nil { - w.engine.logger.Error("Failed to mark workflow as failed", - "workflow_id", e.ID, - "workflow_name", e.WorkflowName, - "namespace", e.Namespace, - "error", failErr.Error(), - ) - } else { - // Check if the failure actually happened (lease validation) - if rowsAffected, checkErr := result.RowsAffected(); checkErr != nil { - w.engine.logger.Error("Failed to check workflow failure result", - "workflow_id", e.ID, - "error", checkErr.Error(), - ) - } else if rowsAffected == 0 { - w.engine.logger.Warn("Workflow failure failed: lease expired or invalid", - "workflow_id", e.ID, - "worker_id", w.config.WorkerID, - ) - } - } - metrics.ObserveWorkflowDuration(e.Namespace, e.WorkflowName, "failed", startTime) - metrics.WorkflowsCompletedTotal.WithLabelValues(e.Namespace, e.WorkflowName, "failed").Inc() - metrics.RecordError(e.Namespace, "worker", "no_handler_registered") - return - } - - payload := &RawPayload{Data: e.InputData} - - wctx := &workflowContext{ - ctx: ctx, // This is the traced context from the worker span - executionID: e.ID, - workflowName: e.WorkflowName, - namespace: e.Namespace, - workerID: w.config.WorkerID, - db: w.engine.GetDB(), - marshaller: w.engine.marshaller, - logger: w.engine.logger.With("execution_id", e.ID, "namespace", e.Namespace, "workflow_name", e.WorkflowName), - stepTimeout: 5 * time.Minute, // Default step timeout - stepMaxAttempts: 3, // Default step max attempts - } - - err = wf.Run(wctx, payload) - - if err != nil { - tracing.RecordError(span, err) - - if suspendErr, ok := err.(*WorkflowSuspendedError); ok { - span.SetAttributes(attribute.String("hydra.workflow.status", "suspended")) - - // Use simple sleep workflow since we have the lease - if 
sleepErr := store.Query.SleepWorkflow(ctx, w.engine.GetDB(), store.SleepWorkflowParams{ - SleepUntil: sql.NullInt64{Int64: suspendErr.ResumeTime, Valid: true}, - ID: e.ID, - Namespace: e.Namespace, - }); sleepErr != nil { - w.engine.logger.Error("Failed to suspend workflow", - "workflow_id", e.ID, - "workflow_name", e.WorkflowName, - "namespace", e.Namespace, - "resume_time", suspendErr.ResumeTime, - "error", sleepErr.Error(), - ) - } - metrics.SleepsStartedTotal.WithLabelValues(e.Namespace, e.WorkflowName).Inc() - return - } - - isFinal := e.RemainingAttempts <= 1 - span.SetAttributes(attribute.String("hydra.workflow.status", "failed")) - - // Use lease-validated failure to ensure correctness - finalFailureTime := w.clock.Now().UnixMilli() - var result sql.Result - var failErr error - - if isFinal { - // Final failure - no more retries - result, failErr = w.engine.GetDB().ExecContext(ctx, ` - UPDATE workflow_executions - SET status = 'failed', error_message = ?, remaining_attempts = remaining_attempts - 1, completed_at = ?, next_retry_at = NULL - WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - )`, - sql.NullString{String: err.Error(), Valid: true}, - sql.NullInt64{Int64: finalFailureTime, Valid: true}, - e.ID, - e.Namespace, - e.ID, // resource_id for lease check - w.config.WorkerID, // worker_id for lease check - finalFailureTime, // expires_at check - ) - } else { - // Failure with retry - calculate next retry time - nextRetryAt := w.clock.Now().Add(time.Duration(e.MaxAttempts-e.RemainingAttempts+1) * time.Second).UnixMilli() - result, failErr = w.engine.GetDB().ExecContext(ctx, ` - UPDATE workflow_executions - SET status = 'failed', error_message = ?, remaining_attempts = remaining_attempts - 1, next_retry_at = ? - WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? 
AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? - )`, - sql.NullString{String: err.Error(), Valid: true}, - sql.NullInt64{Int64: nextRetryAt, Valid: true}, - e.ID, - e.Namespace, - e.ID, // resource_id for lease check - w.config.WorkerID, // worker_id for lease check - finalFailureTime, // expires_at check - ) - } - if failErr != nil { - w.engine.logger.Error("Failed to mark workflow as failed", - "workflow_id", e.ID, - "workflow_name", e.WorkflowName, - "namespace", e.Namespace, - "is_final", isFinal, - "original_error", err.Error(), - "fail_error", failErr.Error(), - ) - } else { - // Check if the failure actually happened (lease validation) - if rowsAffected, checkErr := result.RowsAffected(); checkErr != nil { - w.engine.logger.Error("Failed to check workflow failure result", - "workflow_id", e.ID, - "error", checkErr.Error(), - ) - } else if rowsAffected == 0 { - w.engine.logger.Warn("Workflow failure failed: lease expired or invalid", - "workflow_id", e.ID, - "worker_id", w.config.WorkerID, - "is_final", isFinal, - ) - } - } - - if !isFinal { - metrics.WorkflowsRetriedTotal.WithLabelValues(e.Namespace, e.WorkflowName, fmt.Sprintf("%d", e.MaxAttempts-e.RemainingAttempts+1)).Inc() - } - - metrics.ObserveWorkflowDuration(e.Namespace, e.WorkflowName, "failed", startTime) - metrics.WorkflowsCompletedTotal.WithLabelValues(e.Namespace, e.WorkflowName, "failed").Inc() - return - } - - span.SetAttributes(attribute.String("hydra.workflow.status", "completed")) - - // Use lease-validated completion to ensure correctness - now = w.clock.Now().UnixMilli() - result, err := w.engine.GetDB().ExecContext(ctx, ` - UPDATE workflow_executions - SET status = 'completed', completed_at = ?, output_data = ? - WHERE id = ? AND workflow_executions.namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'workflow' - AND worker_id = ? AND expires_at > ? 
- )`, - sql.NullInt64{Int64: now, Valid: true}, - []byte{}, // No output data for now - e.ID, - e.Namespace, - e.ID, // resource_id for lease check - w.config.WorkerID, // worker_id for lease check - now, // expires_at check - ) - if err != nil { - tracing.RecordError(span, err) - w.engine.logger.Error("Failed to mark workflow as completed", - "workflow_id", e.ID, - "workflow_name", e.WorkflowName, - "namespace", e.Namespace, - "error", err.Error(), - ) - return - } - - // Check if the completion actually happened (lease validation) - rowsAffected, checkErr := result.RowsAffected() - if checkErr != nil { - w.engine.logger.Error("Failed to check workflow completion result", - "workflow_id", e.ID, - "error", checkErr.Error(), - ) - return - } - if rowsAffected == 0 { - w.engine.logger.Warn("Workflow completion failed: lease expired or invalid", - "workflow_id", e.ID, - "worker_id", w.config.WorkerID, - ) - return - } - - metrics.ObserveWorkflowDuration(e.Namespace, e.WorkflowName, "completed", startTime) - metrics.WorkflowsCompletedTotal.WithLabelValues(e.Namespace, e.WorkflowName, "completed").Inc() -} - -func (w *worker) sendHeartbeats(ctx context.Context) { - defer w.wg.Done() - - ticker := w.clock.NewTicker(w.config.HeartbeatInterval) - defer ticker.Stop() - tickerC := ticker.C() - - for { - select { - case <-tickerC: - w.sendHeartbeatsForActiveLeases(ctx) - - case <-w.shutdownC: - return - case <-ctx.Done(): - return - } - } -} - -// addActiveLease tracks a workflow lease for heartbeat sending -func (w *worker) addActiveLease(workflowID string) { - w.activeLeasesM.Lock() - defer w.activeLeasesM.Unlock() - w.activeLeases[workflowID] = true -} - -// removeActiveLease stops tracking a workflow lease -func (w *worker) removeActiveLease(workflowID string) { - w.activeLeasesM.Lock() - defer w.activeLeasesM.Unlock() - delete(w.activeLeases, workflowID) -} - -// sendHeartbeatsForActiveLeases sends heartbeats for all workflows this worker has leases for -func (w *worker) 
sendHeartbeatsForActiveLeases(ctx context.Context) { - w.activeLeasesM.RLock() - // Copy the map to avoid holding the lock while sending heartbeats - leaseIDs := make([]string, 0, len(w.activeLeases)) - for workflowID := range w.activeLeases { - leaseIDs = append(leaseIDs, workflowID) - } - w.activeLeasesM.RUnlock() - - // Send heartbeats for each active lease - now := w.clock.Now().UnixMilli() - newExpiresAt := now + w.config.ClaimTimeout.Milliseconds() - - for _, workflowID := range leaseIDs { - // Protect heartbeat with circuit breaker - _, err := w.leaseCircuitBreaker.Do(ctx, func(ctx context.Context) (any, error) { - // Use new Query pattern - return nil, store.Query.HeartbeatLease(ctx, w.engine.GetDB(), store.HeartbeatLeaseParams{ - HeartbeatAt: now, - ExpiresAt: newExpiresAt, - ResourceID: workflowID, - WorkerID: w.config.WorkerID, - }) - }) - if err != nil { - // Record failed heartbeat - metrics.WorkerHeartbeatsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, "failed").Inc() - continue - } - - // Record successful heartbeat - metrics.WorkerHeartbeatsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, "success").Inc() - } -} - -func (w *worker) cleanupExpiredLeases(ctx context.Context) { - defer w.wg.Done() - - ticker := w.clock.NewTicker(w.config.HeartbeatInterval * 2) // Clean up less frequently than heartbeats - defer ticker.Stop() - tickerC := ticker.C() - - for { - select { - case <-tickerC: - // Clean up expired leases first - now := w.clock.Now().UnixMilli() - err := store.Query.CleanupExpiredLeases(ctx, w.engine.GetDB(), store.CleanupExpiredLeasesParams{ - Namespace: w.engine.namespace, - ExpiresAt: now, - }) - if err != nil { - w.engine.logger.Warn("Failed to cleanup expired leases", "error", err.Error()) - } - - // Then reset orphaned workflows back to pending so they can be picked up again - err = store.Query.ResetOrphanedWorkflows(ctx, w.engine.GetDB(), store.ResetOrphanedWorkflowsParams{ - Namespace: w.engine.namespace, - 
Namespace_2: w.engine.namespace, - }) - if err != nil { - w.engine.logger.Warn("Failed to reset orphaned workflows", "error", err.Error()) - } - - case <-w.shutdownC: - return - case <-ctx.Done(): - return - } - } -} - -func (w *worker) processCronJobs(ctx context.Context) { - defer w.wg.Done() - - ticker := w.clock.NewTicker(w.config.CronInterval) - defer ticker.Stop() - tickerC := ticker.C() - - for { - select { - case <-tickerC: - w.processDueCronJobs(ctx) - - case <-w.shutdownC: - return - case <-ctx.Done(): - return - } - } -} - -func (w *worker) processDueCronJobs(ctx context.Context) { - now := w.engine.clock.Now().UnixMilli() - - dueCrons, err := store.Query.GetDueCronJobs(ctx, w.engine.GetDB(), store.GetDueCronJobsParams{ - Namespace: w.engine.namespace, - NextRunAt: now, - }) - if err != nil { - return - } - - if len(dueCrons) == 0 { - return - } - - for _, cronJob := range dueCrons { - var canHandle bool - if cronJob.WorkflowName.Valid && cronJob.WorkflowName.String != "" { - _, canHandle = w.workflows[cronJob.WorkflowName.String] - } else { - _, canHandle = w.engine.cronHandlers[cronJob.Name] - } - - if !canHandle { - continue - } - - err := store.Query.CreateLease(ctx, w.engine.GetDB(), store.CreateLeaseParams{ - ResourceID: cronJob.ID, - Kind: store.LeasesKindCronJob, - Namespace: w.engine.namespace, - WorkerID: w.config.WorkerID, - AcquiredAt: now, - ExpiresAt: now + (5 * time.Minute).Milliseconds(), // 5 minute lease for cron execution - HeartbeatAt: now, - }) - if err != nil { - continue - } - - w.executeCronJob(ctx, cronJob) - - if err := store.Query.ReleaseLease(ctx, w.engine.GetDB(), store.ReleaseLeaseParams{ - ResourceID: cronJob.ID, - WorkerID: w.config.WorkerID, - }); err != nil { - w.engine.logger.Error("Failed to release cron job lease", - "cron_job_id", cronJob.ID, - "cron_name", cronJob.Name, - "worker_id", w.config.WorkerID, - "error", err.Error(), - ) - } - } -} - -func (w *worker) executeCronJob(ctx context.Context, cronJob 
store.CronJob) { - - now := w.engine.clock.Now().UnixMilli() - - payload := &CronPayload{ - CronJobID: cronJob.ID, - CronName: cronJob.Name, - ScheduledAt: cronJob.NextRunAt, - ActualRunAt: now, - Namespace: cronJob.Namespace, - } - - handler, exists := w.engine.cronHandlers[cronJob.Name] - if !exists { - return - } - - // Execute cron handler with panic recovery - func() { - defer func() { - if r := recover(); r != nil { - w.engine.logger.Error("Cron handler panicked", - "cron_job_id", cronJob.ID, - "cron_name", cronJob.Name, - "panic", r, - ) - } - }() - if err := handler(ctx, *payload); err != nil { - w.engine.logger.Error("Cron handler execution failed", - "cron_job_id", cronJob.ID, - "cron_name", cronJob.Name, - "error", err.Error(), - ) - } - }() - - // Update cron job with lease validation - only if worker holds valid cron lease - nextRun := calculateNextRun(cronJob.CronSpec, w.engine.clock.Now()) - updateTime := w.engine.clock.Now().UnixMilli() - result, err := w.engine.GetDB().ExecContext(ctx, ` - UPDATE cron_jobs - SET last_run_at = ?, next_run_at = ?, updated_at = ? - WHERE id = ? AND namespace = ? - AND EXISTS ( - SELECT 1 FROM leases - WHERE resource_id = ? AND kind = 'cron_job' - AND worker_id = ? AND expires_at > ? 
- )`, - sql.NullInt64{Int64: now, Valid: true}, - nextRun, - updateTime, - cronJob.ID, - w.engine.namespace, - cronJob.ID, // resource_id for lease check - w.config.WorkerID, // worker_id for lease check - updateTime, // expires_at check - ) - if err != nil { - w.engine.logger.Error("Failed to update cron job last run time", - "cron_job_id", cronJob.ID, - "cron_name", cronJob.Name, - "namespace", w.engine.namespace, - "last_run", now, - "next_run", nextRun, - "error", err.Error(), - ) - } else { - // Check if the update actually happened (lease validation) - if rowsAffected, checkErr := result.RowsAffected(); checkErr != nil { - w.engine.logger.Error("Failed to check cron job update result", - "cron_job_id", cronJob.ID, - "error", checkErr.Error(), - ) - } else if rowsAffected == 0 { - w.engine.logger.Warn("Cron job update failed: lease expired or invalid", - "cron_job_id", cronJob.ID, - "worker_id", w.config.WorkerID, - ) - } - } - -} - -// acquireWorkflowLease implements workflow lease acquisition using new Query pattern -func (w *worker) acquireWorkflowLease(ctx context.Context, workflowID, workerID string) error { - now := w.clock.Now().UnixMilli() - expiresAt := now + w.config.ClaimTimeout.Milliseconds() - - // Begin transaction - tx, err := w.engine.GetDB().BeginTx(ctx, nil) - if err != nil { - return err - } - defer func() { - if rollbackErr := tx.Rollback(); rollbackErr != nil && rollbackErr != sql.ErrTxDone { - w.engine.logger.Error("failed to rollback transaction", "error", rollbackErr) - } - }() - - // First, check if workflow is still available for leasing - workflow, err := store.Query.GetWorkflow(ctx, tx, store.GetWorkflowParams{ - ID: workflowID, - Namespace: w.engine.namespace, - }) - if err != nil { - if db.IsNotFound(err) { - return fmt.Errorf("workflow not found") - } - return err - } - - // Check if workflow is in a valid state for execution - if workflow.Status != store.WorkflowExecutionsStatusPending && - workflow.Status != 
store.WorkflowExecutionsStatusFailed && - workflow.Status != store.WorkflowExecutionsStatusSleeping { - return fmt.Errorf("workflow not available for execution, status: %s", workflow.Status) - } - - // Check for retry timing if it's a failed workflow - if workflow.Status == store.WorkflowExecutionsStatusFailed && - workflow.NextRetryAt.Valid && workflow.NextRetryAt.Int64 > now { - return fmt.Errorf("workflow retry not yet due") - } - - // Check for sleep timing if it's a sleeping workflow - if workflow.Status == store.WorkflowExecutionsStatusSleeping && - workflow.SleepUntil.Valid && workflow.SleepUntil.Int64 > now { - return fmt.Errorf("workflow still sleeping") - } - - // Try to create the lease - err = store.Query.CreateLease(ctx, tx, store.CreateLeaseParams{ - ResourceID: workflowID, - Kind: store.LeasesKindWorkflow, - Namespace: w.engine.namespace, - WorkerID: workerID, - AcquiredAt: now, - ExpiresAt: expiresAt, - HeartbeatAt: now, - }) - if err != nil { - // If lease creation failed, try to take over ONLY expired leases - leaseResult, leaseErr := tx.ExecContext(ctx, ` - UPDATE leases - SET worker_id = ?, acquired_at = ?, expires_at = ?, heartbeat_at = ? - WHERE resource_id = ? AND kind = ? 
AND expires_at < ?`, - workerID, now, expiresAt, now, workflowID, store.LeasesKindWorkflow, now) - if leaseErr != nil { - return fmt.Errorf("failed to check for expired lease: %w", leaseErr) - } - - // Check if we actually took over an expired lease - rowsAffected, rowsErr := leaseResult.RowsAffected() - if rowsErr != nil { - return fmt.Errorf("failed to check lease takeover result: %w", rowsErr) - } - if rowsAffected == 0 { - return fmt.Errorf("workflow is already leased by another worker") - } - } - - // Update workflow to running status - err = store.Query.UpdateWorkflowToRunning(ctx, tx, store.UpdateWorkflowToRunningParams{ - StartedAt: sql.NullInt64{Int64: now, Valid: true}, - ID: workflowID, - Namespace: w.engine.namespace, - ResourceID: workflowID, - WorkerID: w.config.WorkerID, - ExpiresAt: now, - }) - if err != nil { - return fmt.Errorf("failed to update workflow status: %w", err) - } - - // Commit the transaction - return tx.Commit() -} - -func (w *worker) Start(ctx context.Context) error { - go w.run(ctx) - return nil -} - -func (w *worker) Shutdown(ctx context.Context) error { - select { - case <-w.shutdownC: - default: - close(w.shutdownC) - } - - select { - case <-w.doneC: - return nil - case <-ctx.Done(): - return ctx.Err() - } -} diff --git a/go/pkg/hydra/worker_heartbeat_test.go b/go/pkg/hydra/worker_heartbeat_test.go deleted file mode 100644 index 49aecd076f..0000000000 --- a/go/pkg/hydra/worker_heartbeat_test.go +++ /dev/null @@ -1,138 +0,0 @@ -package hydra - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/uid" -) - -// TestWorkerHeartbeatFunctionality ensures that workers send heartbeats to maintain their leases -// and prevent workflows from being incorrectly marked as orphaned when workers are healthy. 
-func TestWorkerHeartbeatFunctionality(t *testing.T) { - // Arrange: Create engine with test clock for deterministic timing - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - // Create a workflow that will run for a while to give us time to test heartbeats - workflow := &longRunningWorkflow{ - engine: engine, - name: "heartbeat-test-workflow", - executeTime: 5 * time.Second, // Run longer than heartbeat interval - } - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - // Start workflow - executionID, err := workflow.Start(ctx, struct{}{}) - require.NoError(t, err) - - // Start worker with short heartbeat interval for faster testing - workerID := uid.New(uid.WorkerPrefix) - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: workerID, - Concurrency: 1, - PollInterval: 100 * time.Millisecond, - HeartbeatInterval: 1 * time.Second, // Send heartbeats frequently - ClaimTimeout: 10 * time.Second, // Long enough for multiple heartbeats - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, workflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - // Act: Let worker pick up workflow and start sending heartbeats - testClock.Tick(200 * time.Millisecond) // Trigger initial poll - time.Sleep(50 * time.Millisecond) // Let worker pick up the workflow - - // Keep triggering polls until workflow is picked up - require.Eventually(t, func() bool { - testClock.Tick(200 * time.Millisecond) - time.Sleep(10 * time.Millisecond) - - // Check if workflow has been picked up - currentStatus, getErr := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - if getErr != nil { - return false - } - return currentStatus.Status != store.WorkflowExecutionsStatusPending - }, 3*time.Second, 50*time.Millisecond, "Worker should pick up workflow within timeout") - 
- // Verify workflow is being processed - workflowStatus, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{ - ID: executionID, - Namespace: engine.GetNamespace(), - }) - require.NoError(t, err) - require.Equal(t, store.WorkflowExecutionsStatusRunning, workflowStatus.Status, "Workflow should be running") - - // Get initial lease - lease, err := store.Query.GetLease(ctx, engine.GetDB(), store.GetLeaseParams{ - ResourceID: executionID, - Kind: store.LeasesKindWorkflow, - }) - require.NoError(t, err) - require.Equal(t, workerID, lease.WorkerID, "Lease should be held by our worker") - - initialExpiresAt := lease.ExpiresAt - - // Advance time to trigger first heartbeat - testClock.Tick(1500 * time.Millisecond) // Past first heartbeat interval - time.Sleep(50 * time.Millisecond) // Let heartbeat be processed - - // Verify heartbeat extended the lease - updatedLease, err := store.Query.GetLease(ctx, engine.GetDB(), store.GetLeaseParams{ - ResourceID: executionID, - Kind: store.LeasesKindWorkflow, - }) - require.NoError(t, err) - require.Equal(t, workerID, updatedLease.WorkerID, "Lease should still be held by our worker") - require.Greater(t, updatedLease.ExpiresAt, initialExpiresAt, - "HEARTBEAT FAILURE: Lease expiration should be extended after heartbeat. "+ - "Initial: %d, Updated: %d. This means the worker is not sending heartbeats properly, "+ - "which could cause healthy workers to lose their leases prematurely.", - initialExpiresAt, updatedLease.ExpiresAt) - require.Greater(t, updatedLease.HeartbeatAt, lease.HeartbeatAt, - "HeartbeatAt timestamp should be updated") - - // The key test: verify heartbeat actually extended the lease - extensionAmount := updatedLease.ExpiresAt - initialExpiresAt - require.Greater(t, extensionAmount, int64(0), - "HEARTBEAT SUCCESS: Lease was extended by %d ms. 
Heartbeats are working correctly.", extensionAmount) - -} - -// longRunningWorkflow simulates a workflow that takes time to execute, -// giving us opportunity to test heartbeat behavior during execution -type longRunningWorkflow struct { - engine *Engine - name string - executeTime time.Duration -} - -func (w *longRunningWorkflow) Name() string { - return w.name -} - -func (w *longRunningWorkflow) Run(ctx WorkflowContext, req any) error { - // Simulate long-running work by sleeping - // In a real test, this would be actual work that takes time - time.Sleep(w.executeTime) - return nil -} - -func (w *longRunningWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} diff --git a/go/pkg/hydra/worker_polling_test.go b/go/pkg/hydra/worker_polling_test.go deleted file mode 100644 index 1117822682..0000000000 --- a/go/pkg/hydra/worker_polling_test.go +++ /dev/null @@ -1,314 +0,0 @@ -package hydra - -import ( - "context" - "fmt" - "sync" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" -) - -// TestWorkerPollingEfficiency verifies that workers can handle concurrent load -// without excessive database contention or resource exhaustion -func TestWorkerPollingEfficiency(t *testing.T) { - engine := newTestEngine(t) - - const ( - numWorkers = 10 - numWorkflows = 50 - testDuration = 5 * time.Second - ) - - var completedWorkflows atomic.Int64 - - // Create workflow that tracks completion - pollingWorkflow := &pollingTestWorkflow{ - engine: engine, - name: "polling-test-workflow", - onPoll: func() { - completedWorkflows.Add(1) - }, - } - - ctx, cancel := context.WithTimeout(context.Background(), testDuration) - defer cancel() - - // Start workers - var wg sync.WaitGroup - for i := 0; i < numWorkers; i++ { - wg.Add(1) - go func(workerID int) { - defer wg.Done() - - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: 
fmt.Sprintf("polling-worker-%d", workerID), - Concurrency: 5, // Multiple workflows per worker - PollInterval: 100 * time.Millisecond, - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, pollingWorkflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - <-ctx.Done() - }(i) - } - - // Submit workflows for processing - for i := 0; i < numWorkflows; i++ { - _, err := pollingWorkflow.Start(ctx, fmt.Sprintf("poll-test-%d", i)) - require.NoError(t, err) - } - - // Wait for completion or timeout - require.Eventually(t, func() bool { - return completedWorkflows.Load() >= int64(numWorkflows) - }, testDuration, 100*time.Millisecond, - "Should complete %d workflows within %v", numWorkflows, testDuration) - - wg.Wait() - - // Verify all workflows were processed - finalCompleted := completedWorkflows.Load() - require.GreaterOrEqual(t, finalCompleted, int64(numWorkflows), - "Should have completed at least %d workflows, got %d", numWorkflows, finalCompleted) -} - -// TestWorkerPollingAccuracy tests that workers actually poll at the configured interval -func TestWorkerPollingAccuracy(t *testing.T) { - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - const pollInterval = 200 * time.Millisecond - const tolerance = 50 * time.Millisecond // 25% tolerance - - var pollTimes []time.Time - var mu sync.Mutex - - worker, err := NewWorker(engine, WorkerConfig{ - WorkerID: "accuracy-test-worker", - Concurrency: 1, - PollInterval: pollInterval, - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - require.NoError(t, err) - - pollingWorkflow := &pollingTestWorkflow{ - engine: engine, - name: "accuracy-test-workflow", - onPoll: func() { - mu.Lock() - pollTimes = append(pollTimes, testClock.Now()) - mu.Unlock() - }, - } - - err = RegisterWorkflow(worker, pollingWorkflow) - 
require.NoError(t, err) - - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - // Advance clock to trigger multiple polls - for i := 0; i < 10; i++ { - testClock.Tick(pollInterval) - time.Sleep(10 * time.Millisecond) // Allow processing - } - - // Analyze interval accuracy - mu.Lock() - if len(pollTimes) < 2 { - mu.Unlock() - t.Skip("Not enough poll events to analyze intervals") - return - } - - actualIntervals := make([]time.Duration, len(pollTimes)-1) - for i := 1; i < len(pollTimes); i++ { - actualIntervals[i-1] = pollTimes[i].Sub(pollTimes[i-1]) - } - mu.Unlock() - - // Check each interval is within tolerance - accurateIntervals := 0 - for _, interval := range actualIntervals { - diff := interval - pollInterval - if diff < 0 { - diff = -diff - } - - isAccurate := diff <= tolerance - if isAccurate { - accurateIntervals++ - } - - } - - accuracy := float64(accurateIntervals) / float64(len(actualIntervals)) * 100 - - // Performance assertions - require.GreaterOrEqual(t, accuracy, 80.0, - "At least 80%% of polling intervals should be accurate, got %.1f%%", accuracy) - -} - -// TestThunderingHerdPrevention ensures that when many workers start at the same time, -// they don't all poll the database simultaneously causing performance issues -func TestThunderingHerdPrevention(t *testing.T) { - testClock := clock.NewTestClock() - engine := newTestEngineWithClock(t, testClock) - - const ( - numWorkers = 50 // Large number to stress test - pollInterval = 100 * time.Millisecond - ) - - // Track when each worker polls - pollEvents := make(chan time.Time, 1000) - - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - - // Start all workers simultaneously - var wg sync.WaitGroup - for i := 0; i < numWorkers; i++ { - wg.Add(1) - go func(workerID int) { - defer wg.Done() - - worker, err := NewWorker(engine, 
WorkerConfig{ - WorkerID: fmt.Sprintf("herd-worker-%d", workerID), - Concurrency: 1, - PollInterval: pollInterval, - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - require.NoError(t, err) - - pollingWorkflow := &pollingTestWorkflow{ - engine: engine, - name: "herd-test-workflow", - onPoll: func() { - select { - case pollEvents <- testClock.Now(): - default: - // Channel full, skip - } - }, - } - - err = RegisterWorkflow(worker, pollingWorkflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - <-ctx.Done() - }(i) - } - - // Advance time to trigger polling - go func() { - for { - select { - case <-ctx.Done(): - return - default: - testClock.Tick(pollInterval / 4) - time.Sleep(5 * time.Millisecond) - } - } - }() - - wg.Wait() - close(pollEvents) - - // Analyze thundering herd behavior - pollTimes := make([]time.Time, 0) - for pollTime := range pollEvents { - pollTimes = append(pollTimes, pollTime) - } - - // Check for clustering (thundering herd indicator) - clustering := analyzePollingClustering(pollTimes, pollInterval) - - // Performance assertion - require.Less(t, clustering, 0.5, - "Polling clustering should be low to prevent thundering herd, got %.2f", clustering) - -} - -// pollingTestWorkflow is a minimal workflow that tracks when it's polled for -type pollingTestWorkflow struct { - engine *Engine - name string - onPoll func() -} - -func (w *pollingTestWorkflow) Name() string { - return w.name -} - -func (w *pollingTestWorkflow) Run(ctx WorkflowContext, req any) error { - // This is called when the workflow is actually executed - // We use onPoll to track when workers check for pending work - if w.onPoll != nil { - w.onPoll() - } - - _, err := Step(ctx, "polling-step", func(context.Context) (string, error) { - return "polled", nil - }) - return err -} - -func (w *pollingTestWorkflow) Start(ctx context.Context, payload any) (string, error) { - return 
w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// Helper function to analyze polling clustering (thundering herd detection) -func analyzePollingClustering(pollTimes []time.Time, pollInterval time.Duration) float64 { - if len(pollTimes) < 2 { - return 0 - } - - // Group polls by time windows - windowSize := pollInterval / 10 // 10% of poll interval - timeWindows := make(map[int64]int) - - baseTime := pollTimes[0] - for _, pollTime := range pollTimes { - windowIndex := pollTime.Sub(baseTime).Nanoseconds() / windowSize.Nanoseconds() - timeWindows[windowIndex]++ - } - - // Calculate clustering factor (higher = more clustered) - totalPolls := len(pollTimes) - maxWindowCount := 0 - - for _, count := range timeWindows { - if count > maxWindowCount { - maxWindowCount = count - } - } - - clustering := float64(maxWindowCount) / float64(totalPolls) - - return clustering -} diff --git a/go/pkg/hydra/workflow.go b/go/pkg/hydra/workflow.go deleted file mode 100644 index c00ad43e80..0000000000 --- a/go/pkg/hydra/workflow.go +++ /dev/null @@ -1,315 +0,0 @@ -package hydra - -import ( - "context" - "database/sql" - "fmt" - "time" - - "github.com/unkeyed/unkey/go/pkg/hydra/store" - "github.com/unkeyed/unkey/go/pkg/otel/logging" - "github.com/unkeyed/unkey/go/pkg/otel/tracing" - "go.opentelemetry.io/otel/attribute" -) - -// Workflow defines the interface for typed workflows. -// -// Workflows are the core business logic containers in Hydra. They define -// a series of steps to be executed reliably with exactly-once guarantees. -// -// Workflows must be stateless and deterministic - they can be executed -// multiple times with the same input and produce the same result. State -// is managed by the workflow engine and persisted automatically. -// -// Type parameter TReq defines the input payload type for the workflow. -// Use 'any' for workflows that accept different payload types. 
-// -// Example implementation: -// -// type OrderWorkflow struct{} -// -// func (w *OrderWorkflow) Name() string { -// return "order-processing" -// } -// -// func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error { -// // Execute steps using hydra.Step() -// payment, err := hydra.Step(ctx, "validate-payment", func(stepCtx context.Context) (*Payment, error) { -// return validatePayment(stepCtx, req.PaymentID) -// }) -// if err != nil { -// return err -// } -// -// // Additional steps... -// return nil -// } -type Workflow[TReq any] interface { - // Name returns a unique identifier for this workflow type. - // The name is used to route workflow executions to the correct handler - // and must be consistent across deployments. - Name() string - - // Run executes the workflow logic with the provided context and request. - // This method should be deterministic and idempotent. - // - // The context provides access to workflow execution metadata and - // the Step() function for creating durable execution units. - // - // Returning an error will mark the workflow as failed and trigger - // retry logic if configured. Use hydra.Sleep() to suspend the - // workflow for time-based coordination. - Run(ctx WorkflowContext, req TReq) error -} - -// GenericWorkflow is a type alias for workflows that accept any request type. -// This is useful when registering workflows that handle different payload types -// or when the payload type is not known at compile time. -type GenericWorkflow = Workflow[any] - -// RawPayload represents raw workflow input data that needs to be unmarshalled -type RawPayload struct { - Data []byte -} - -// WorkflowContext provides access to workflow execution context and utilities. 
-// -// The context is passed to workflow Run() methods and provides access to: -// - The underlying Go context for cancellation and timeouts -// - Workflow execution metadata like execution ID and name -// - Step execution utilities through the Step() function -// -// Workflow contexts are created and managed by the workflow engine and -// should not be created manually. -type WorkflowContext interface { - // Context returns the underlying Go context for this workflow execution. - // This context will be cancelled if the workflow is cancelled or times out. - Context() context.Context - - // ExecutionID returns the unique identifier for this workflow execution. - // This ID can be used for logging, tracking, and debugging purposes. - ExecutionID() string - - // WorkflowName returns the name of the workflow being executed. - // This matches the value returned by the workflow's Name() method. - WorkflowName() string -} - -// workflowContext implements WorkflowContext and provides internal workflow utilities -type workflowContext struct { - ctx context.Context - executionID string - workflowName string - namespace string - workerID string - db *sql.DB - marshaller Marshaller - logger logging.Logger - stepTimeout time.Duration - stepMaxAttempts int32 -} - -func (w *workflowContext) Context() context.Context { - return w.ctx -} - -func (w *workflowContext) ExecutionID() string { - return w.executionID -} - -func (w *workflowContext) WorkflowName() string { - return w.workflowName -} - -func (w *workflowContext) markStepCompleted(stepName string, outputData []byte) error { - // Use simple step update - we're already in workflow execution context - return store.Query.UpdateStepStatus(w.ctx, w.db, store.UpdateStepStatusParams{ - Status: store.WorkflowStepsStatusCompleted, - CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - OutputData: outputData, - ErrorMessage: sql.NullString{String: "", Valid: false}, - Namespace: w.namespace, - ExecutionID: 
w.executionID, - StepName: stepName, - }) -} - -func (w *workflowContext) markStepFailed(stepName string, errorMsg string) error { - // Use simple step update - we're already in workflow execution context - return store.Query.UpdateStepStatus(w.ctx, w.db, store.UpdateStepStatusParams{ - Status: store.WorkflowStepsStatusFailed, - CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true}, - OutputData: []byte{}, - ErrorMessage: sql.NullString{String: errorMsg, Valid: errorMsg != ""}, - Namespace: w.namespace, - ExecutionID: w.executionID, - StepName: stepName, - }) -} - -// RegisterWorkflow registers a typed workflow with a worker. -// -// This function associates a workflow implementation with a worker so that -// the worker can execute workflows of this type. The workflow's Name() method -// is used as the unique identifier for routing workflow executions. -// -// The function handles type conversion transparently, allowing strongly-typed -// workflow implementations to be registered with the generic worker interface. 
-// -// Parameters: -// - w: The worker that will execute this workflow type -// - workflow: The workflow implementation to register -// -// Example: -// -// type OrderWorkflow struct{} -// -// func (w *OrderWorkflow) Name() string { return "order-processing" } -// func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error { -// // workflow implementation -// return nil -// } -// -// orderWorkflow := &OrderWorkflow{} -// err := hydra.RegisterWorkflow(worker, orderWorkflow) -// if err != nil { -// return err -// } -// -// Requirements: -// - The workflow name must be unique within the worker -// - The workflow must implement the Workflow[TReq] interface -// - The worker must be started with Start() after registration -// -// Returns an error if: -// - A workflow with the same name is already registered -// - The worker type is invalid -func RegisterWorkflow[TReq any](w Worker, workflow Workflow[TReq]) error { - worker, ok := w.(*worker) - if !ok { - return fmt.Errorf("invalid worker type") - } - - if _, exists := worker.workflows[workflow.Name()]; exists { - return fmt.Errorf("workflow %q is already registered", workflow.Name()) - } - - // Create a wrapper that handles the type conversion - genericWorkflow := &workflowWrapper[TReq]{ - wrapped: workflow, - } - - worker.workflows[workflow.Name()] = genericWorkflow - return nil -} - -// workflowWrapper wraps a typed workflow to implement GenericWorkflow -type workflowWrapper[TReq any] struct { - wrapped Workflow[TReq] -} - -func (w *workflowWrapper[TReq]) Name() string { - return w.wrapped.Name() -} - -func (w *workflowWrapper[TReq]) Run(ctx WorkflowContext, req any) error { - wctx, ok := ctx.(*workflowContext) - if !ok { - return fmt.Errorf("invalid context type, expected *workflowContext") - } - - // Start tracing span for workflow execution - workflowCtx, span := tracing.Start(wctx.ctx, fmt.Sprintf("hydra.workflow.%s", w.wrapped.Name())) - defer span.End() - - span.SetAttributes( - 
attribute.String("hydra.workflow.name", w.wrapped.Name()), - attribute.String("hydra.execution.id", wctx.executionID), - attribute.String("hydra.namespace", wctx.namespace), - attribute.String("hydra.worker.id", wctx.workerID), - ) - - // Update the workflow context to use the traced context - wctx.ctx = workflowCtx - - // Extract the raw payload and unmarshal it to the correct type - rawPayload, ok := req.(*RawPayload) - if !ok { - err := fmt.Errorf("expected RawPayload, got %T", req) - tracing.RecordError(span, err) - return err - } - - var typedReq TReq - if err := wctx.marshaller.Unmarshal(rawPayload.Data, &typedReq); err != nil { - tracing.RecordError(span, err) - return fmt.Errorf("failed to unmarshal workflow request: %w", err) - } - - // Pass the updated workflow context (with traced context) to the workflow implementation - err := w.wrapped.Run(wctx, typedReq) - if err != nil { - tracing.RecordError(span, err) - - span.SetAttributes(attribute.String("hydra.workflow.status", "failed")) - } else { - span.SetAttributes(attribute.String("hydra.workflow.status", "completed")) - } - - return err -} - -// WorkflowOption defines a function that configures workflow execution -type WorkflowOption func(*WorkflowConfig) - -// WorkflowConfig holds the configuration for workflow execution -type WorkflowConfig struct { - MaxAttempts int32 - - TimeoutDuration time.Duration - - RetryBackoff time.Duration - - TriggerType store.WorkflowExecutionsTriggerType - TriggerSource *string -} - -// WithMaxAttempts sets the maximum number of retry attempts for a workflow -func WithMaxAttempts(attempts int32) WorkflowOption { - return func(c *WorkflowConfig) { - c.MaxAttempts = attempts - } -} - -// WithTimeout sets the timeout duration for a workflow -func WithTimeout(timeout time.Duration) WorkflowOption { - return func(c *WorkflowConfig) { - c.TimeoutDuration = timeout - } -} - -// WithRetryBackoff sets the retry backoff duration for a workflow -func WithRetryBackoff(backoff 
time.Duration) WorkflowOption { - return func(c *WorkflowConfig) { - c.RetryBackoff = backoff - } -} - -// WithTrigger sets the trigger type and source for a workflow -func WithTrigger(triggerType store.WorkflowExecutionsTriggerType, triggerSource *string) WorkflowOption { - return func(c *WorkflowConfig) { - c.TriggerType = triggerType - c.TriggerSource = triggerSource - } -} - -// WorkflowSuspendedError represents an error that suspends workflow execution until a specific time -type WorkflowSuspendedError struct { - Reason string - - ResumeTime int64 -} - -func (e *WorkflowSuspendedError) Error() string { - return fmt.Sprintf("workflow suspended for %s until %d", e.Reason, e.ResumeTime) -} diff --git a/go/pkg/hydra/workflow_performance_test.go b/go/pkg/hydra/workflow_performance_test.go deleted file mode 100644 index 148acb614a..0000000000 --- a/go/pkg/hydra/workflow_performance_test.go +++ /dev/null @@ -1,422 +0,0 @@ -package hydra - -import ( - "context" - "fmt" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/go/pkg/clock" -) - -// TestWorkflowPickupLatencyBaseline measures the baseline latency for a single worker -// to pick up and start executing a single workflow. This establishes our performance -// baseline before testing the 5-second SLA requirement. 
-func TestWorkflowPickupLatencyBaseline(t *testing.T) { - // Arrange: Create engine with real clock for accurate timing - realClock := clock.New() - engine := newTestEngineWithClock(t, realClock) - - var workflowStartTime atomic.Int64 - - // Create a workflow that records when it actually starts executing - workflow := &latencyTestWorkflow{ - engine: engine, - name: "baseline-latency-workflow", - onStart: func() { - workflowStartTime.Store(time.Now().UnixMilli()) - }, - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Start worker with production-like configuration - worker, err := NewWorker(engine, WorkerConfig{ - Concurrency: 1, - PollInterval: 100 * time.Millisecond, // Realistic poll interval - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - require.NoError(t, err) - - err = RegisterWorkflow(worker, workflow) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - // Act: Record submission time and start workflow - _, err = workflow.Start(ctx, struct{}{}) - require.NoError(t, err) - - // Wait for workflow to start executing - require.Eventually(t, func() bool { - return workflowStartTime.Load() != 0 - }, 5*time.Second, 10*time.Millisecond, "Workflow should start executing within 5 seconds") - - // Calculate pickup latency - latency := time.Since(time.UnixMilli(workflowStartTime.Load())) - - require.Less(t, latency, 5*time.Second, "Pickup latency should be less than 5 seconds for baseline test") -} - -// latencyTestWorkflow is a minimal workflow for testing pickup latency -type latencyTestWorkflow struct { - engine *Engine - name string - onStart func() -} - -func (w *latencyTestWorkflow) Name() string { - return w.name -} - -func (w *latencyTestWorkflow) Run(ctx WorkflowContext, req any) error { - // Record when workflow actually starts executing - if w.onStart != nil { - w.onStart() - } - - // Minimal work to complete 
quickly - _, err := Step(ctx, "timing-step", func(context.Context) (string, error) { - return "completed", nil - }) - return err -} - -func (w *latencyTestWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// TestWorkflowPickupLatencyConcurrent verifies that ALL workflows are picked up within the 5-second SLA -// under concurrent load. This tests the critical requirement that every workflow must be processed -// within the SLA window, not just the average. -func TestWorkflowPickupLatencyConcurrent(t *testing.T) { - // Arrange: Create engine with real clock for accurate timing - realClock := clock.New() - engine := newTestEngineWithClock(t, realClock) - - const numWorkers = 5 // Multiple workers to test concurrent performance - const numWorkflows = 50 // Realistic batch to stress test SLA compliance - - var completedCount atomic.Int64 - var maxLatency atomic.Int64 - var slaViolations atomic.Int64 - - // Create workflow factory that records completion timing - createWorkflow := func(id int) *concurrentLatencyWorkflow { - return &concurrentLatencyWorkflow{ - engine: engine, - name: "concurrent-latency-workflow", - id: id, - onComplete: func(latencyMs int64) { - // Track maximum latency across all workflows - for { - current := maxLatency.Load() - if latencyMs <= current || maxLatency.CompareAndSwap(current, latencyMs) { - break - } - } - - // Count SLA violations (workflows taking >5s) - if latencyMs > 5000 { - slaViolations.Add(1) - t.Errorf("SLA VIOLATION: Workflow %d took %dms (>5000ms) to be picked up", id, latencyMs) - } - - completedCount.Add(1) - }, - } - } - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - // Start multiple workers sharing the same database - workers := make([]Worker, numWorkers) - for i := 0; i < numWorkers; i++ { - worker, err := NewWorker(engine, WorkerConfig{ - Concurrency: 10, // Reasonable concurrency per worker - 
PollInterval: 50 * time.Millisecond, // Fast polling for concurrent load - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - require.NoError(t, err) - - // Register the workflow type with each worker - err = RegisterWorkflow(worker, createWorkflow(0)) - require.NoError(t, err) - - err = worker.Start(ctx) - require.NoError(t, err) - defer worker.Shutdown(ctx) - - workers[i] = worker - } - - // Act: Submit all workflows as quickly as possible - submissionStart := time.Now() - executionIDs := make([]string, numWorkflows) - - for i := 0; i < numWorkflows; i++ { - workflow := createWorkflow(i) - executionID, err := workflow.Start(ctx, submissionStart.UnixMilli()) - require.NoError(t, err) - executionIDs[i] = executionID - } - - _ = time.Since(submissionStart) // Submission timing not needed for SLA test - - // Wait for all workflows to complete - require.Eventually(t, func() bool { - return completedCount.Load() == numWorkflows - }, 15*time.Second, 100*time.Millisecond, - "All %d workflows should complete within timeout", numWorkflows) - - // Assert SLA compliance: ALL workflows must be picked up within 5 seconds - finalSlaViolations := slaViolations.Load() - finalMaxLatency := maxLatency.Load() - - require.Equal(t, int64(0), finalSlaViolations, - "SLA VIOLATION: %d out of %d workflows took longer than 5 seconds to be picked up", - finalSlaViolations, numWorkflows) - - require.Less(t, finalMaxLatency, int64(5000), - "SLA VIOLATION: Maximum pickup latency was %dms, must be <5000ms for ALL workflows", - finalMaxLatency) - -} - -// concurrentLatencyWorkflow tracks individual workflow latency in concurrent scenarios -type concurrentLatencyWorkflow struct { - engine *Engine - name string - id int - onComplete func(latencyMs int64) -} - -func (w *concurrentLatencyWorkflow) Name() string { - return w.name -} - -func (w *concurrentLatencyWorkflow) Run(ctx WorkflowContext, req any) error { - var submissionTime int64 - switch v := req.(type) { - case 
int64: - submissionTime = v - case float64: - submissionTime = int64(v) // JSON unmarshaling converts numbers to float64 - default: - return fmt.Errorf("expected int64 or float64 submission time, got %T", req) - } - - // Calculate latency from submission to execution start - latency := time.Now().UnixMilli() - submissionTime - - // Report completion with latency - if w.onComplete != nil { - w.onComplete(latency) - } - - // Minimal work to complete quickly - _, err := Step(ctx, "latency-step", func(context.Context) (string, error) { - return "completed", nil - }) - - return err -} - -func (w *concurrentLatencyWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// BenchmarkWorkflowSubmission measures the rate at which workflows can be submitted -func BenchmarkWorkflowSubmission(b *testing.B) { - engine := newTestEngineBench(b) - - workflow := &benchmarkWorkflow{ - engine: engine, - name: "benchmark-workflow", - } - - ctx := context.Background() - - b.ResetTimer() - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - _, err := workflow.Start(ctx, struct{}{}) - if err != nil { - b.Fatal(err) - } - } - }) -} - -// BenchmarkWorkflowThroughput measures end-to-end workflow processing throughput -func BenchmarkWorkflowThroughput(b *testing.B) { - engine := newTestEngineBench(b) - - workflow := &benchmarkWorkflow{ - engine: engine, - name: "throughput-workflow", - } - - // Start a single worker - worker, err := NewWorker(engine, WorkerConfig{ - Concurrency: 10, // Process multiple workflows concurrently - PollInterval: 10 * time.Millisecond, // Fast polling for benchmarks - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - if err != nil { - b.Fatal(err) - } - - err = RegisterWorkflow(worker, workflow) - if err != nil { - b.Fatal(err) - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - err = worker.Start(ctx) - if err != nil { - 
b.Fatal(err) - } - defer worker.Shutdown(ctx) - - // Give worker time to start - time.Sleep(50 * time.Millisecond) - - b.ResetTimer() - - // Track completion - var completed atomic.Int64 - workflow.onComplete = func() { - completed.Add(1) - } - - // Submit N workflows as fast as possible - submissionStart := time.Now() - for i := 0; i < b.N; i++ { - _, err := workflow.Start(ctx, struct{}{}) - if err != nil { - b.Fatal(err) - } - } - submissionDuration := time.Since(submissionStart) - - // Wait for all workflows to complete - for completed.Load() < int64(b.N) { - time.Sleep(1 * time.Millisecond) - } - - b.ReportMetric(float64(b.N)/submissionDuration.Seconds(), "submissions/sec") - b.ReportMetric(float64(b.N)/b.Elapsed().Seconds(), "completions/sec") -} - -// BenchmarkSingleWorkerLatency measures latency with a single worker processing one workflow at a time -func BenchmarkSingleWorkerLatency(b *testing.B) { - engine := newTestEngineBench(b) - - workflow := &benchmarkWorkflow{ - engine: engine, - name: "latency-workflow", - } - - worker, err := NewWorker(engine, WorkerConfig{ - Concurrency: 1, // Single workflow at a time - PollInterval: 1 * time.Millisecond, // Very fast polling - HeartbeatInterval: 5 * time.Second, - ClaimTimeout: 30 * time.Second, - }) - if err != nil { - b.Fatal(err) - } - - err = RegisterWorkflow(worker, workflow) - if err != nil { - b.Fatal(err) - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - err = worker.Start(ctx) - if err != nil { - b.Fatal(err) - } - defer worker.Shutdown(ctx) - - time.Sleep(50 * time.Millisecond) - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - completed := make(chan struct{}) - workflow.onComplete = func() { - close(completed) - } - - start := time.Now() - _, err := workflow.Start(ctx, struct{}{}) - if err != nil { - b.Fatal(err) - } - - <-completed - latency := time.Since(start) - - // Report per-operation timing - if i == 0 { - b.ReportMetric(float64(latency.Nanoseconds()), 
"ns/workflow") - } - } -} - -// benchmarkWorkflow is a minimal workflow for benchmarking -type benchmarkWorkflow struct { - engine *Engine - name string - onComplete func() -} - -func (w *benchmarkWorkflow) Name() string { - return w.name -} - -func (w *benchmarkWorkflow) Run(ctx WorkflowContext, req any) error { - // Minimal work - just complete a simple step - _, err := Step(ctx, "benchmark-step", func(context.Context) (string, error) { - return "done", nil - }) - - if w.onComplete != nil { - w.onComplete() - } - - return err -} - -func (w *benchmarkWorkflow) Start(ctx context.Context, payload any) (string, error) { - return w.engine.StartWorkflow(ctx, w.Name(), payload) -} - -// Helper for benchmarks that need testing.TB interface -func newTestEngineBench(tb testing.TB) *Engine { - // Use MySQL container for benchmarks - t, ok := tb.(*testing.T) - if !ok { - // For benchmarks, create a new testing.T - t = &testing.T{} - t.Helper() - } - - // Use the unified test helper - return newTestEngineWithClock(t, clock.New()) -} From 32e9645b7dff7e9da84191208964dbb26e9d6bbb Mon Sep 17 00:00:00 2001 From: chronark Date: Fri, 3 Oct 2025 19:15:33 +0200 Subject: [PATCH 3/3] fix: idk why but this fixes the component it failed to build otherwise --- .../content/docs/cli/run/ctrl/index.mdx | 75 ++++++++++++------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/apps/engineering/content/docs/cli/run/ctrl/index.mdx b/apps/engineering/content/docs/cli/run/ctrl/index.mdx index 304e132607..c47c74feca 100644 --- a/apps/engineering/content/docs/cli/run/ctrl/index.mdx +++ b/apps/engineering/content/docs/cli/run/ctrl/index.mdx @@ -21,7 +21,8 @@ HTTP port for the control plane server to listen on. Default: 8080 - **Type:** integer - **Default:** `8080` - **Environment:** `UNKEY_HTTP_PORT` - + + Enable colored log output. Default: true @@ -29,21 +30,24 @@ Enable colored log output. 
Default: true - **Type:** boolean - **Default:** `true` - **Environment:** `UNKEY_LOGS_COLOR` - + + Cloud platform identifier for this node. Used for logging and metrics. - **Type:** string - **Environment:** `UNKEY_PLATFORM` - + + Container image identifier. Used for logging and metrics. - **Type:** string - **Environment:** `UNKEY_IMAGE` - + + Geographic region identifier. Used for logging and routing. Default: unknown @@ -51,7 +55,8 @@ Geographic region identifier. Used for logging and routing. Default: unknown - **Type:** string - **Default:** `"unknown"` - **Environment:** `AWS_REGION` - + + Unique identifier for this instance. Auto-generated if not provided. @@ -59,21 +64,24 @@ Unique identifier for this instance. Auto-generated if not provided. - **Type:** string - **Default:** `"ins_5PkxT8"` - **Environment:** `UNKEY_INSTANCE_ID` - + + MySQL connection string for primary database. Required for all deployments. Example: user:pass@host:3306/unkey?parseTime=true - **Type:** string - **Environment:** `UNKEY_DATABASE_PRIMARY` - + + MySQL connection string for partition database. Required for all deployments. Example: user:pass@host:3306/partition_002?parseTime=true - **Type:** string - **Environment:** `UNKEY_DATABASE_PARTITION` - + + Enable OpenTelemetry tracing and metrics @@ -81,7 +89,8 @@ Enable OpenTelemetry tracing and metrics - **Type:** boolean - **Default:** `false` - **Environment:** `UNKEY_OTEL` - + + Sampling rate for OpenTelemetry traces (0.0-1.0). Only used when --otel is provided. Default: 0.25 @@ -89,42 +98,48 @@ Sampling rate for OpenTelemetry traces (0.0-1.0). Only used when --otel is provi - **Type:** float - **Default:** `0.25` - **Environment:** `UNKEY_OTEL_TRACE_SAMPLING_RATE` - + + Path to TLS certificate file for HTTPS. Both cert and key must be provided to enable HTTPS. - **Type:** string - **Environment:** `UNKEY_TLS_CERT_FILE` - + + Path to TLS key file for HTTPS. Both cert and key must be provided to enable HTTPS. 
- **Type:** string - **Environment:** `UNKEY_TLS_KEY_FILE` - + + Authentication token for control plane API access. Required for secure deployments. - **Type:** string - **Environment:** `UNKEY_AUTH_TOKEN` - + + Full URL of the krane service for VM operations. Required for deployments. Example: https://krane.example.com:8080 - **Type:** string - **Environment:** `UNKEY_KRANE_ADDRESS` - + + API key for simple authentication (demo purposes only). Will be replaced with JWT authentication. - **Type:** string - **Environment:** `UNKEY_API_KEY` - + + Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/agent/agent.sock @@ -132,42 +147,48 @@ Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/age - **Type:** string - **Default:** `"/var/lib/spire/agent/agent.sock"` - **Environment:** `UNKEY_SPIFFE_SOCKET_PATH` - + + Vault master keys for encryption - **Type:** string[] - **Environment:** `UNKEY_VAULT_MASTER_KEYS` - + + S3 Compatible Endpoint URL - **Type:** string - **Environment:** `UNKEY_VAULT_S3_URL` - + + S3 bucket name - **Type:** string - **Environment:** `UNKEY_VAULT_S3_BUCKET` - + + S3 access key ID - **Type:** string - **Environment:** `UNKEY_VAULT_S3_ACCESS_KEY_ID` - + + S3 secret access key - **Type:** string - **Environment:** `UNKEY_VAULT_S3_ACCESS_KEY_SECRET` - + + Enable Let's Encrypt for acme challenges @@ -175,7 +196,8 @@ Enable Let's Encrypt for acme challenges - **Type:** boolean - **Default:** `false` - **Environment:** `UNKEY_ACME_ENABLED` - + + Enable Cloudflare for wildcard certificates @@ -183,14 +205,16 @@ Enable Cloudflare for wildcard certificates - **Type:** boolean - **Default:** `false` - **Environment:** `UNKEY_ACME_CLOUDFLARE_ENABLED` - + + Cloudflare API token for Let's Encrypt - **Type:** string - **Environment:** `UNKEY_ACME_CLOUDFLARE_API_TOKEN` - + + Default domain for auto-generated hostnames @@ -198,4 +222,5 @@ Default domain for auto-generated hostnames - **Type:** string - 
**Default:** `"unkey.app"` - **Environment:** `UNKEY_DEFAULT_DOMAIN` - + +