diff --git a/apps/engineering/content/docs/cli/run/ctrl/index.mdx b/apps/engineering/content/docs/cli/run/ctrl/index.mdx
index 874e9dab4e..c47c74feca 100644
--- a/apps/engineering/content/docs/cli/run/ctrl/index.mdx
+++ b/apps/engineering/content/docs/cli/run/ctrl/index.mdx
@@ -10,7 +10,7 @@ unkey run ctrl [flags]
```
-Some flags are required for this command to work properly.
+ Some flags are required for this command to work properly.
## Flags
@@ -21,6 +21,7 @@ HTTP port for the control plane server to listen on. Default: 8080
- **Type:** integer
- **Default:** `8080`
- **Environment:** `UNKEY_HTTP_PORT`
+
@@ -29,6 +30,7 @@ Enable colored log output. Default: true
- **Type:** boolean
- **Default:** `true`
- **Environment:** `UNKEY_LOGS_COLOR`
+
@@ -36,6 +38,7 @@ Cloud platform identifier for this node. Used for logging and metrics.
- **Type:** string
- **Environment:** `UNKEY_PLATFORM`
+
@@ -43,6 +46,7 @@ Container image identifier. Used for logging and metrics.
- **Type:** string
- **Environment:** `UNKEY_IMAGE`
+
@@ -51,6 +55,7 @@ Geographic region identifier. Used for logging and routing. Default: unknown
- **Type:** string
- **Default:** `"unknown"`
- **Environment:** `AWS_REGION`
+
@@ -59,6 +64,7 @@ Unique identifier for this instance. Auto-generated if not provided.
- **Type:** string
- **Default:** `"ins_5PkxT8"`
- **Environment:** `UNKEY_INSTANCE_ID`
+
@@ -66,6 +72,7 @@ MySQL connection string for primary database. Required for all deployments. Exam
- **Type:** string
- **Environment:** `UNKEY_DATABASE_PRIMARY`
+
@@ -73,13 +80,7 @@ MySQL connection string for partition database. Required for all deployments. Ex
- **Type:** string
- **Environment:** `UNKEY_DATABASE_PARTITION`
-
-
-
-MySQL connection string for hydra database. Required for all deployments. Example: user:pass@host:3306/hydra?parseTime=true
-- **Type:** string
-- **Environment:** `UNKEY_DATABASE_HYDRA`
@@ -88,6 +89,7 @@ Enable OpenTelemetry tracing and metrics
- **Type:** boolean
- **Default:** `false`
- **Environment:** `UNKEY_OTEL`
+
@@ -96,6 +98,7 @@ Sampling rate for OpenTelemetry traces (0.0-1.0). Only used when --otel is provi
- **Type:** float
- **Default:** `0.25`
- **Environment:** `UNKEY_OTEL_TRACE_SAMPLING_RATE`
+
@@ -103,6 +106,7 @@ Path to TLS certificate file for HTTPS. Both cert and key must be provided to en
- **Type:** string
- **Environment:** `UNKEY_TLS_CERT_FILE`
+
@@ -110,6 +114,7 @@ Path to TLS key file for HTTPS. Both cert and key must be provided to enable HTT
- **Type:** string
- **Environment:** `UNKEY_TLS_KEY_FILE`
+
@@ -117,6 +122,7 @@ Authentication token for control plane API access. Required for secure deploymen
- **Type:** string
- **Environment:** `UNKEY_AUTH_TOKEN`
+
@@ -124,6 +130,7 @@ Full URL of the krane service for VM operations. Required for deployments. Examp
- **Type:** string
- **Environment:** `UNKEY_KRANE_ADDRESS`
+
@@ -131,6 +138,7 @@ API key for simple authentication (demo purposes only). Will be replaced with JW
- **Type:** string
- **Environment:** `UNKEY_API_KEY`
+
@@ -139,6 +147,7 @@ Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/age
- **Type:** string
- **Default:** `"/var/lib/spire/agent/agent.sock"`
- **Environment:** `UNKEY_SPIFFE_SOCKET_PATH`
+
@@ -146,6 +155,7 @@ Vault master keys for encryption
- **Type:** string[]
- **Environment:** `UNKEY_VAULT_MASTER_KEYS`
+
@@ -153,6 +163,7 @@ S3 Compatible Endpoint URL
- **Type:** string
- **Environment:** `UNKEY_VAULT_S3_URL`
+
@@ -160,6 +171,7 @@ S3 bucket name
- **Type:** string
- **Environment:** `UNKEY_VAULT_S3_BUCKET`
+
@@ -167,6 +179,7 @@ S3 access key ID
- **Type:** string
- **Environment:** `UNKEY_VAULT_S3_ACCESS_KEY_ID`
+
@@ -174,6 +187,7 @@ S3 secret access key
- **Type:** string
- **Environment:** `UNKEY_VAULT_S3_ACCESS_KEY_SECRET`
+
@@ -182,6 +196,7 @@ Enable Let's Encrypt for acme challenges
- **Type:** boolean
- **Default:** `false`
- **Environment:** `UNKEY_ACME_ENABLED`
+
@@ -190,6 +205,7 @@ Enable Cloudflare for wildcard certificates
- **Type:** boolean
- **Default:** `false`
- **Environment:** `UNKEY_ACME_CLOUDFLARE_ENABLED`
+
@@ -197,6 +213,7 @@ Cloudflare API token for Let's Encrypt
- **Type:** string
- **Environment:** `UNKEY_ACME_CLOUDFLARE_API_TOKEN`
+
@@ -205,4 +222,5 @@ Default domain for auto-generated hostnames
- **Type:** string
- **Default:** `"unkey.app"`
- **Environment:** `UNKEY_DEFAULT_DOMAIN`
+
diff --git a/apps/engineering/content/docs/infrastructure/database-schema.mdx b/apps/engineering/content/docs/infrastructure/database-schema.mdx
index 3862bfa704..963545960d 100644
--- a/apps/engineering/content/docs/infrastructure/database-schema.mdx
+++ b/apps/engineering/content/docs/infrastructure/database-schema.mdx
@@ -8,7 +8,6 @@ description: "How database schemas are managed and applied in the Unkey platform
Unkey uses multiple MySQL databases that are automatically created and initialized during development:
- **unkey**: Main application database containing APIs, keys, workspaces, and related data
-- **hydra**: Workflow orchestration engine database for managing deployment workflows
- **partition_00X**: Dataplane partition database
## Schema Files
@@ -17,6 +16,7 @@ Schema definitions are maintained in separate files:
- `go/pkg/db/schema.sql` - Main Unkey application schema
- `go/pkg/partition/schema.sql` - Dataplane schema
+
## Docker Development Setup
During local development, schemas are automatically applied via Docker:
diff --git a/deployment/Dockerfile.mysql b/deployment/Dockerfile.mysql
index 53e5494e3b..5aa2e0ee2d 100644
--- a/deployment/Dockerfile.mysql
+++ b/deployment/Dockerfile.mysql
@@ -6,7 +6,6 @@ COPY deployment/init-databases.sql /docker-entrypoint-initdb.d/00-init-databases
# Copy schemas from their respective packages
COPY go/pkg/db/schema.sql /docker-entrypoint-initdb.d/01-main-schema.sql
COPY go/pkg/partition/db/schema.sql /docker-entrypoint-initdb.d/02-partition-schema.sql
-COPY go/pkg/hydra/store/schema.sql /docker-entrypoint-initdb.d/03-hydra-schema.sql
# Copy seed data for local development
COPY deployment/04-seed-workspace.sql /docker-entrypoint-initdb.d/04-seed-workspace.sql
diff --git a/deployment/docker-compose.yaml b/deployment/docker-compose.yaml
index e509e093dd..fcc5da5c9d 100644
--- a/deployment/docker-compose.yaml
+++ b/deployment/docker-compose.yaml
@@ -336,7 +336,6 @@ services:
- /var/run/docker.sock:/var/run/docker.sock
environment:
UNKEY_DATABASE_PRIMARY: "unkey:password@tcp(mysql:3306)/unkey?parseTime=true&interpolateParams=true"
- UNKEY_DATABASE_HYDRA: "unkey:password@tcp(mysql:3306)/hydra?parseTime=true&interpolateParams=true"
UNKEY_DATABASE_PARTITION: "unkey:password@tcp(mysql:3306)/partition_001?parseTime=true&interpolateParams=true"
# Control plane configuration
diff --git a/deployment/init-databases.sql b/deployment/init-databases.sql
index fee86ce13e..a547aff02b 100644
--- a/deployment/init-databases.sql
+++ b/deployment/init-databases.sql
@@ -1,6 +1,5 @@
-- Initialize multiple databases for the Unkey deployment platform
CREATE DATABASE IF NOT EXISTS unkey;
-CREATE DATABASE IF NOT EXISTS hydra;
CREATE DATABASE IF NOT EXISTS partition_001;
-- Create the unkey user
@@ -8,6 +7,5 @@ CREATE USER IF NOT EXISTS 'unkey'@'%' IDENTIFIED BY 'password';
-- Grant permissions to unkey user for all databases
GRANT ALL PRIVILEGES ON unkey.* TO 'unkey'@'%';
-GRANT ALL PRIVILEGES ON hydra.* TO 'unkey'@'%';
GRANT ALL PRIVILEGES ON partition_001.* TO 'unkey'@'%';
FLUSH PRIVILEGES;
diff --git a/go/pkg/hydra/README.md b/go/pkg/hydra/README.md
deleted file mode 100644
index 5c38b8f594..0000000000
--- a/go/pkg/hydra/README.md
+++ /dev/null
@@ -1,519 +0,0 @@
-# Hydra π
-
-> **Distributed workflow orchestration engine for Go**
-
-Hydra is a robust, scalable workflow orchestration engine designed for reliable execution of multi-step business processes. Built with exactly-once execution guarantees, automatic retries, and comprehensive observability.
-
-## Features
-
-π **Exactly-Once Execution** - Workflows and steps execute exactly once, even with failures
-⚡ **Durable State** - All state persisted to database, survives crashes and restarts
-π **Automatic Retries** - Configurable retry policies with exponential backoff
-π **Rich Observability** - Built-in Prometheus metrics and structured logging
-⏰ **Flexible Scheduling** - Immediate execution, cron schedules, and sleep states
-ποΈ **Distributed Coordination** - Multiple workers with lease-based coordination
-🎯 **Type Safety** - Strongly-typed workflows with compile-time guarantees
-🔧 **Checkpointing** - Automatic step result caching for fault tolerance
-
-## Quick Start
-
-### Installation
-
-```bash
-go get github.com/unkeyed/unkey/go/pkg/hydra
-```
-
-### Basic Example
-
-```go
-package main
-
-import (
- "context"
- "fmt"
- "time"
-
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra"
- "github.com/unkeyed/unkey/go/pkg/hydra/store/gorm"
- "gorm.io/driver/mysql"
- gormDriver "gorm.io/gorm"
-)
-
-// Define your workflow
-type OrderWorkflow struct{}
-
-func (w *OrderWorkflow) Name() string {
- return "order-processing"
-}
-
-func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error {
- // Step 1: Validate payment
- payment, err := hydra.Step(ctx, "validate-payment", func(stepCtx context.Context) (*Payment, error) {
- return validatePayment(stepCtx, req.PaymentID)
- })
- if err != nil {
- return err
- }
-
- // Step 2: Reserve inventory
- _, err = hydra.Step(ctx, "reserve-inventory", func(stepCtx context.Context) (*Reservation, error) {
- return reserveInventory(stepCtx, req.Items)
- })
- if err != nil {
- return err
- }
-
- // Step 3: Process order
- _, err = hydra.Step(ctx, "process-order", func(stepCtx context.Context) (*Order, error) {
- return processOrder(stepCtx, payment, req.Items)
- })
-
- return err
-}
-
-func main() {
- // Set up database
- db, err := gormDriver.Open(mysql.Open("dsn"), &gormDriver.Config{})
- if err != nil {
- panic(err)
- }
-
- // Create store
- store := hydra.NewGORMStore(db, clock.New())
-
- // Create engine
- engine := hydra.New(hydra.Config{
- Store: store,
- Namespace: "production",
- })
-
- // Create worker
- worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{
- WorkerID: "worker-1",
- Concurrency: 10,
- })
- if err != nil {
- panic(err)
- }
-
- // Register workflow
- err = hydra.RegisterWorkflow(worker, &OrderWorkflow{})
- if err != nil {
- panic(err)
- }
-
- // Start worker
- ctx := context.Background()
- err = worker.Start(ctx)
- if err != nil {
- panic(err)
- }
- defer worker.Shutdown(ctx)
-
- // Submit workflow
- executionID, err := engine.StartWorkflow(ctx, "order-processing", &OrderRequest{
- CustomerID: "cust_123",
- Items: []Item{{SKU: "item_456", Quantity: 2}},
- PaymentID: "pay_789",
- })
- if err != nil {
- panic(err)
- }
-
- fmt.Printf("Started workflow: %s\n", executionID)
-}
-```
-
-## Core Concepts
-
-### Engine
-The central orchestration component that manages workflow lifecycle and coordinates execution across workers.
-
-```go
-engine := hydra.New(hydra.Config{
- Store: store,
- Namespace: "production",
- Logger: logger,
-})
-```
-
-### Workers
-Distributed processing units that poll for workflows, acquire leases, and execute workflow logic.
-
-```go
-worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{
- WorkerID: "worker-1",
- Concurrency: 20,
- PollInterval: 100 * time.Millisecond,
- HeartbeatInterval: 30 * time.Second,
- ClaimTimeout: 5 * time.Minute,
-})
-```
-
-### Workflows
-Business logic containers that define a series of steps with exactly-once execution guarantees.
-
-```go
-type MyWorkflow struct{}
-
-func (w *MyWorkflow) Name() string { return "my-workflow" }
-
-func (w *MyWorkflow) Run(ctx hydra.WorkflowContext, req *MyRequest) error {
- // Implement your business logic using hydra.Step()
- return nil
-}
-```
-
-### Steps
-Individual units of work with automatic checkpointing and retry logic.
-
-```go
-result, err := hydra.Step(ctx, "api-call", func(stepCtx context.Context) (*APIResponse, error) {
- return apiClient.Call(stepCtx, request)
-})
-```
-
-## Advanced Features
-
-### Sleep States
-Suspend workflows for time-based coordination:
-
-```go
-// Sleep for 24 hours for manual approval
-err = hydra.Sleep(ctx, 24*time.Hour)
-if err != nil {
- return err
-}
-
-// Continue after sleep
-result, err := hydra.Step(ctx, "post-approval", func(stepCtx context.Context) (string, error) {
- return processApprovedRequest(stepCtx)
-})
-```
-
-### Cron Scheduling
-Schedule workflows to run automatically:
-
-```go
-err = engine.RegisterCron("0 0 * * *", "daily-report", func(ctx context.Context) error {
- // Generate daily report
- return generateDailyReport(ctx)
-})
-```
-
-### Error Handling & Retries
-Configure retry behavior per workflow:
-
-```go
-executionID, err := engine.StartWorkflow(ctx, "order-processing", request,
- hydra.WithMaxAttempts(5),
- hydra.WithRetryBackoff(2*time.Second),
- hydra.WithTimeout(10*time.Minute),
-)
-```
-
-### Custom Marshallers
-Use custom serialization formats:
-
-```go
-type ProtobufMarshaller struct{}
-
-func (p *ProtobufMarshaller) Marshal(v any) ([]byte, error) {
- // Implement protobuf marshalling
-}
-
-func (p *ProtobufMarshaller) Unmarshal(data []byte, v any) error {
- // Implement protobuf unmarshalling
-}
-
-engine := hydra.New(hydra.Config{
- Store: store,
- Marshaller: &ProtobufMarshaller{},
-})
-```
-
-## Observability
-
-### Prometheus Metrics
-
-Hydra provides comprehensive metrics out of the box:
-
-**Workflow Metrics:**
-- `hydra_workflows_started_total` - Total workflows started
-- `hydra_workflows_completed_total` - Total workflows completed/failed
-- `hydra_workflow_duration_seconds` - Workflow execution time
-- `hydra_workflow_queue_time_seconds` - Time spent waiting for execution
-- `hydra_workflows_active` - Currently running workflows per worker
-
-**Step Metrics:**
-- `hydra_steps_executed_total` - Total steps executed with status
-- `hydra_step_duration_seconds` - Individual step execution time
-- `hydra_steps_cached_total` - Steps served from checkpoint cache
-- `hydra_steps_retried_total` - Step retry attempts
-
-**Worker Metrics:**
-- `hydra_worker_polls_total` - Worker polling operations
-- `hydra_worker_heartbeats_total` - Worker heartbeat operations
-- `hydra_lease_acquisitions_total` - Workflow lease acquisitions
-- `hydra_worker_concurrency_current` - Current workflow concurrency per worker
-
-### Example Grafana Queries
-
-```promql
-# Workflow throughput
-rate(hydra_workflows_completed_total[5m])
-
-# Average workflow duration
-rate(hydra_workflow_duration_seconds_sum[5m]) / rate(hydra_workflow_duration_seconds_count[5m])
-
-# Step cache hit rate
-rate(hydra_steps_cached_total[5m]) / rate(hydra_steps_executed_total[5m])
-
-# Worker utilization
-hydra_workflows_active / hydra_worker_concurrency_current
-```
-
-## Architecture
-
-Hydra uses a lease-based coordination model for distributed execution:
-
-```
-┌─────────────┐    ┌─────────────┐    ┌─────────────┐
-│  Worker 1   │    │  Worker 2   │    │  Worker N   │
-│             │    │             │    │             │
-│ ┌─────────┐ │    │ ┌─────────┐ │    │ ┌─────────┐ │
-│ │ Poll    │ │    │ │ Poll    │ │    │ │ Poll    │ │
-│ │ Execute │ │    │ │ Execute │ │    │ │ Execute │ │
-│ │ Heartbeat│ │   │ │ Heartbeat│ │   │ │ Heartbeat│ │
-│ └─────────┘ │    │ └─────────┘ │    │ └─────────┘ │
-└─────────────┘    └─────────────┘    └─────────────┘
-       │                  │                  │
-       └──────────────────┼──────────────────┘
-                          │
-                 ┌─────────────────┐
-                 │    Database     │
-                 │                 │
-                 │ • Workflows     │
-                 │ • Steps         │
-                 │ • Leases        │
-                 │ • Cron Jobs     │
-                 └─────────────────┘
-```
-
-1. **Workers poll** the database for pending workflows
-2. **Workers acquire leases** on available workflows for exclusive execution
-3. **Workers execute** workflow logic with step-by-step checkpointing
-4. **Workers send heartbeats** to maintain lease ownership
-5. **Completed workflows** update status and release leases
-
-## Database Schema
-
-Hydra requires the following tables (auto-migrated with GORM):
-
-```sql
--- Workflow executions
-CREATE TABLE workflow_executions (
- id VARCHAR(255) PRIMARY KEY,
- workflow_name VARCHAR(255) NOT NULL,
- status VARCHAR(50) NOT NULL,
- namespace VARCHAR(255) NOT NULL,
- input_data LONGBLOB,
- output_data LONGBLOB,
- error_message TEXT,
- max_attempts INT NOT NULL,
- remaining_attempts INT NOT NULL,
- created_at BIGINT NOT NULL,
- started_at BIGINT,
- completed_at BIGINT,
- trigger_type VARCHAR(50),
- trigger_source VARCHAR(255),
- INDEX idx_workflow_executions_status_namespace (status, namespace),
- INDEX idx_workflow_executions_workflow_name (workflow_name)
-);
-
--- Workflow steps
-CREATE TABLE workflow_steps (
- id VARCHAR(255) PRIMARY KEY,
- execution_id VARCHAR(255) NOT NULL,
- step_name VARCHAR(255) NOT NULL,
- step_order INT NOT NULL,
- status VARCHAR(50) NOT NULL,
- namespace VARCHAR(255) NOT NULL,
- input_data LONGBLOB,
- output_data LONGBLOB,
- error_message TEXT,
- max_attempts INT NOT NULL,
- remaining_attempts INT NOT NULL,
- started_at BIGINT,
- completed_at BIGINT,
- UNIQUE KEY unique_execution_step (execution_id, step_name),
- INDEX idx_workflow_steps_execution_id (execution_id)
-);
-
--- Leases for coordination
-CREATE TABLE leases (
- resource_id VARCHAR(255) PRIMARY KEY,
- kind VARCHAR(50) NOT NULL,
- namespace VARCHAR(255) NOT NULL,
- worker_id VARCHAR(255) NOT NULL,
- acquired_at BIGINT NOT NULL,
- expires_at BIGINT NOT NULL,
- heartbeat_at BIGINT NOT NULL,
- INDEX idx_leases_expires_at (expires_at),
- INDEX idx_leases_worker_id (worker_id)
-);
-
--- Cron jobs
-CREATE TABLE cron_jobs (
- id VARCHAR(255) PRIMARY KEY,
- name VARCHAR(255) NOT NULL,
- cron_spec VARCHAR(255) NOT NULL,
- namespace VARCHAR(255) NOT NULL,
- workflow_name VARCHAR(255),
- enabled BOOLEAN NOT NULL DEFAULT TRUE,
- created_at BIGINT NOT NULL,
- updated_at BIGINT NOT NULL,
- next_run_at BIGINT NOT NULL,
- UNIQUE KEY unique_namespace_name (namespace, name),
- INDEX idx_cron_jobs_next_run_at (next_run_at, enabled)
-);
-```
-
-## Performance Considerations
-
-### Scaling Workers
-- **Horizontal scaling**: Add more worker instances
-- **Vertical scaling**: Increase concurrency per worker
-- **Database optimization**: Ensure proper indexing and connection pooling
-
-### Optimizing Workflows
-- **Idempotent steps**: Ensure steps can be safely retried
-- **Minimize step payload size**: Reduce serialization overhead
-- **Batch operations**: Combine multiple operations in single steps
-- **Use appropriate timeouts**: Balance responsiveness vs. reliability
-
-### Database Tuning
-```sql
--- Recommended indexes for performance
-CREATE INDEX idx_workflow_executions_polling
-ON workflow_executions (status, namespace, created_at);
-
-CREATE INDEX idx_leases_cleanup
-ON leases (expires_at);
-
-CREATE INDEX idx_workflow_steps_execution_order
-ON workflow_steps (execution_id, step_order);
-```
-
-## Best Practices
-
-### Workflow Design
-- ✅ **Keep workflows stateless** - Store state in steps, not workflow instances
-- ✅ **Make steps idempotent** - Steps should be safe to retry
-- ✅ **Use descriptive step names** - Names should be stable across deployments
-- ✅ **Handle errors gracefully** - Distinguish between retryable and permanent errors
-- ✅ **Minimize external dependencies** - Use timeouts and circuit breakers
-
-### Production Deployment
-- ✅ **Monitor metrics** - Set up alerts for error rates and latency
-- ✅ **Configure retries** - Set appropriate retry policies for your use case
-- ✅ **Database backup** - Ensure workflow state is backed up
-- ✅ **Graceful shutdown** - Handle SIGTERM to finish active workflows
-- ✅ **Resource limits** - Set memory and CPU limits for workers
-
-## Examples
-
-### Order Processing Workflow
-```go
-type OrderWorkflow struct {
- paymentService PaymentService
- inventoryService InventoryService
- shippingService ShippingService
-}
-
-func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error {
- // Validate and charge payment
- payment, err := hydra.Step(ctx, "process-payment", func(stepCtx context.Context) (*Payment, error) {
- return w.paymentService.ProcessPayment(stepCtx, &PaymentRequest{
- Amount: req.TotalAmount,
- Method: req.PaymentMethod,
- Customer: req.CustomerID,
- })
- })
- if err != nil {
- return err
- }
-
- // Reserve inventory
- reservation, err := hydra.Step(ctx, "reserve-inventory", func(stepCtx context.Context) (*Reservation, error) {
- return w.inventoryService.ReserveItems(stepCtx, req.Items)
- })
- if err != nil {
- // Refund payment on inventory failure
- hydra.Step(ctx, "refund-payment", func(stepCtx context.Context) (any, error) {
- return nil, w.paymentService.RefundPayment(stepCtx, payment.ID)
- })
- return err
- }
-
- // Create shipping label
- _, err = hydra.Step(ctx, "create-shipping", func(stepCtx context.Context) (*ShippingLabel, error) {
- return w.shippingService.CreateLabel(stepCtx, &ShippingRequest{
- Address: req.ShippingAddress,
- Items: req.Items,
- Reservation: reservation.ID,
- })
- })
-
- return err
-}
-```
-
-### Approval Workflow with Sleep
-```go
-func (w *ApprovalWorkflow) Run(ctx hydra.WorkflowContext, req *ApprovalRequest) error {
- // Submit for review
- _, err := hydra.Step(ctx, "submit-review", func(stepCtx context.Context) (*Review, error) {
- return w.reviewService.SubmitForReview(stepCtx, req)
- })
- if err != nil {
- return err
- }
-
- // Sleep for 48 hours to allow manual review
- err = hydra.Sleep(ctx, 48*time.Hour)
- if err != nil {
- return err
- }
-
- // Check approval status
- approval, err := hydra.Step(ctx, "check-approval", func(stepCtx context.Context) (*Approval, error) {
- return w.reviewService.GetApprovalStatus(stepCtx, req.ID)
- })
- if err != nil {
- return err
- }
-
- if approval.Status == "approved" {
- // Process approved request
- _, err = hydra.Step(ctx, "process-approved", func(stepCtx context.Context) (any, error) {
- return nil, w.processApprovedRequest(stepCtx, req)
- })
- }
-
- return err
-}
-```
-
-## Contributing
-
-We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.
-
-## License
-
-This project is licensed under the MIT License - see the [LICENSE](../../LICENSE) file for details.
-
----
-
-**Need help?** Check out our [documentation](https://docs.unkey.com) or join our [Discord community](https://discord.gg/unkey).
\ No newline at end of file
diff --git a/go/pkg/hydra/circuit_breaker_test.go b/go/pkg/hydra/circuit_breaker_test.go
deleted file mode 100644
index aa575ab22f..0000000000
--- a/go/pkg/hydra/circuit_breaker_test.go
+++ /dev/null
@@ -1,96 +0,0 @@
-package hydra
-
-import (
- "context"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
-)
-
-// TestCircuitBreakerIntegration verifies that circuit breakers are properly
-// integrated into the worker and protect database operations
-func TestCircuitBreakerIntegration(t *testing.T) {
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- // Create worker with circuit breaker protection
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: "circuit-breaker-test-worker",
- Concurrency: 1,
- PollInterval: 100 * time.Millisecond,
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- require.NoError(t, err)
-
- // Register a simple workflow
- circuitTestWorkflow := &circuitBreakerTestWorkflow{
- engine: engine,
- name: "circuit-breaker-workflow",
- }
-
- err = RegisterWorkflow(worker, circuitTestWorkflow)
- require.NoError(t, err)
-
- ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
- defer cancel()
-
- // Start worker
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- // Create a workflow to test circuit breaker protection
- executionID, err := circuitTestWorkflow.Start(ctx, struct{}{})
- require.NoError(t, err)
- require.NotEmpty(t, executionID)
-
- // Advance time to trigger worker polling
- for i := 0; i < 5; i++ {
- testClock.Tick(200 * time.Millisecond)
- time.Sleep(10 * time.Millisecond)
- }
-
- // Verify workflow was processed (circuit breaker didn't block)
- finalWorkflow := waitForWorkflowCompletion(t, engine, executionID, 3*time.Second)
- require.NotNil(t, finalWorkflow)
-
-}
-
-// TestCircuitBreakerCompilation ensures the circuit breaker types compile correctly
-func TestCircuitBreakerCompilation(t *testing.T) {
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- // This test primarily ensures compilation works
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: "compilation-test-worker",
- Concurrency: 1,
- })
- require.NoError(t, err)
- require.NotNil(t, worker)
-
-}
-
-// circuitBreakerTestWorkflow is a minimal workflow for testing circuit breaker integration
-type circuitBreakerTestWorkflow struct {
- engine *Engine
- name string
-}
-
-func (w *circuitBreakerTestWorkflow) Name() string {
- return w.name
-}
-
-func (w *circuitBreakerTestWorkflow) Run(ctx WorkflowContext, req any) error {
- _, err := Step(ctx, "circuit-breaker-step", func(context.Context) (string, error) {
- return "protected", nil
- })
- return err
-}
-
-func (w *circuitBreakerTestWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/complex_workflows_test.go b/go/pkg/hydra/complex_workflows_test.go
deleted file mode 100644
index c391d49cc6..0000000000
--- a/go/pkg/hydra/complex_workflows_test.go
+++ /dev/null
@@ -1,535 +0,0 @@
-package hydra
-
-import (
- "context"
- "fmt"
- "math/rand"
- "sync"
- "sync/atomic"
- "time"
-)
-
-// ComplexBillingWorkflow simulates a realistic billing workflow with multiple steps,
-// error handling, retries, and conditional logic
-type ComplexBillingWorkflow struct {
- engine *Engine
- name string
- failureRate float64 // Probability of step failure (0.0-1.0)
- chaosEnabled bool
- metrics *WorkflowMetrics
-}
-
-// WorkflowMetrics tracks detailed execution metrics
-type WorkflowMetrics struct {
- StepsExecuted atomic.Int64
- StepsRetried atomic.Int64
- StepsFailed atomic.Int64
- WorkflowsCompleted atomic.Int64
- WorkflowsFailed atomic.Int64
- TotalDuration atomic.Int64 // in milliseconds
- mu sync.RWMutex
- StepDurations map[string][]time.Duration
-}
-
-func NewWorkflowMetrics() *WorkflowMetrics {
- return &WorkflowMetrics{
- StepDurations: make(map[string][]time.Duration),
- }
-}
-
-func (m *WorkflowMetrics) RecordStepDuration(stepName string, duration time.Duration) {
- m.mu.Lock()
- defer m.mu.Unlock()
- m.StepDurations[stepName] = append(m.StepDurations[stepName], duration)
-}
-
-func (w *ComplexBillingWorkflow) Name() string {
- return w.name
-}
-
-func (w *ComplexBillingWorkflow) Run(ctx WorkflowContext, req any) error {
- startTime := time.Now()
- defer func() {
- w.metrics.TotalDuration.Add(time.Since(startTime).Milliseconds())
- }()
-
- // Step 1: Validate customer data
- customerID, err := Step(ctx, "validate-customer", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- if w.shouldFail("validate-customer") {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("customer validation failed")
- }
-
- // Simulate API call
- time.Sleep(time.Duration(rand.Intn(50)+10) * time.Millisecond)
- return "customer-123", nil
- })
-
- if err != nil {
- // Retry with exponential backoff
- w.metrics.StepsRetried.Add(1)
- time.Sleep(100 * time.Millisecond)
-
- customerID, err = Step(ctx, "validate-customer-retry", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
- time.Sleep(time.Duration(rand.Intn(30)+20) * time.Millisecond)
- return "customer-123", nil
- })
-
- if err != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("customer validation failed after retry: %w", err)
- }
- }
-
- // Step 2: Calculate invoice amount (parallel with usage fetch)
- var invoiceAmount float64
-
- // Use goroutines to simulate parallel step execution
- var wg sync.WaitGroup
- var calcErr, usageErr error
-
- wg.Add(2)
-
- // Calculate invoice in parallel
- go func() {
- defer wg.Done()
- var amountStr string
- amountStr, calcErr = Step(ctx, "calculate-invoice", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- if w.shouldFail("calculate-invoice") {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("invoice calculation error")
- }
-
- // Simulate complex calculation
- time.Sleep(time.Duration(rand.Intn(100)+50) * time.Millisecond)
- amount := float64(rand.Intn(10000)+100) / 100.0
- return fmt.Sprintf("%.2f", amount), nil
- })
-
- if calcErr == nil {
- fmt.Sscanf(amountStr, "%f", &invoiceAmount)
- }
- err = calcErr
- }()
-
- // Fetch usage data in parallel
- go func() {
- defer wg.Done()
- _, fetchErr := Step(ctx, "fetch-usage-data", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- if w.shouldFail("fetch-usage-data") {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("usage data fetch failed")
- }
-
- // Simulate database query
- time.Sleep(time.Duration(rand.Intn(80)+30) * time.Millisecond)
- return fmt.Sprintf("usage-%d-units", rand.Intn(1000)), nil
- })
-
- usageErr = fetchErr
- }()
-
- wg.Wait()
-
- if calcErr != nil || usageErr != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("parallel steps failed: calc=%v, usage=%v", calcErr, usageErr)
- }
-
- // Step 3: Apply discounts (conditional)
- if invoiceAmount > 100 {
- discountedAmount, discountErr := Step(ctx, "apply-discounts", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- if w.shouldFail("apply-discounts") {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("discount calculation failed")
- }
-
- // Simulate discount calculation
- time.Sleep(time.Duration(rand.Intn(40)+10) * time.Millisecond)
- discount := invoiceAmount * 0.1
- return fmt.Sprintf("%.2f", invoiceAmount-discount), nil
- })
-
- if discountErr == nil {
- fmt.Sscanf(discountedAmount, "%f", &invoiceAmount)
- }
- }
-
- // Step 4: Generate PDF invoice
- _, err = Step(ctx, "generate-pdf", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- if w.shouldFail("generate-pdf") {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("PDF generation failed")
- }
-
- // Simulate PDF generation (slow operation)
- time.Sleep(time.Duration(rand.Intn(200)+100) * time.Millisecond)
- return fmt.Sprintf("https://invoices.example.com/%s.pdf", customerID), nil
- })
-
- if err != nil {
- // Non-critical failure, continue
- // PDF generation is optional
- _ = err // Intentionally ignored
- }
-
- // Step 5: Send invoice email
- _, err = Step(ctx, "send-email", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- if w.shouldFail("send-email") {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("email sending failed")
- }
-
- // Simulate email API call
- time.Sleep(time.Duration(rand.Intn(60)+20) * time.Millisecond)
- return fmt.Sprintf("email-sent-to-%s", customerID), nil
- })
-
- if err != nil {
- // Retry email sending
- w.metrics.StepsRetried.Add(1)
- _, retryErr := Step(ctx, "send-email-retry", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
- time.Sleep(time.Duration(rand.Intn(40)+20) * time.Millisecond)
- return "email-sent-on-retry", nil
- })
-
- if retryErr != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("email sending failed after retry: %w", retryErr)
- }
- }
-
- // Step 6: Update billing status
- _, err = Step(ctx, "update-billing-status", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- if w.shouldFail("update-billing-status") {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("status update failed")
- }
-
- // Simulate database update
- time.Sleep(time.Duration(rand.Intn(30)+10) * time.Millisecond)
- return "status-updated", nil
- })
-
- if err != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("billing status update failed: %w", err)
- }
-
- w.metrics.WorkflowsCompleted.Add(1)
- return nil
-}
-
-func (w *ComplexBillingWorkflow) shouldFail(stepName string) bool {
- if !w.chaosEnabled {
- return false
- }
-
- // Introduce targeted chaos for specific steps
- failureRates := map[string]float64{
- "validate-customer": w.failureRate * 0.5, // Less likely to fail
- "calculate-invoice": w.failureRate,
- "fetch-usage-data": w.failureRate * 1.2, // More likely to fail
- "generate-pdf": w.failureRate * 2.0, // Much more likely to fail
- "send-email": w.failureRate * 1.5,
- "update-billing-status": w.failureRate * 0.8,
- }
-
- rate, ok := failureRates[stepName]
- if !ok {
- rate = w.failureRate
- }
-
- return rand.Float64() < rate
-}
-
-func (w *ComplexBillingWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// ComplexDataPipelineWorkflow simulates a data processing pipeline with
-// conditional branching, loops, and complex error handling
-type ComplexDataPipelineWorkflow struct {
- engine *Engine
- name string
- chaosEnabled bool
- metrics *WorkflowMetrics
-}
-
-func (w *ComplexDataPipelineWorkflow) Name() string {
- return w.name
-}
-
-func (w *ComplexDataPipelineWorkflow) Run(ctx WorkflowContext, req any) error {
- // Step 1: Fetch data sources
- sources, err := Step(ctx, "fetch-data-sources", func(stepCtx context.Context) ([]string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- // Simulate fetching multiple data sources
- time.Sleep(time.Duration(rand.Intn(50)+20) * time.Millisecond)
-
- numSources := rand.Intn(5) + 3
- sources := make([]string, numSources)
- for i := 0; i < numSources; i++ {
- sources[i] = fmt.Sprintf("source-%d", i)
- }
- return sources, nil
- })
-
- if err != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("failed to fetch data sources: %w", err)
- }
-
- // Step 2: Process each source (loop with error handling)
- var processedCount int
- for i, source := range sources {
- stepName := fmt.Sprintf("process-source-%d", i)
-
- _, stepErr := Step(ctx, stepName, func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- // Simulate processing with variable duration
- processingTime := time.Duration(rand.Intn(100)+50) * time.Millisecond
- time.Sleep(processingTime)
-
- // Random failures
- if w.chaosEnabled && rand.Float64() < 0.2 {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("processing failed for %s", source)
- }
-
- return fmt.Sprintf("processed-%s", source), nil
- })
-
- if stepErr != nil {
- // Continue processing other sources
- continue
- }
- processedCount++
- }
-
- // Step 3: Validate processing results
- if processedCount < len(sources)/2 {
- // Too many failures, trigger cleanup
- _, cleanupErr := Step(ctx, "cleanup-failed-processing", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
- time.Sleep(50 * time.Millisecond)
- return "cleanup-complete", nil
- })
-
- if cleanupErr != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("cleanup failed: %w", cleanupErr)
- }
-
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("too many source processing failures: %d/%d", processedCount, len(sources))
- }
-
- // Step 4: Aggregate results
- _, err = Step(ctx, "aggregate-results", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- // Simulate complex aggregation
- time.Sleep(time.Duration(rand.Intn(150)+100) * time.Millisecond)
-
- if w.chaosEnabled && rand.Float64() < 0.1 {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("aggregation failed")
- }
-
- return fmt.Sprintf("aggregated-%d-results", processedCount), nil
- })
-
- if err != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("result aggregation failed: %w", err)
- }
-
- // Step 5: Publish results (with circuit breaker pattern)
- var publishAttempts int
- for publishAttempts < 3 {
- _, err = Step(ctx, fmt.Sprintf("publish-attempt-%d", publishAttempts), func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
- publishAttempts++
-
- // Simulate flaky external service
- if w.chaosEnabled && rand.Float64() < 0.4 {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("publish service unavailable")
- }
-
- time.Sleep(time.Duration(rand.Intn(80)+40) * time.Millisecond)
- return "published-successfully", nil
- })
-
- if err == nil {
- break
- }
-
- // Exponential backoff
- w.metrics.StepsRetried.Add(1)
- time.Sleep(time.Duration(publishAttempts*100) * time.Millisecond)
- }
-
- if err != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("failed to publish after %d attempts: %w", publishAttempts, err)
- }
-
- w.metrics.WorkflowsCompleted.Add(1)
- return nil
-}
-
-func (w *ComplexDataPipelineWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// ComplexStateMachineWorkflow tests complex state transitions and decision points
-type ComplexStateMachineWorkflow struct {
- engine *Engine
- name string
- chaosEnabled bool
- metrics *WorkflowMetrics
-}
-
-func (w *ComplexStateMachineWorkflow) Name() string {
- return w.name
-}
-
-func (w *ComplexStateMachineWorkflow) Run(ctx WorkflowContext, req any) error {
- // Initialize with random state
- initialState := rand.Intn(3)
-
- // Step 1: Determine initial action based on state
- action, err := Step(ctx, "determine-initial-action", func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- actions := []string{"process", "review", "escalate"}
- return actions[initialState], nil
- })
-
- if err != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return err
- }
-
- // Step 2: Execute state machine transitions
- currentState := action
- transitions := 0
- maxTransitions := 10
-
- for transitions < maxTransitions {
- nextState, transitionErr := Step(ctx, fmt.Sprintf("transition-%d-from-%s", transitions, currentState),
- func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- // Simulate state transition logic
- time.Sleep(time.Duration(rand.Intn(50)+20) * time.Millisecond)
-
- // Random transition failures
- if w.chaosEnabled && rand.Float64() < 0.15 {
- w.metrics.StepsFailed.Add(1)
- return "", fmt.Errorf("transition failed from %s", currentState)
- }
-
- // State transition rules
- switch currentState {
- case "process":
- if rand.Float64() < 0.7 {
- return "review", nil
- }
- return "escalate", nil
- case "review":
- if rand.Float64() < 0.5 {
- return "approve", nil
- } else if rand.Float64() < 0.8 {
- return "reject", nil
- }
- return "process", nil
- case "escalate":
- if rand.Float64() < 0.6 {
- return "review", nil
- }
- return "terminate", nil
- case "approve", "reject", "terminate":
- return currentState, nil // Terminal states
- default:
- return "error", nil
- }
- })
-
- if transitionErr != nil {
- // Handle transition failure
- _, recoveryErr := Step(ctx, fmt.Sprintf("recover-transition-%d", transitions),
- func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
- w.metrics.StepsRetried.Add(1)
- time.Sleep(30 * time.Millisecond)
- return "review", nil // Safe state
- })
-
- if recoveryErr != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("state machine recovery failed: %w", recoveryErr)
- }
- nextState = "review"
- }
-
- currentState = nextState
- transitions++
-
- // Check for terminal states
- if currentState == "approve" || currentState == "reject" || currentState == "terminate" {
- break
- }
- }
-
- // Step 3: Finalize based on terminal state
- _, err = Step(ctx, fmt.Sprintf("finalize-%s", currentState), func(stepCtx context.Context) (string, error) {
- w.metrics.StepsExecuted.Add(1)
-
- switch currentState {
- case "approve":
- time.Sleep(80 * time.Millisecond)
- return "approved-and-processed", nil
- case "reject":
- time.Sleep(40 * time.Millisecond)
- return "rejected-and-notified", nil
- case "terminate":
- time.Sleep(20 * time.Millisecond)
- return "terminated-with-cleanup", nil
- default:
- return "", fmt.Errorf("invalid terminal state: %s", currentState)
- }
- })
-
- if err != nil {
- w.metrics.WorkflowsFailed.Add(1)
- return fmt.Errorf("finalization failed: %w", err)
- }
-
- w.metrics.WorkflowsCompleted.Add(1)
- return nil
-}
-
-func (w *ComplexStateMachineWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/cron.go b/go/pkg/hydra/cron.go
deleted file mode 100644
index 4477f55421..0000000000
--- a/go/pkg/hydra/cron.go
+++ /dev/null
@@ -1,229 +0,0 @@
-package hydra
-
-import (
- "context"
- "encoding/json"
- "errors"
- "fmt"
- "strconv"
- "strings"
- "time"
-)
-
-// CronHandler defines the function signature for cron job handlers
-type CronHandler func(ctx context.Context, payload CronPayload) error
-
-type CronPayload struct {
- CronJobID string `json:"cron_job_id"`
- CronName string `json:"cron_name"`
- ScheduledAt int64 `json:"scheduled_at"` // When this execution was scheduled
- ActualRunAt int64 `json:"actual_run_at"` // When it actually ran
- Namespace string `json:"namespace"`
-}
-
-func (p CronPayload) Marshal() ([]byte, error) {
- return json.Marshal(p)
-}
-
-func (p *CronPayload) Unmarshal(data []byte) error {
- return json.Unmarshal(data, p)
-}
-
-func calculateNextRun(cronSpec string, from time.Time) int64 {
- schedule, err := parseCronSpec(cronSpec)
- if err != nil {
- return from.Add(5 * time.Minute).UnixMilli()
- }
-
- next := schedule.next(from)
- return next.UnixMilli()
-}
-
-type cronSchedule struct {
- minute uint64 // bits 0-59
- hour uint64 // bits 0-23
- dom uint64 // bits 1-31, day of month
- month uint64 // bits 1-12
- dow uint64 // bits 0-6, day of week (0=Sunday)
-}
-
-func parseCronSpec(spec string) (*cronSchedule, error) {
- fields := strings.Fields(spec)
- if len(fields) != 5 {
- return nil, errors.New("cron spec must have 5 fields")
- }
-
- minute, err := parseField(fields[0], 0, 59)
- if err != nil {
- return nil, fmt.Errorf("invalid minute field: %w", err)
- }
-
- hour, err := parseField(fields[1], 0, 23)
- if err != nil {
- return nil, fmt.Errorf("invalid hour field: %w", err)
- }
-
- dom, err := parseField(fields[2], 1, 31)
- if err != nil {
- return nil, fmt.Errorf("invalid day of month field: %w", err)
- }
-
- month, err := parseField(fields[3], 1, 12)
- if err != nil {
- return nil, fmt.Errorf("invalid month field: %w", err)
- }
-
- dow, err := parseField(fields[4], 0, 6)
- if err != nil {
- return nil, fmt.Errorf("invalid day of week field: %w", err)
- }
-
- return &cronSchedule{
- minute: minute,
- hour: hour,
- dom: dom,
- month: month,
- dow: dow,
- }, nil
-}
-
-func parseField(field string, minimum, maximum int) (uint64, error) {
- if field == "*" {
- var mask uint64
- for i := minimum; i <= maximum; i++ {
- mask |= 1 << i
- }
- return mask, nil
- }
-
- parts := strings.Split(field, ",")
- var mask uint64
-
- for _, part := range parts {
- // nolint:nestif
- if strings.Contains(part, "/") {
- stepParts := strings.Split(part, "/")
- if len(stepParts) != 2 {
- return 0, errors.New("invalid step syntax")
- }
-
- step, err := strconv.Atoi(stepParts[1])
- if err != nil || step <= 0 {
- return 0, errors.New("invalid step value")
- }
-
- rangeStart := minimum
- rangeEnd := maximum
-
- if stepParts[0] != "*" {
- if strings.Contains(stepParts[0], "-") {
- rangeParts := strings.Split(stepParts[0], "-")
- if len(rangeParts) != 2 {
- return 0, errors.New("invalid range syntax")
- }
- rangeStart, err = strconv.Atoi(rangeParts[0])
- if err != nil || rangeStart < minimum || rangeStart > maximum {
- return 0, errors.New("invalid range start")
- }
- rangeEnd, err = strconv.Atoi(rangeParts[1])
- if err != nil || rangeEnd < minimum || rangeEnd > maximum {
- return 0, errors.New("invalid range end")
- }
- } else {
- rangeStart, err = strconv.Atoi(stepParts[0])
- if err != nil || rangeStart < minimum || rangeStart > maximum {
- return 0, errors.New("invalid step start value")
- }
- rangeEnd = rangeStart
- }
- }
-
- for i := rangeStart; i <= rangeEnd; i += step {
- mask |= 1 << i
- }
-
- } else if strings.Contains(part, "-") {
- rangeParts := strings.Split(part, "-")
- if len(rangeParts) != 2 {
- return 0, errors.New("invalid range syntax")
- }
-
- start, err := strconv.Atoi(rangeParts[0])
- if err != nil || start < minimum || start > maximum {
- return 0, errors.New("invalid range start")
- }
-
- end, err := strconv.Atoi(rangeParts[1])
- if err != nil || end < minimum || end > maximum {
- return 0, errors.New("invalid range end")
- }
-
- for i := start; i <= end; i++ {
- mask |= 1 << i
- }
-
- } else {
- val, err := strconv.Atoi(part)
- if err != nil || val < minimum || val > maximum {
- return 0, errors.New("invalid single value")
- }
- mask |= 1 << val
- }
- }
-
- return mask, nil
-}
-
-func (s *cronSchedule) next(t time.Time) time.Time {
- next := t.Add(time.Minute).Truncate(time.Minute)
-
- end := t.Add(4 * 365 * 24 * time.Hour) // 4 years
-
- for next.Before(end) {
- if s.matches(next) {
- return next
- }
-
- next = next.Add(time.Minute)
- }
-
- return t.Add(365 * 24 * time.Hour)
-}
-
-func (s *cronSchedule) matches(t time.Time) bool {
- if s.minute&(1< 0 {
- count++
- }
- }
- return count
-}
-
-func (t *ConcurrentExecutionTracker) GetMissingWorkflows(allWorkflowIDs []string) []string {
- t.mu.Lock()
- defer t.mu.Unlock()
-
- var missing []string
- for _, workflowID := range allWorkflowIDs {
- if record, exists := t.executions[workflowID]; !exists || record.ExecutionCount == 0 {
- missing = append(missing, workflowID)
- }
- }
- return missing
-}
-
-func (t *ConcurrentExecutionTracker) AnalyzeResults(testCtx *testing.T) ConsistencyResults {
- t.mu.Lock()
- defer t.mu.Unlock()
-
- results := ConsistencyResults{}
-
- for workflowID, record := range t.executions {
- if record.ExecutionCount > 0 {
- results.WorkflowsExecuted++
- }
-
- if record.ExecutionCount > 1 {
- results.DuplicateExecutions++
- testCtx.Errorf("DUPLICATE EXECUTION: Workflow %s executed %d times by workers %v",
- workflowID, record.ExecutionCount, record.WorkerIDs)
- }
-
- if record.Failed {
- results.FailedWorkflows++
- }
-
- // Detect race conditions (multiple workers starting execution within 100ms)
- if len(record.Timestamps) > 1 {
- for i := 1; i < len(record.Timestamps); i++ {
- if record.Timestamps[i].Sub(record.Timestamps[i-1]) < 100*time.Millisecond {
- results.RaceConditions++
- testCtx.Errorf("RACE CONDITION: Workflow %s had concurrent executions by %v",
- workflowID, record.WorkerIDs)
- break
- }
- }
- }
- }
-
- return results
-}
-
-// consistencyTestWorkflow tracks executions to detect consistency violations
-type consistencyTestWorkflow struct {
- engine *Engine
- name string
- tracker *ConcurrentExecutionTracker
-}
-
-func (w *consistencyTestWorkflow) Name() string {
- return w.name
-}
-
-func (w *consistencyTestWorkflow) Run(ctx WorkflowContext, req any) error {
- workflowID := ctx.ExecutionID()
-
- // Record that this workflow started executing
- w.tracker.RecordExecution(workflowID, "unknown-worker") // We could get worker ID from context
-
- // Simulate some work with a step
- _, err := Step(ctx, "consistency-step", func(context.Context) (string, error) {
- // Small delay to increase chance of race conditions
- time.Sleep(10 * time.Millisecond)
- return "consistent", nil
- })
-
- // Record completion
- w.tracker.RecordCompletion(workflowID, err == nil)
-
- return err
-}
-
-func (w *consistencyTestWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// raceConditionTestWorkflow creates multiple steps to test for race conditions
-type raceConditionTestWorkflow struct {
- engine *Engine
- name string
-}
-
-func (w *raceConditionTestWorkflow) Name() string {
- return w.name
-}
-
-func (w *raceConditionTestWorkflow) Run(ctx WorkflowContext, req any) error {
- // Create multiple steps that might race with each other
- const numSteps = 20
-
- // Use a WaitGroup to ensure all steps complete
- var wg sync.WaitGroup
- var stepErrors atomic.Int64
-
- for i := 0; i < numSteps; i++ {
- wg.Add(1)
- go func(stepIndex int) {
- defer wg.Done()
-
- stepName := fmt.Sprintf("race-step-%d", stepIndex)
- _, err := Step(ctx, stepName, func(context.Context) (string, error) {
- return fmt.Sprintf("result-%d", stepIndex), nil
- })
-
- if err != nil {
- stepErrors.Add(1)
- }
- }(i)
- }
-
- wg.Wait()
-
- if stepErrors.Load() > 0 {
- return fmt.Errorf("race condition test failed: %d step errors", stepErrors.Load())
- }
-
- return nil
-}
-
-func (w *raceConditionTestWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// transactionTestWorkflow tests database transaction integrity
-type transactionTestWorkflow struct {
- engine *Engine
- name string
-}
-
-func (w *transactionTestWorkflow) Name() string {
- return w.name
-}
-
-func (w *transactionTestWorkflow) Run(ctx WorkflowContext, req any) error {
- mode, ok := req.(string)
- if !ok {
- mode = "normal"
- }
-
- // Create a step that tests transaction boundaries
- _, err := Step(ctx, "transaction-step", func(stepCtx context.Context) (string, error) {
- switch mode {
- case "normal":
- return "transaction-success", nil
- case "error":
- return "", fmt.Errorf("simulated step error")
- default:
- return "unknown-mode", nil
- }
- })
-
- return err
-}
-
-func (w *transactionTestWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/debug_test.go b/go/pkg/hydra/debug_test.go
deleted file mode 100644
index 5d4f73df47..0000000000
--- a/go/pkg/hydra/debug_test.go
+++ /dev/null
@@ -1,74 +0,0 @@
-package hydra
-
-import (
- "context"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
-)
-
-// TestBasicWorkflowExecution tests the most basic workflow execution
-func TestBasicWorkflowExecution(t *testing.T) {
- realClock := clock.New()
- engine := newTestEngineWithClock(t, realClock)
-
- // Create a very simple workflow
- simpleWorkflow := &debugWorkflow{
- engine: engine,
- name: "debug-workflow",
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
-
- // Start worker
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: "debug-worker",
- Concurrency: 1,
- PollInterval: 100 * time.Millisecond,
- HeartbeatInterval: 2 * time.Second,
- ClaimTimeout: 10 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, simpleWorkflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- // Submit a single workflow
- workflowID, err := simpleWorkflow.Start(ctx, "test-payload")
- require.NoError(t, err)
- require.NotEmpty(t, workflowID)
-
- // Wait for completion
- finalWorkflow := waitForWorkflowCompletion(t, engine, workflowID, 8*time.Second)
- require.NotNil(t, finalWorkflow, "Workflow should complete")
-
-}
-
-type debugWorkflow struct {
- engine *Engine
- name string
-}
-
-func (w *debugWorkflow) Name() string {
- return w.name
-}
-
-func (w *debugWorkflow) Run(ctx WorkflowContext, req any) error {
- // Very simple step
- _, err := Step(ctx, "debug-step", func(context.Context) (string, error) {
- time.Sleep(50 * time.Millisecond)
- return "success", nil
- })
- return err
-}
-
-func (w *debugWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/doc.go b/go/pkg/hydra/doc.go
deleted file mode 100644
index 2e165266a8..0000000000
--- a/go/pkg/hydra/doc.go
+++ /dev/null
@@ -1,252 +0,0 @@
-// Package hydra provides a distributed workflow orchestration engine designed
-// for reliable execution of multi-step business processes at scale.
-//
-// Hydra implements the Temporal-style workflow pattern with durable execution,
-// automatic retries, checkpointing, and distributed coordination. It supports
-// both simple sequential workflows and complex long-running processes with
-// sleep states, cron scheduling, and step-level fault tolerance.
-//
-// # Core Concepts
-//
-// Engine: The central orchestration component that manages workflow lifecycle,
-// worker coordination, and persistence. Each engine instance operates within
-// a specific namespace for tenant isolation.
-//
-// Workers: Distributed processing units that poll for pending workflows,
-// acquire leases for exclusive execution, and run workflow logic. Workers
-// support concurrent execution with configurable limits and automatic
-// heartbeat management.
-//
-// Workflows: Business logic containers that define a series of steps to be
-// executed. Workflows are stateless functions that can be suspended, resumed,
-// and retried while maintaining exactly-once execution guarantees.
-//
-// Steps: Individual units of work within a workflow. Steps support automatic
-// checkpointing, retry logic, and result caching to ensure idempotent execution
-// even across worker failures or restarts.
-//
-// # Key Features
-//
-// Exactly-Once Execution: Workflows and steps execute exactly once, even in
-// the presence of worker failures, network partitions, or duplicate deliveries.
-//
-// Durable State: All workflow state is persisted to a database, allowing
-// workflows to survive process restarts and infrastructure failures.
-//
-// Distributed Coordination: Multiple workers can safely operate on the same
-// workflow queue using lease-based coordination and circuit breaker protection.
-//
-// Comprehensive Observability: Built-in Prometheus metrics track workflow
-// throughput, latency, error rates, and system health across all components.
-//
-// Flexible Scheduling: Support for immediate execution, cron-based scheduling,
-// and workflow sleep states for time-based coordination.
-//
-// # Basic Usage
-//
-// Creating an engine and worker:
-//
-// // Create the engine with database DSN
-// engine, err := hydra.NewEngine(hydra.Config{
-// DSN: "user:password@tcp(localhost:3306)/hydra",
-// Namespace: "production",
-// Logger: logger,
-// })
-// if err != nil {
-// return err
-// }
-//
-// // Create and configure a worker
-// worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{
-// WorkerID: "worker-1",
-// Concurrency: 10,
-// PollInterval: 100 * time.Millisecond,
-// HeartbeatInterval: 30 * time.Second,
-// ClaimTimeout: 5 * time.Minute,
-// })
-//
-// Defining a workflow:
-//
-// type OrderWorkflow struct {
-// engine *hydra.Engine
-// }
-//
-// func (w *OrderWorkflow) Name() string {
-// return "order-processing"
-// }
-//
-// func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error {
-// // Step 1: Validate payment
-// payment, err := hydra.Step(ctx, "validate-payment", func(stepCtx context.Context) (*Payment, error) {
-// return validatePayment(stepCtx, req.PaymentID)
-// })
-// if err != nil {
-// return err
-// }
-//
-// // Step 2: Reserve inventory
-// reservation, err := hydra.Step(ctx, "reserve-inventory", func(stepCtx context.Context) (*Reservation, error) {
-// return reserveInventory(stepCtx, req.Items)
-// })
-// if err != nil {
-// return err
-// }
-//
-// // Step 3: Process order
-// _, err = hydra.Step(ctx, "process-order", func(stepCtx context.Context) (*Order, error) {
-// return processOrder(stepCtx, payment, reservation)
-// })
-//
-// return err
-// }
-//
-// Starting workflows:
-//
-// // Register the workflow with the worker
-// orderWorkflow := &OrderWorkflow{engine: engine}
-// err = hydra.RegisterWorkflow(worker, orderWorkflow)
-// if err != nil {
-// return err
-// }
-//
-// // Start the worker
-// ctx := context.Background()
-// err = worker.Start(ctx)
-// if err != nil {
-// return err
-// }
-// defer worker.Shutdown(ctx)
-//
-// // Submit a workflow for execution
-// request := &OrderRequest{
-// CustomerID: "cust_123",
-// Items: []Item{{SKU: "item_456", Quantity: 2}},
-// PaymentID: "pay_789",
-// }
-//
-// executionID, err := engine.StartWorkflow(ctx, "order-processing", request)
-// if err != nil {
-// return err
-// }
-//
-// fmt.Printf("Started workflow execution: %s\n", executionID)
-//
-// # Marshalling Options
-//
-// Hydra supports multiple marshalling formats for workflow payloads and step results:
-//
-// JSON Marshaller (Default):
-//
-// engine, err := hydra.NewEngine(hydra.Config{
-// Marshaller: hydra.NewJSONMarshaller(), // Default if not specified
-// // ... other config
-// })
-//
-// # Advanced Features
-//
-// Sleep States: Workflows can suspend execution and resume after a specified
-// duration, allowing for time-based coordination and human approval processes:
-//
-// // Sleep for 24 hours for manual approval
-// return hydra.Sleep(ctx, 24*time.Hour)
-//
-// Cron Scheduling: Register workflows to run on a schedule:
-//
-// err = engine.RegisterCron("0 0 * * *", "daily-report", func(ctx context.Context) error {
-// // Generate daily report
-// return generateDailyReport(ctx)
-// })
-//
-// Error Handling and Retries: Configure retry behavior at the workflow level:
-//
-// executionID, err := engine.StartWorkflow(ctx, "order-processing", request,
-// hydra.WithMaxAttempts(5),
-// hydra.WithRetryBackoff(2*time.Second),
-// hydra.WithTimeout(10*time.Minute),
-// )
-//
-// # Observability
-//
-// Hydra provides comprehensive Prometheus metrics out of the box:
-//
-// Workflow Metrics:
-// - hydra_workflows_started_total: Total workflows started
-// - hydra_workflows_completed_total: Total workflows completed/failed
-// - hydra_workflow_duration_seconds: Workflow execution time
-// - hydra_workflow_queue_time_seconds: Time spent waiting for execution
-// - hydra_workflows_active: Currently running workflows per worker
-//
-// Step Metrics:
-// - hydra_steps_executed_total: Total steps executed with status
-// - hydra_step_duration_seconds: Individual step execution time
-// - hydra_steps_cached_total: Steps served from checkpoint cache
-// - hydra_steps_retried_total: Step retry attempts
-//
-// Worker Metrics:
-// - hydra_worker_polls_total: Worker polling operations
-// - hydra_worker_heartbeats_total: Worker heartbeat operations
-// - hydra_lease_acquisitions_total: Workflow lease acquisitions
-// - hydra_worker_concurrency_current: Current workflow concurrency per worker
-//
-// Error and Performance Metrics:
-// - hydra_errors_total: Categorized error counts
-// - hydra_payload_size_bytes: Workflow and step payload sizes
-// - hydra_db_operations_total: Database operation counts and latency
-//
-// All metrics include rich labels for namespace, workflow names, worker IDs,
-// and status information, enabling detailed monitoring and alerting.
-//
-// # Architecture
-//
-// Hydra uses a lease-based coordination model to ensure exactly-once execution:
-//
-// 1. Workers poll the database for pending workflows in their namespace
-// 2. Workers attempt to acquire exclusive leases on available workflows
-// 3. Successful lease holders execute the workflow logic
-// 4. Workers send periodic heartbeats to maintain lease ownership
-// 5. Completed workflows update their status and release the lease
-// 6. Failed workers automatically lose their leases after timeout
-//
-// This design provides fault tolerance without requiring complex consensus
-// protocols or external coordination services.
-//
-// # Database Schema
-//
-// Hydra requires the following database tables:
-//
-// - workflow_executions: Stores workflow state, status, and metadata
-// - workflow_steps: Tracks individual step execution and results
-// - leases: Manages worker coordination and exclusive access
-// - cron_jobs: Stores scheduled workflow definitions
-//
-// The schema should be created using the provided schema.sql file.
-//
-// # Error Handling
-//
-// Hydra distinguishes between different types of errors:
-//
-// Transient Errors: Network timeouts, temporary database failures, etc.
-// These trigger automatic retries based on the configured retry policy.
-//
-// Permanent Errors: Validation failures, business logic errors, etc.
-// These immediately fail the workflow without retries.
-//
-// Workflow Suspension: Controlled suspension using Sleep() for time-based
-// coordination or external event waiting.
-//
-// # Performance Considerations
-//
-// - Workers use circuit breakers to prevent cascading failures
-// - Database queries are optimized with appropriate indexes
-// - Lease timeouts prevent stuck workflows from blocking execution
-// - Configurable concurrency limits prevent resource exhaustion
-// - Built-in connection pooling and retry logic for database operations
-//
-// # Thread Safety
-//
-// All Hydra components are thread-safe and designed for concurrent access:
-// - Multiple workers can safely operate on the same workflow queue
-// - Step execution is atomic and isolated using database transactions
-// - Workflow state updates use optimistic locking to prevent race conditions
-// - Metrics collection is thread-safe and non-blocking
-package hydra
diff --git a/go/pkg/hydra/engine.go b/go/pkg/hydra/engine.go
deleted file mode 100644
index 5130ec5e51..0000000000
--- a/go/pkg/hydra/engine.go
+++ /dev/null
@@ -1,318 +0,0 @@
-package hydra
-
-import (
- "context"
- "database/sql"
- "fmt"
- "time"
-
- "github.com/unkeyed/unkey/go/pkg/assert"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra/metrics"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/otel/logging"
- "github.com/unkeyed/unkey/go/pkg/otel/tracing"
- "github.com/unkeyed/unkey/go/pkg/retry"
- "github.com/unkeyed/unkey/go/pkg/uid"
- "go.opentelemetry.io/otel/attribute"
-
- // MySQL driver
- _ "github.com/go-sql-driver/mysql"
-)
-
-// Config holds the configuration for creating a new Engine instance.
-//
-// All fields except Store are optional and will use sensible defaults
-// if not provided.
-type Config struct {
- // DSN is the database connection string for MySQL.
- // This field is required and cannot be empty.
- // The engine will create an SQLC store from this connection.
- DSN string
-
- // Namespace provides tenant isolation for workflows. All workflows
- // created by this engine will be scoped to this namespace.
- // Defaults to "default" if not specified.
- Namespace string
-
- // Clock provides time-related operations for testing and scheduling.
- // Defaults to a real clock implementation if not specified.
- Clock clock.Clock
-
- // Logger handles structured logging for the engine operations.
- // Defaults to a no-op logger if not specified.
- Logger logging.Logger
-
- // Marshaller handles serialization of workflow payloads and step results.
- // Defaults to JSON marshalling if not specified.
- Marshaller Marshaller
-}
-
-// NewConfig creates a default config with sensible defaults.
-//
-// The returned config uses:
-// - "default" namespace
-// - Real clock implementation
-// - All other fields will be set to their defaults when passed to New()
-func NewConfig() Config {
- return Config{
- DSN: "",
- Namespace: "default",
- Clock: clock.New(),
- Logger: nil,
- Marshaller: nil,
- }
-}
-
-// Engine is the core workflow orchestration engine that manages workflow
-// lifecycle, coordination, and execution.
-//
-// The engine is responsible for:
-// - Starting new workflows and managing their state
-// - Coordinating workflow execution across multiple workers
-// - Handling cron-based scheduled workflows
-// - Providing namespace isolation for multi-tenant deployments
-// - Recording metrics and managing observability
-//
-// Engine instances are thread-safe and can be shared across multiple
-// workers and goroutines.
-type Engine struct {
- db *sql.DB
- namespace string
- cronHandlers map[string]CronHandler
- clock clock.Clock
- logger logging.Logger
- marshaller Marshaller
-}
-
-// New creates a new Engine instance with the provided configuration.
-//
-// The engine will validate the configuration and apply defaults for
-// any missing optional fields. The Store field is required and the
-// function will panic if it is nil.
-//
-// Example:
-//
-// engine := hydra.New(hydra.Config{
-// Store: gormStore,
-// Namespace: "production",
-// Logger: logger,
-// })
-func New(config Config) (*Engine, error) {
-
- err := assert.All(
- assert.NotEmpty(config.DSN),
- assert.NotNil(config.Clock),
- assert.NotEmpty(config.Namespace),
- assert.NotNil(config.Logger),
- assert.NotNil(config.Marshaller),
- )
- if err != nil {
- return nil, err
- }
-
- var db *sql.DB
- err = retry.New(
- retry.Attempts(10),
- retry.Backoff(func(n int) time.Duration {
- return time.Duration(n) * time.Second
- }),
- ).Do(func() error {
- var openErr error
- db, openErr = sql.Open("mysql", config.DSN)
- if openErr != nil {
- config.Logger.Info("mysql not ready yet, retrying...", "error", openErr.Error())
- }
- return openErr
-
- })
-
- if err != nil {
- return nil, fmt.Errorf("hydra: failed to open database connection: %w", err)
- }
-
- err = retry.New(
- retry.Attempts(10),
- retry.Backoff(func(n int) time.Duration {
- return time.Duration(n) * time.Second
- }),
- ).Do(func() error {
- return db.Ping()
- })
- // Test the connection
- if err != nil {
- db.Close()
- return nil, fmt.Errorf("hydra: failed to ping database: %v", err)
- }
-
- return &Engine{
- db: db,
- namespace: config.Namespace,
- cronHandlers: make(map[string]CronHandler),
- clock: config.Clock,
- logger: config.Logger,
- marshaller: config.Marshaller,
- }, nil
-}
-
-// GetNamespace returns the namespace for this engine instance.
-//
-// This method is primarily used by workers and internal components
-// to scope database operations to the correct tenant namespace.
-func (e *Engine) GetNamespace() string {
- return e.namespace
-}
-
-// GetDB returns the database connection for direct query usage
-func (e *Engine) GetDB() *sql.DB {
- return e.db
-}
-
-// RegisterCron registers a cron job with the given schedule and handler.
-//
-// The cronSpec follows standard cron syntax (e.g., "0 0 * * *" for daily at midnight).
-// The name must be unique within this engine's namespace. The handler will be
-// called according to the schedule.
-//
-// Example:
-//
-// err := engine.RegisterCron("0 */6 * * *", "cleanup-task", func(ctx context.Context) error {
-// return performCleanup(ctx)
-// })
-//
-// Returns an error if a cron job with the same name is already registered.
-func (e *Engine) RegisterCron(cronSpec, name string, handler CronHandler) error {
- if _, exists := e.cronHandlers[name]; exists {
- return fmt.Errorf("cron %q is already registered", name)
- }
-
- e.cronHandlers[name] = handler
-
- // Use new Query pattern instead of store abstraction
- return store.Query.CreateCronJob(context.Background(), e.db, store.CreateCronJobParams{
- ID: uid.New(uid.CronJobPrefix),
- Name: name,
- CronSpec: cronSpec,
- Namespace: e.namespace,
- WorkflowName: sql.NullString{String: "", Valid: false}, // Empty since this uses a handler, not a workflow
- Enabled: true,
- CreatedAt: e.clock.Now().UnixMilli(),
- UpdatedAt: e.clock.Now().UnixMilli(),
- LastRunAt: sql.NullInt64{Int64: 0, Valid: false},
- NextRunAt: calculateNextRun(cronSpec, e.clock.Now()),
- })
-}
-
-// StartWorkflow starts a new workflow execution with the given name and payload.
-//
-// This method creates a new workflow execution record in the database and makes
-// it available for workers to pick up and execute. The workflow will be queued
-// in a pending state until a worker acquires a lease and begins execution.
-//
-// Parameters:
-// - ctx: Context for the operation, which may include cancellation and timeouts
-// - workflowName: Must match the Name() method of a registered workflow type
-// - payload: The input data for the workflow, which will be serialized and stored
-// - opts: Optional configuration for retry behavior, timeouts, and trigger metadata
-//
-// Returns:
-// - executionID: A unique identifier for this workflow execution
-// - error: Any error that occurred during workflow creation
-//
-// The payload will be marshalled using the engine's configured marshaller (JSON by default)
-// and must be serializable. The workflow will be executed with the configured retry
-// policy and timeout settings.
-//
-// Example:
-//
-// executionID, err := engine.StartWorkflow(ctx, "order-processing", &OrderRequest{
-// CustomerID: "cust_123",
-// Items: []Item{{SKU: "item_456", Quantity: 2}},
-// }, hydra.WithMaxAttempts(5), hydra.WithTimeout(30*time.Minute))
-//
-// Metrics recorded:
-// - hydra_workflows_started_total (counter)
-// - hydra_workflows_queued (gauge)
-// - hydra_payload_size_bytes (histogram)
-func (e *Engine) StartWorkflow(ctx context.Context, workflowName string, payload any, opts ...WorkflowOption) (string, error) {
- // Start tracing span for workflow creation
- ctx, span := tracing.Start(ctx, "hydra.engine.StartWorkflow")
- defer span.End()
-
- executionID := uid.New(uid.WorkflowPrefix)
-
- span.SetAttributes(
- attribute.String("hydra.workflow.name", workflowName),
- attribute.String("hydra.execution.id", executionID),
- attribute.String("hydra.namespace", e.namespace),
- )
-
- config := &WorkflowConfig{
- MaxAttempts: 3, // Default to 3 attempts total (1 initial + 2 retries)
- TimeoutDuration: 1 * time.Hour,
- RetryBackoff: 1 * time.Second,
- TriggerType: store.WorkflowExecutionsTriggerTypeApi, // Default trigger type
- TriggerSource: nil,
- }
- for _, opt := range opts {
- opt(config)
- }
-
- span.SetAttributes(
- attribute.String("hydra.trigger.type", string(config.TriggerType)),
- )
-
- data, err := e.marshaller.Marshal(payload)
- if err != nil {
- metrics.SerializationErrorsTotal.WithLabelValues(e.namespace, workflowName, "input").Inc()
- tracing.RecordError(span, err)
- return "", fmt.Errorf("failed to marshal payload: %w", err)
- }
-
- // Record payload size
- metrics.RecordPayloadSize(e.namespace, workflowName, "input", len(data))
-
- // Extract trace ID and span ID from span context for workflow correlation
- traceID := ""
- spanID := ""
- if spanContext := span.SpanContext(); spanContext.IsValid() {
- traceID = spanContext.TraceID().String()
- spanID = spanContext.SpanID().String()
- }
-
- // Use new Query pattern instead of store abstraction
- err = store.Query.CreateWorkflow(ctx, e.db, store.CreateWorkflowParams{
- ID: executionID,
- WorkflowName: workflowName,
- Status: store.WorkflowExecutionsStatusPending,
- InputData: data,
- OutputData: []byte{},
- ErrorMessage: sql.NullString{String: "", Valid: false},
- CreatedAt: e.clock.Now().UnixMilli(),
- StartedAt: sql.NullInt64{Int64: 0, Valid: false},
- CompletedAt: sql.NullInt64{Int64: 0, Valid: false},
- MaxAttempts: config.MaxAttempts,
- RemainingAttempts: config.MaxAttempts, // Start with full attempts available
- NextRetryAt: sql.NullInt64{Int64: 0, Valid: false},
- Namespace: e.namespace,
- TriggerType: store.NullWorkflowExecutionsTriggerType{WorkflowExecutionsTriggerType: store.WorkflowExecutionsTriggerTypeApi, Valid: false}, // Trigger type conversion not implemented
- TriggerSource: sql.NullString{String: "", Valid: false},
- SleepUntil: sql.NullInt64{Int64: 0, Valid: false},
- TraceID: sql.NullString{String: traceID, Valid: traceID != ""},
- SpanID: sql.NullString{String: spanID, Valid: spanID != ""},
- })
- if err != nil {
- metrics.RecordError(e.namespace, "engine", "workflow_creation_failed")
- tracing.RecordError(span, err)
- return "", fmt.Errorf("failed to create workflow: %w", err)
- }
-
- // Record workflow started
- triggerTypeStr := string(config.TriggerType)
- metrics.WorkflowsStartedTotal.WithLabelValues(e.namespace, workflowName, triggerTypeStr).Inc()
- metrics.WorkflowsQueued.WithLabelValues(e.namespace, "pending").Inc()
-
- span.SetAttributes(attribute.String("hydra.workflow.status", "created"))
-
- return executionID, nil
-}
diff --git a/go/pkg/hydra/engine_test.go b/go/pkg/hydra/engine_test.go
deleted file mode 100644
index d898e53dc9..0000000000
--- a/go/pkg/hydra/engine_test.go
+++ /dev/null
@@ -1,164 +0,0 @@
-package hydra
-
-import (
- "context"
- "sync/atomic"
- "testing"
- "time"
-
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/hydra/testharness"
-)
-
-// Test workflow that counts executions and emits events
-type CountingWorkflow struct {
- counter *int64
- events *testharness.EventCollector
-}
-
-func (w *CountingWorkflow) Name() string {
- return "counting-workflow"
-}
-
-func (w *CountingWorkflow) Run(ctx WorkflowContext, req struct{}) error {
- w.events.Emit(ctx, testharness.WorkflowStarted, "Starting counting workflow")
-
- _, err := Step(ctx, "increment", func(stepCtx context.Context) (string, error) {
- w.events.Emit(ctx, testharness.StepExecuting, "Executing increment step", "step_name", "increment")
-
- // This should only execute exactly once
- atomic.AddInt64(w.counter, 1)
-
- w.events.Emit(ctx, testharness.StepExecuted, "Completed increment step", "step_name", "increment", "result", "incremented")
-
- return "incremented", nil
- })
-
- if err != nil {
- w.events.Emit(ctx, testharness.WorkflowFailed, "Workflow failed", "error", err.Error())
- } else {
- w.events.Emit(ctx, testharness.WorkflowCompleted, "Workflow completed successfully")
- }
-
- return err
-}
-
-// CRITICAL CORRECTNESS TESTS
-
-func TestBasicWorkflowRegistration(t *testing.T) {
- // Given: An engine instance and workflow
- e := newTestEngine(t)
- events := testharness.NewEventCollector()
- workflow := &CountingWorkflow{
- counter: new(int64),
- events: events,
- }
-
- // When: Creating worker and registering workflow
- worker, err := NewWorker(e, WorkerConfig{
- Concurrency: 1,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, workflow)
- require.NoError(t, err)
-
- err = worker.Start(context.Background())
-
- // Then: Worker should start without error
- require.NoError(t, err)
- require.NotNil(t, worker)
- defer worker.Shutdown(context.Background())
-
- // And: We should be able to start a workflow
- executionID, err := e.StartWorkflow(context.Background(), workflow.Name(), struct{}{})
- require.NoError(t, err)
- require.NotEmpty(t, executionID)
-}
-
-func TestStepExecutesExactlyOnce(t *testing.T) {
- // Given: A workflow with a step that increments a counter and emits events
- testClock := clock.NewTestClock()
- e := newTestEngineWithClock(t, testClock)
- events := testharness.NewEventCollector()
- counter := int64(0)
- workflow := &CountingWorkflow{
- counter: &counter,
- events: events,
- }
-
- // When: Creating worker, registering workflow, and starting
- worker, err := NewWorker(e, WorkerConfig{
- Concurrency: 1,
- PollInterval: 100 * time.Millisecond, // Fast polling for test
- })
- require.NoError(t, err)
- defer worker.Shutdown(context.Background())
-
- err = RegisterWorkflow(worker, workflow)
- require.NoError(t, err)
-
- err = worker.Start(context.Background())
- require.NoError(t, err)
-
- // Give worker time to start polling
- time.Sleep(50 * time.Millisecond)
-
- // Start workflow execution
- executionID, err := e.StartWorkflow(context.Background(), workflow.Name(), struct{}{})
- require.NoError(t, err)
- require.NotEmpty(t, executionID)
-
- // Trigger worker polling with test clock
- for i := 0; i < 10; i++ {
- testClock.Tick(200 * time.Millisecond)
- time.Sleep(10 * time.Millisecond)
-
- // Check if workflow has been picked up
- currentStatus, err := store.Query.GetWorkflow(context.Background(), e.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: e.GetNamespace(),
- })
- require.NoError(t, err)
- if currentStatus.Status != store.WorkflowExecutionsStatusPending {
- break
- }
- }
-
- // Wait for completion
- completedWorkflow := waitForWorkflowCompletion(t, e, executionID, 3*time.Second)
- require.NotNil(t, completedWorkflow)
-
- // Then: Assert using both counter and events
- finalCount := atomic.LoadInt64(&counter)
-
- // Check events for detailed analysis
- stepExecutions := events.FilterWithData(testharness.StepExecuting, "step_name", "increment")
- stepCompletions := events.FilterWithData(testharness.StepExecuted, "step_name", "increment")
- workflowCompletions := events.Filter(testharness.WorkflowCompleted)
-
- // The critical assertion: step should execute exactly once
- assert.Equal(t, int64(1), finalCount, "Counter should be incremented exactly once")
- assert.Equal(t, 1, len(stepExecutions), "Step should be executed exactly once")
- assert.Equal(t, 1, len(stepCompletions), "Step should complete exactly once")
- assert.Equal(t, 1, len(workflowCompletions), "Workflow should complete exactly once")
-}
-
-func TestStepCheckpointingPreventsReExecution(t *testing.T) {
- t.Skip("TODO: Implement checkpointing test")
-}
-
-func TestWorkflowTerminatesEventually(t *testing.T) {
- t.Skip("TODO: Implement retry limit testing")
-}
-
-func TestWorkerCrashRecovery(t *testing.T) {
- t.Skip("TODO: Implement worker crash recovery testing")
-}
-
-func TestNoDuplicateStepExecution(t *testing.T) {
- t.Skip("TODO: Implement concurrency safety testing")
-}
diff --git a/go/pkg/hydra/marshaller.go b/go/pkg/hydra/marshaller.go
deleted file mode 100644
index 38eeb94c6e..0000000000
--- a/go/pkg/hydra/marshaller.go
+++ /dev/null
@@ -1,58 +0,0 @@
-package hydra
-
-import (
- "encoding/json"
-)
-
-// Marshaller defines the interface for serializing workflow payloads and step results.
-//
-// The marshaller is responsible for converting Go values to and from byte arrays
-// for storage in the database. Custom marshallers can be implemented to support
-// different serialization formats like Protocol Buffers, MessagePack, or custom
-// binary formats.
-//
-// Implementations must ensure that:
-// - Marshal and Unmarshal are inverse operations
-// - The same input always produces the same output (deterministic)
-// - All workflow payload types are supported
-// - Error handling is consistent and informative
-type Marshaller interface {
- // Marshal converts a Go value to bytes for storage.
- // The value may be any type used in workflow payloads or step results.
- Marshal(v any) ([]byte, error)
-
- // Unmarshal converts stored bytes back to a Go value.
- // The target value should be a pointer to the desired type.
- Unmarshal(data []byte, v any) error
-}
-
-// JSONMarshaller implements Marshaller using standard Go JSON encoding.
-//
-// This is the default marshaller used by Hydra engines. It provides
-// good compatibility with most Go types and is human-readable for
-// debugging purposes.
-//
-// Limitations:
-// - Cannot handle circular references
-// - Maps with non-string keys are not supported
-// - Precision may be lost for large integers
-// - Custom types need JSON tags for proper serialization
-type JSONMarshaller struct{}
-
-// NewJSONMarshaller creates a new JSON-based marshaller.
-//
-// This is the default marshaller used when no custom marshaller
-// is provided to the engine configuration.
-func NewJSONMarshaller() Marshaller {
- return &JSONMarshaller{}
-}
-
-// Marshal implements Marshaller.Marshal using encoding/json.
-func (j *JSONMarshaller) Marshal(v any) ([]byte, error) {
- return json.Marshal(v)
-}
-
-// Unmarshal implements Marshaller.Unmarshal using encoding/json.
-func (j *JSONMarshaller) Unmarshal(data []byte, v any) error {
- return json.Unmarshal(data, v)
-}
diff --git a/go/pkg/hydra/marshaller_test.go b/go/pkg/hydra/marshaller_test.go
deleted file mode 100644
index 5d5a621c28..0000000000
--- a/go/pkg/hydra/marshaller_test.go
+++ /dev/null
@@ -1,64 +0,0 @@
-package hydra
-
-import (
- "testing"
-
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
-)
-
-func TestJSONMarshaller(t *testing.T) {
- marshaller := NewJSONMarshaller()
-
- t.Run("MarshalUnmarshalStruct", func(t *testing.T) {
- type TestStruct struct {
- Name string `json:"name"`
- Value int `json:"value"`
- }
-
- original := TestStruct{Name: "test", Value: 42}
-
- // Marshal
- data, err := marshaller.Marshal(original)
- require.NoError(t, err)
- assert.Contains(t, string(data), "test")
- assert.Contains(t, string(data), "42")
-
- // Unmarshal
- var result TestStruct
- err = marshaller.Unmarshal(data, &result)
- require.NoError(t, err)
- assert.Equal(t, original, result)
- })
-
- t.Run("MarshalUnmarshalPrimitive", func(t *testing.T) {
- original := "hello world"
-
- data, err := marshaller.Marshal(original)
- require.NoError(t, err)
-
- var result string
- err = marshaller.Unmarshal(data, &result)
- require.NoError(t, err)
- assert.Equal(t, original, result)
- })
-
- t.Run("MarshalUnmarshalMap", func(t *testing.T) {
- original := map[string]interface{}{
- "key1": "value1",
- "key2": 123,
- "key3": true,
- }
-
- data, err := marshaller.Marshal(original)
- require.NoError(t, err)
-
- var result map[string]interface{}
- err = marshaller.Unmarshal(data, &result)
- require.NoError(t, err)
- assert.Equal(t, "value1", result["key1"])
- // Note: JSON unmarshaling converts numbers to float64
- assert.Equal(t, float64(123), result["key2"])
- assert.Equal(t, true, result["key3"])
- })
-}
diff --git a/go/pkg/hydra/metrics/example_usage.go b/go/pkg/hydra/metrics/example_usage.go
deleted file mode 100644
index 775b7b6594..0000000000
--- a/go/pkg/hydra/metrics/example_usage.go
+++ /dev/null
@@ -1,89 +0,0 @@
-package metrics
-
-import (
- "time"
-)
-
-const exampleNamespace = "production"
-
-func ExampleWorkflowMetrics() {
- namespace := exampleNamespace
- workflowName := "user-onboarding"
-
- WorkflowsStartedTotal.WithLabelValues(namespace, workflowName, "manual").Inc()
-
- WorkflowsQueued.WithLabelValues(namespace, "pending").Set(42)
- WorkflowsActive.WithLabelValues(namespace, "worker-1").Set(5)
-
- start := time.Now()
- ObserveWorkflowDuration(namespace, workflowName, "completed", start)
- WorkflowsCompletedTotal.WithLabelValues(namespace, workflowName, "completed").Inc()
-}
-
-func ExampleStepMetrics() {
- namespace := exampleNamespace
- workflowName := "order-processing"
- stepName := "charge-payment"
-
- start := time.Now()
- ObserveStepDuration(namespace, workflowName, stepName, "completed", start)
- StepsExecutedTotal.WithLabelValues(namespace, workflowName, stepName, "completed").Inc()
-
- StepsCachedTotal.WithLabelValues(namespace, workflowName, stepName).Inc()
-}
-
-func ExampleDatabaseMetrics() {
- start := time.Now()
- ObserveDbOperation("select", "workflow_executions", "success", start)
-
- DbConnectionsActive.WithLabelValues("worker-1").Set(15)
-}
-
-func ExampleSleepMetrics() {
- namespace := exampleNamespace
- workflowName := "user-onboarding"
-
- SleepsStartedTotal.WithLabelValues(namespace, workflowName).Inc()
- SleepsResumedTotal.WithLabelValues(namespace, workflowName).Inc()
-
- actualSleepDuration := 25 * time.Minute // actual time slept
- SleepDurationSeconds.WithLabelValues(namespace, workflowName).Observe(actualSleepDuration.Seconds())
-
- CronTriggersTotal.WithLabelValues(namespace, "daily-report", "success").Inc()
-}
-
-func ExampleErrorMetrics() {
- namespace := exampleNamespace
-
- RecordError(namespace, "step", "timeout")
- RecordError(namespace, "client", "serialization")
- RecordError(namespace, "store", "connection")
-
- PanicsTotal.WithLabelValues("worker-1", "step_execution").Inc()
-
- TimeoutsTotal.WithLabelValues(namespace, "workflow_execution").Inc()
-}
-
-func ExamplePayloadMetrics() {
- namespace := exampleNamespace
- workflowName := "image-processing"
-
- inputSize := 1024 * 50 // 50KB input
- outputSize := 1024 * 5 // 5KB output
-
- RecordPayloadSize(namespace, workflowName, "input", inputSize)
- RecordPayloadSize(namespace, workflowName, "output", outputSize)
-
- SerializationErrorsTotal.WithLabelValues(namespace, workflowName, "input").Inc()
-}
-
-func ExampleWorkerMetrics() {
- workerID := "worker-1"
- namespace := exampleNamespace
-
- WorkerHeartbeatsTotal.WithLabelValues(workerID, namespace, "success").Inc()
- WorkerPollsTotal.WithLabelValues(workerID, namespace, "found_work").Inc()
- LeaseAcquisitionsTotal.WithLabelValues(workerID, "workflow", "success").Inc()
-
- WorkerConcurrencyCurrent.WithLabelValues(workerID, namespace).Set(8)
-}
diff --git a/go/pkg/hydra/metrics/metrics.go b/go/pkg/hydra/metrics/metrics.go
deleted file mode 100644
index f90aabd6d0..0000000000
--- a/go/pkg/hydra/metrics/metrics.go
+++ /dev/null
@@ -1,351 +0,0 @@
-package metrics
-
-import (
- "os"
- "time"
-
- "github.com/prometheus/client_golang/prometheus"
- "github.com/prometheus/client_golang/prometheus/promauto"
- "github.com/unkeyed/unkey/go/pkg/version"
-)
-
-var constLabels = prometheus.Labels{
- "region": os.Getenv("UNKEY_REGION"),
- "version": version.Version,
-}
-
-var workflowLatencyBuckets = []float64{
- 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0,
- 10.0, 30.0, 60.0, 120.0, 300.0, 600.0,
-}
-
-var stepLatencyBuckets = []float64{
- 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5,
- 1.0, 2.5, 5.0, 10.0, 30.0, 60.0,
-}
-
-var dbLatencyBuckets = []float64{
- 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
- 0.2, 0.5, 1.0, 2.0, 5.0,
-}
-
-var WorkflowsStartedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "workflows_started_total",
- Help: "Total number of workflows started",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name", "trigger_type"},
-)
-
-var WorkflowsCompletedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "workflows_completed_total",
- Help: "Total number of workflows completed",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name", "status"},
-)
-
-var WorkflowsRetriedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "workflows_retried_total",
- Help: "Total number of workflow retry attempts",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name", "attempt"},
-)
-
-var WorkflowDurationSeconds = promauto.NewHistogramVec(
- prometheus.HistogramOpts{
- Subsystem: "hydra",
- Name: "workflow_duration_seconds",
- Help: "Time taken to complete workflows",
- ConstLabels: constLabels,
- Buckets: workflowLatencyBuckets,
- },
- []string{"namespace", "workflow_name", "status"},
-)
-
-var WorkflowQueueTimeSeconds = promauto.NewHistogramVec(
- prometheus.HistogramOpts{
- Subsystem: "hydra",
- Name: "workflow_queue_time_seconds",
- Help: "Time workflow spent queued before execution",
- ConstLabels: constLabels,
- Buckets: workflowLatencyBuckets,
- },
- []string{"namespace", "workflow_name"},
-)
-
-var WorkflowsActive = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: "hydra",
- Name: "workflows_active",
- Help: "Currently running workflows",
- ConstLabels: constLabels,
- },
- []string{"namespace", "worker_id"},
-)
-
-var WorkflowsQueued = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: "hydra",
- Name: "workflows_queued",
- Help: "Workflows waiting to be processed",
- ConstLabels: constLabels,
- },
- []string{"namespace", "status"},
-)
-
-var StepsExecutedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "steps_executed_total",
- Help: "Total number of workflow steps executed",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name", "step_name", "status"},
-)
-
-var StepsRetriedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "steps_retried_total",
- Help: "Total number of step retry attempts",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name", "step_name"},
-)
-
-var StepsCachedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "steps_cached_total",
- Help: "Steps skipped due to checkpointing",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name", "step_name"},
-)
-
-var StepDurationSeconds = promauto.NewHistogramVec(
- prometheus.HistogramOpts{
- Subsystem: "hydra",
- Name: "step_duration_seconds",
- Help: "Time taken to execute workflow steps",
- ConstLabels: constLabels,
- Buckets: stepLatencyBuckets,
- },
- []string{"namespace", "workflow_name", "step_name", "status"},
-)
-
-var WorkerHeartbeatsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "worker_heartbeats_total",
- Help: "Total number of worker heartbeat operations",
- ConstLabels: constLabels,
- },
- []string{"worker_id", "namespace", "status"},
-)
-
-var LeaseAcquisitionsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "lease_acquisitions_total",
- Help: "Total number of lease acquisition attempts",
- ConstLabels: constLabels,
- },
- []string{"worker_id", "resource_type", "status"},
-)
-
-var WorkerPollsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "worker_polls_total",
- Help: "Total number of worker polling operations",
- ConstLabels: constLabels,
- },
- []string{"worker_id", "namespace", "status"},
-)
-
-var WorkerConcurrencyCurrent = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: "hydra",
- Name: "worker_concurrency_current",
- Help: "Current workflow concurrency per worker",
- ConstLabels: constLabels,
- },
- []string{"worker_id", "namespace"},
-)
-
-var DbOperationsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "db_operations_total",
- Help: "Total number of database operations",
- ConstLabels: constLabels,
- },
- []string{"operation", "table", "status"},
-)
-
-var DbOperationDurationSeconds = promauto.NewHistogramVec(
- prometheus.HistogramOpts{
- Subsystem: "hydra",
- Name: "db_operation_duration_seconds",
- Help: "Time taken for database operations",
- ConstLabels: constLabels,
- Buckets: dbLatencyBuckets,
- },
- []string{"operation", "table", "status"},
-)
-
-var DbConnectionsActive = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: "hydra",
- Name: "db_connections_active",
- Help: "Active database connections",
- ConstLabels: constLabels,
- },
- []string{"worker_id"},
-)
-
-var SleepsStartedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "sleeps_started_total",
- Help: "Total number of sleep operations initiated",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name"},
-)
-
-var SleepsResumedTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "sleeps_resumed_total",
- Help: "Total number of workflows resumed from sleep",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name"},
-)
-
-var CronTriggersTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "cron_triggers_total",
- Help: "Total number of cron-triggered workflows",
- ConstLabels: constLabels,
- },
- []string{"namespace", "cron_name", "status"},
-)
-
-var SleepDurationSeconds = promauto.NewHistogramVec(
- prometheus.HistogramOpts{
- Subsystem: "hydra",
- Name: "sleep_duration_seconds",
- Help: "Actual sleep durations",
- ConstLabels: constLabels,
- Buckets: workflowLatencyBuckets,
- },
- []string{"namespace", "workflow_name"},
-)
-
-var CronExecutionLatencySeconds = promauto.NewHistogramVec(
- prometheus.HistogramOpts{
- Subsystem: "hydra",
- Name: "cron_execution_latency_seconds",
- Help: "Delay between scheduled and actual cron execution",
- ConstLabels: constLabels,
- Buckets: []float64{0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0, 300.0},
- },
- []string{"namespace", "cron_name"},
-)
-
-var WorkflowsSleeping = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: "hydra",
- Name: "workflows_sleeping",
- Help: "Currently sleeping workflows",
- ConstLabels: constLabels,
- },
- []string{"namespace"},
-)
-
-var ErrorsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "errors_total",
- Help: "Total number of errors across all components",
- ConstLabels: constLabels,
- },
- []string{"namespace", "component", "error_type"},
-)
-
-var PanicsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "panics_total",
- Help: "Total number of panic recoveries",
- ConstLabels: constLabels,
- },
- []string{"worker_id", "component"},
-)
-
-var TimeoutsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "timeouts_total",
- Help: "Total number of operation timeouts",
- ConstLabels: constLabels,
- },
- []string{"namespace", "operation_type"},
-)
-
-var PayloadSizeBytes = promauto.NewHistogramVec(
- prometheus.HistogramOpts{
- Subsystem: "hydra",
- Name: "payload_size_bytes",
- Help: "Size of workflow and step payloads",
- ConstLabels: constLabels,
- Buckets: []float64{100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000},
- },
- []string{"namespace", "workflow_name", "direction"},
-)
-
-var SerializationErrorsTotal = promauto.NewCounterVec(
- prometheus.CounterOpts{
- Subsystem: "hydra",
- Name: "serialization_errors_total",
- Help: "Total number of payload serialization errors",
- ConstLabels: constLabels,
- },
- []string{"namespace", "workflow_name", "direction"},
-)
-
-func ObserveWorkflowDuration(namespace, workflowName, status string, start time.Time) {
- duration := time.Since(start)
- WorkflowDurationSeconds.WithLabelValues(namespace, workflowName, status).Observe(duration.Seconds())
-}
-
-func ObserveStepDuration(namespace, workflowName, stepName, status string, start time.Time) {
- duration := time.Since(start)
- StepDurationSeconds.WithLabelValues(namespace, workflowName, stepName, status).Observe(duration.Seconds())
-}
-
-func ObserveDbOperation(operation, table, status string, start time.Time) {
- duration := time.Since(start)
- DbOperationsTotal.WithLabelValues(operation, table, status).Inc()
- DbOperationDurationSeconds.WithLabelValues(operation, table, status).Observe(duration.Seconds())
-}
-
-func RecordError(namespace, component, errorType string) {
- ErrorsTotal.WithLabelValues(namespace, component, errorType).Inc()
-}
-
-func RecordPayloadSize(namespace, workflowName, direction string, size int) {
- PayloadSizeBytes.WithLabelValues(namespace, workflowName, direction).Observe(float64(size))
-}
diff --git a/go/pkg/hydra/simple_consistency_test.go b/go/pkg/hydra/simple_consistency_test.go
deleted file mode 100644
index e49778bb56..0000000000
--- a/go/pkg/hydra/simple_consistency_test.go
+++ /dev/null
@@ -1,270 +0,0 @@
-package hydra
-
-import (
- "context"
- "fmt"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/hydra/testharness"
-)
-
-// TestSimpleDataConsistency tests basic data consistency using event collection
-func TestSimpleDataConsistency(t *testing.T) {
- realClock := clock.New()
- engine := newTestEngineWithClock(t, realClock)
-
- const numWorkflows = 10
-
- // Create event collector
- eventCollector := testharness.NewEventCollector()
-
- // Create event-aware workflow
- workflow := &eventTrackingWorkflow{
- engine: engine,
- name: "simple-consistency-workflow",
- collector: eventCollector,
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 12*time.Second)
- defer cancel()
-
- // Start a single worker
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: "simple-consistency-worker",
- Concurrency: 2,
- PollInterval: 100 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second,
- ClaimTimeout: 5 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, workflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- // Submit workflows
- workflowIDs := make([]string, numWorkflows)
- for i := 0; i < numWorkflows; i++ {
- workflowID, startErr := workflow.Start(ctx, fmt.Sprintf("payload-%d", i))
- require.NoError(t, startErr)
- workflowIDs[i] = workflowID
- }
-
- // Wait for all workflows to finish using event collection
- require.Eventually(t, func() bool {
- completedEvents := eventCollector.Count(testharness.WorkflowCompleted)
- failedEvents := eventCollector.Count(testharness.WorkflowFailed)
- totalFinished := completedEvents + failedEvents
-
- return totalFinished == numWorkflows
- }, 10*time.Second, 200*time.Millisecond, "All workflows should finish")
-
- // Verify exactly-once execution using events
- for _, workflowID := range workflowIDs {
- // Verify exactly one started event
- startedEvents := eventCollector.FilterWithData(testharness.WorkflowStarted, "execution_id", workflowID)
- require.Len(t, startedEvents, 1, "Workflow %s should start exactly once", workflowID)
-
- // Verify exactly one completion event
- completedEvents := eventCollector.FilterWithData(testharness.WorkflowCompleted, "execution_id", workflowID)
- failedEvents := eventCollector.FilterWithData(testharness.WorkflowFailed, "execution_id", workflowID)
- totalCompletions := len(completedEvents) + len(failedEvents)
- require.Equal(t, 1, totalCompletions, "Workflow %s should complete exactly once", workflowID)
-
- // Verify exactly one step execution
- stepExecutingEvents := eventCollector.FilterWithData(testharness.StepExecuting, "execution_id", workflowID)
- require.Len(t, stepExecutingEvents, 1, "Workflow %s should have exactly one step execution", workflowID)
- }
-
- // Verify database consistency
- // GetAllWorkflows was removed - check completed workflows indirectly
- // Since we know we created specific workflows, verify them individually
- completedInDB := 0
- for _, id := range workflowIDs {
- wf, err := store.Query.GetWorkflow(context.Background(), engine.GetDB(), store.GetWorkflowParams{
- ID: id,
- Namespace: engine.GetNamespace(),
- })
- if err == nil && wf.Status == store.WorkflowExecutionsStatusCompleted {
- completedInDB++
- }
- }
-
- completedEventsCount := eventCollector.Count(testharness.WorkflowCompleted)
- require.Equal(t, completedEventsCount, completedInDB,
- "Database completed count should match completed events")
-
-}
-
-// TestConcurrentWorkerConsistency tests consistency with multiple workers using events
-func TestConcurrentWorkerConsistency(t *testing.T) {
- realClock := clock.New()
- engine := newTestEngineWithClock(t, realClock)
-
- const (
- numWorkers = 3
- numWorkflows = 15
- )
-
- // Create event collector
- eventCollector := testharness.NewEventCollector()
-
- // Create event-aware workflow
- workflow := &eventTrackingWorkflow{
- engine: engine,
- name: "concurrent-consistency-workflow",
- collector: eventCollector,
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
- defer cancel()
-
- // Start multiple workers
- workers := make([]Worker, numWorkers)
- for i := 0; i < numWorkers; i++ {
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: fmt.Sprintf("concurrent-worker-%d", i),
- Concurrency: 2,
- PollInterval: 50 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second,
- ClaimTimeout: 5 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, workflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- workers[i] = worker
- }
-
- // Submit workflows
- workflowIDs := make([]string, numWorkflows)
- for i := 0; i < numWorkflows; i++ {
- workflowID, err := workflow.Start(ctx, fmt.Sprintf("concurrent-payload-%d", i))
- require.NoError(t, err)
- workflowIDs[i] = workflowID
- }
-
- // Wait for workflows to finish using event collection
- require.Eventually(t, func() bool {
- completedEvents := eventCollector.Count(testharness.WorkflowCompleted)
- failedEvents := eventCollector.Count(testharness.WorkflowFailed)
- totalFinished := completedEvents + failedEvents
-
- return totalFinished == numWorkflows
- }, 12*time.Second, 300*time.Millisecond, "All concurrent workflows should finish")
-
- // Verify exactly-once execution for each workflow
- duplicateExecutions := 0
- duplicateCompletions := 0
-
- for _, workflowID := range workflowIDs {
- // Check for duplicate workflow executions
- startedEvents := eventCollector.FilterWithData(testharness.WorkflowStarted, "execution_id", workflowID)
- if len(startedEvents) > 1 {
- duplicateExecutions++
- t.Errorf("DUPLICATE EXECUTION: Workflow %s started %d times", workflowID, len(startedEvents))
- }
- require.Len(t, startedEvents, 1, "Workflow %s should start exactly once", workflowID)
-
- // Check for duplicate completions
- completedEvents := eventCollector.FilterWithData(testharness.WorkflowCompleted, "execution_id", workflowID)
- failedEvents := eventCollector.FilterWithData(testharness.WorkflowFailed, "execution_id", workflowID)
- totalCompletions := len(completedEvents) + len(failedEvents)
-
- if totalCompletions > 1 {
- duplicateCompletions++
- t.Errorf("DUPLICATE COMPLETION: Workflow %s completed %d times (%d completed + %d failed)",
- workflowID, totalCompletions, len(completedEvents), len(failedEvents))
- }
- require.Equal(t, 1, totalCompletions, "Workflow %s should complete exactly once", workflowID)
-
- // Verify exactly one step execution
- stepExecutingEvents := eventCollector.FilterWithData(testharness.StepExecuting, "execution_id", workflowID)
- require.Len(t, stepExecutingEvents, 1, "Workflow %s should have exactly one step execution", workflowID)
- }
-
- // Assert no duplicates were found
- require.Equal(t, 0, duplicateExecutions, "Should have zero duplicate workflow executions")
- require.Equal(t, 0, duplicateCompletions, "Should have zero duplicate workflow completions")
-
- // Verify database consistency
- // GetAllWorkflows was removed - check completed workflows indirectly
- // Since we know we created specific workflows, verify them individually
- completedInDB := 0
- for _, id := range workflowIDs {
- wf, err := store.Query.GetWorkflow(context.Background(), engine.GetDB(), store.GetWorkflowParams{
- ID: id,
- Namespace: engine.GetNamespace(),
- })
- if err == nil && wf.Status == store.WorkflowExecutionsStatusCompleted {
- completedInDB++
- }
- }
-
- completedEventsCount := eventCollector.Count(testharness.WorkflowCompleted)
- require.Equal(t, completedEventsCount, completedInDB,
- "Database completed count should match completed events")
-
-}
-
-// eventTrackingWorkflow emits events during execution for testing
-type eventTrackingWorkflow struct {
- engine *Engine
- name string
- collector *testharness.EventCollector
-}
-
-func (w *eventTrackingWorkflow) Name() string {
- return w.name
-}
-
-func (w *eventTrackingWorkflow) Run(ctx WorkflowContext, req any) error {
- // Emit workflow started event
- w.collector.Emit(ctx, testharness.WorkflowStarted, "Workflow execution started")
-
- // Emit step executing event
- w.collector.Emit(ctx, testharness.StepExecuting, "Step execution started", "step_name", "consistency-step")
-
- // Execute the step
- result, err := Step(ctx, "consistency-step", func(stepCtx context.Context) (string, error) {
- // Simulate some work
- time.Sleep(20 * time.Millisecond)
- return "step-completed", nil
- })
-
- if err != nil {
- // Emit step failed event
- w.collector.Emit(ctx, testharness.StepFailed, "Step execution failed",
- "step_name", "consistency-step", "error", err.Error())
-
- // Emit workflow failed event
- w.collector.Emit(ctx, testharness.WorkflowFailed, "Workflow execution failed", "error", err.Error())
-
- return err
- }
-
- // Emit step executed event
- w.collector.Emit(ctx, testharness.StepExecuted, "Step execution completed",
- "step_name", "consistency-step", "result", result)
-
- // Emit workflow completed event
- w.collector.Emit(ctx, testharness.WorkflowCompleted, "Workflow execution completed")
-
- return nil
-}
-
-func (w *eventTrackingWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/sleep.go b/go/pkg/hydra/sleep.go
deleted file mode 100644
index b824a8844e..0000000000
--- a/go/pkg/hydra/sleep.go
+++ /dev/null
@@ -1,133 +0,0 @@
-package hydra
-
-import (
- "database/sql"
- "fmt"
- "time"
-
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/uid"
-)
-
-// Sleep suspends workflow execution for the specified duration.
-//
-// This function allows workflows to pause execution and resume after
-// a specified time period. The workflow will be marked as sleeping
-// and workers will not attempt to execute it until the sleep duration
-// has elapsed.
-//
-// Sleep is useful for:
-// - Time-based coordination (e.g., waiting for settlement periods)
-// - Human approval workflows (e.g., waiting for manual intervention)
-// - Rate limiting and backoff strategies
-// - Scheduled processing windows
-//
-// The sleep duration is durable - if the worker crashes or restarts
-// during the sleep period, the workflow will still resume at the
-// correct time.
-//
-// Example usage:
-//
-// // Sleep for 24 hours for manual approval
-// err = hydra.Sleep(ctx, 24*time.Hour)
-// if err != nil {
-// return err
-// }
-//
-// // Continue with post-approval processing
-// result, err := hydra.Step(ctx, "post-approval", func(stepCtx context.Context) (string, error) {
-// return processApprovedRequest(stepCtx)
-// })
-//
-// Note: Sleep creates an internal step to track the sleep state.
-// The step name is generated automatically based on the duration.
-//
-// Metrics recorded:
-// - hydra_sleeps_started_total (counter)
-// - hydra_workflows_sleeping (gauge)
-func Sleep(ctx WorkflowContext, duration time.Duration) error {
- wctx, ok := ctx.(*workflowContext)
- if !ok {
- return fmt.Errorf("invalid workflow context")
- }
-
- stepName := fmt.Sprintf("sleep-%d", duration.Milliseconds())
-
- _, err := store.Query.GetCompletedStep(wctx.ctx, wctx.db, store.GetCompletedStepParams{
- Namespace: wctx.namespace,
- ExecutionID: wctx.ExecutionID(),
- StepName: stepName,
- })
- if err == nil {
- return nil
- }
-
- now := time.Now().UnixMilli()
- existingStep, err := store.Query.GetStep(wctx.ctx, wctx.db, store.GetStepParams{
- Namespace: wctx.namespace,
- ExecutionID: wctx.ExecutionID(),
- StepName: stepName,
- })
- if err == nil && existingStep.StartedAt.Valid {
- sleepUntil := existingStep.StartedAt.Int64 + duration.Milliseconds()
-
- if sleepUntil <= now {
- return wctx.markStepCompleted(existingStep.ID, []byte("{}"))
- }
- return store.Query.SleepWorkflow(wctx.ctx, wctx.db, store.SleepWorkflowParams{
- SleepUntil: sql.NullInt64{Int64: sleepUntil, Valid: true},
- ID: wctx.ExecutionID(),
- Namespace: wctx.namespace,
- })
- }
-
- sleepUntil := now + duration.Milliseconds()
-
- // Create sleep step with lease validation - only if worker holds valid lease
- stepID := uid.New(uid.StepPrefix)
- result, err := wctx.db.ExecContext(wctx.ctx, `
- INSERT INTO workflow_steps (
- id, execution_id, step_name, status, output_data, error_message,
- started_at, completed_at, max_attempts, remaining_attempts, namespace
- )
- SELECT ?, ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?
- WHERE EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )`,
- stepID,
- wctx.ExecutionID(),
- stepName,
- store.WorkflowStepsStatusRunning,
- []byte{},
- sql.NullString{String: "", Valid: false},
- sql.NullInt64{Int64: now, Valid: true},
- sql.NullInt64{Int64: 0, Valid: false},
- 1, // Sleep doesn't need retries
- 1,
- wctx.namespace,
- wctx.ExecutionID(), // resource_id for lease check
- wctx.workerID, // worker_id for lease check
- now, // expires_at check
- )
- if err != nil {
- return fmt.Errorf("failed to create sleep step: %w", err)
- }
-
- // Check if the step creation actually happened (lease validation)
- rowsAffected, err := result.RowsAffected()
- if err != nil {
- return fmt.Errorf("failed to check step creation result: %w", err)
- }
- if rowsAffected == 0 {
- return fmt.Errorf("sleep step creation failed: lease expired or invalid")
- }
-
- return store.Query.SleepWorkflow(wctx.ctx, wctx.db, store.SleepWorkflowParams{
- SleepUntil: sql.NullInt64{Int64: sleepUntil, Valid: true},
- ID: wctx.ExecutionID(),
- Namespace: wctx.namespace,
- })
-}
diff --git a/go/pkg/hydra/step.go b/go/pkg/hydra/step.go
deleted file mode 100644
index 2dc8c209ad..0000000000
--- a/go/pkg/hydra/step.go
+++ /dev/null
@@ -1,311 +0,0 @@
-package hydra
-
-import (
- "context"
- "database/sql"
- "fmt"
- "reflect"
- "time"
-
- "github.com/unkeyed/unkey/go/pkg/hydra/metrics"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/otel/tracing"
- "github.com/unkeyed/unkey/go/pkg/uid"
- "go.opentelemetry.io/otel/attribute"
-)
-
-// Step executes a named step within a workflow with automatic checkpointing and retry logic.
-//
-// Steps are the fundamental units of work in Hydra workflows. They provide:
-// - Exactly-once execution guarantees
-// - Automatic result caching (checkpointing)
-// - Built-in retry logic for transient failures
-// - Comprehensive metrics and observability
-//
-// Parameters:
-// - ctx: The workflow context from the workflow's Run() method
-// - stepName: A unique name for this step within the workflow
-// - fn: The function to execute, which should be idempotent
-//
-// The stepName must be unique within the workflow and should remain stable
-// across deployments. If a step has already completed successfully, its
-// cached result will be returned without re-executing the function.
-//
-// The function fn receives a standard Go context and should:
-// - Be idempotent (safe to run multiple times)
-// - Handle context cancellation gracefully
-// - Return consistent results for the same inputs
-// - Use the provided context for any I/O operations
-//
-// Example usage:
-//
-// // Simple step with string result
-// result, err := hydra.Step(ctx, "fetch-user", func(stepCtx context.Context) (string, error) {
-// user, err := userService.GetUser(stepCtx, userID)
-// if err != nil {
-// return "", err
-// }
-// return user.Name, nil
-// })
-//
-// // Step with complex result type
-// order, err := hydra.Step(ctx, "create-order", func(stepCtx context.Context) (*Order, error) {
-// return orderService.CreateOrder(stepCtx, &CreateOrderRequest{
-// CustomerID: customerID,
-// Items: items,
-// })
-// })
-//
-// Metrics recorded:
-// - hydra_steps_executed_total (counter with status)
-// - hydra_step_duration_seconds (histogram)
-// - hydra_steps_cached_total (counter for cache hits)
-// - hydra_steps_retried_total (counter for retry attempts)
-//
-// Returns the result of the function execution or the cached result if the
-// step has already completed successfully.
-func Step[TResponse any](ctx WorkflowContext, stepName string, fn func(context.Context) (TResponse, error)) (TResponse, error) {
- var zero TResponse
-
- wctx, ok := ctx.(*workflowContext)
- if !ok {
- return zero, fmt.Errorf("invalid workflow context")
- }
-
- // Start tracing span for this step
- stepCtx, span := tracing.Start(wctx.ctx, fmt.Sprintf("hydra.step.%s", stepName))
- defer span.End()
-
- span.SetAttributes(
- attribute.String("hydra.step.name", stepName),
- attribute.String("hydra.workflow.name", wctx.workflowName),
- attribute.String("hydra.execution.id", wctx.executionID),
- attribute.String("hydra.namespace", wctx.namespace),
- )
-
- existing, err := store.Query.GetCompletedStep(wctx.ctx, wctx.db, store.GetCompletedStepParams{
- Namespace: wctx.namespace,
- ExecutionID: wctx.ExecutionID(),
- StepName: stepName,
- })
- if err == nil {
- // Record cached step hit
- metrics.StepsCachedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName).Inc()
- span.SetAttributes(attribute.Bool("hydra.step.cached", true))
-
- responseType := reflect.TypeOf((*TResponse)(nil)).Elem()
- var response TResponse
-
- if responseType.Kind() == reflect.Ptr {
- responseValue := reflect.New(responseType.Elem())
- var ok bool
- response, ok = responseValue.Interface().(TResponse)
- if !ok {
- conversionErr := fmt.Errorf("failed to convert response to expected type")
- tracing.RecordError(span, conversionErr)
- return zero, conversionErr
- }
- }
-
- if len(existing.OutputData) > 0 {
- err = wctx.marshaller.Unmarshal(existing.OutputData, &response)
- if err != nil {
- metrics.RecordError(wctx.namespace, "step", "unmarshal_cached_result_failed")
- tracing.RecordError(span, err)
- return zero, fmt.Errorf("failed to unmarshal cached step result: %w", err)
- }
- }
-
- return response, nil
- }
-
- span.SetAttributes(attribute.Bool("hydra.step.cached", false))
-
- _, err = store.Query.GetStep(wctx.ctx, wctx.db, store.GetStepParams{
- Namespace: wctx.namespace,
- ExecutionID: wctx.ExecutionID(),
- StepName: stepName,
- })
- var stepToUse *store.WorkflowStep
- shouldCreateNewStep := err != nil // sql.ErrNoRows means step doesn't exist, so create new one
-
- stepStartTime := time.Now()
-
- if shouldCreateNewStep {
- stepID := uid.New(uid.StepPrefix)
-
- // Create step with lease validation - only if worker holds valid lease
- now := time.Now().UnixMilli()
- createResult, createErr := wctx.db.ExecContext(wctx.ctx, `
- INSERT INTO workflow_steps (
- id, execution_id, step_name, status, output_data, error_message,
- started_at, completed_at, max_attempts, remaining_attempts, namespace
- )
- SELECT ?, ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?
- WHERE EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )`,
- stepID,
- wctx.ExecutionID(),
- stepName,
- store.WorkflowStepsStatusRunning,
- []byte{},
- sql.NullString{String: "", Valid: false},
- sql.NullInt64{Int64: stepStartTime.UnixMilli(), Valid: true},
- sql.NullInt64{Int64: 0, Valid: false},
- wctx.stepMaxAttempts,
- wctx.stepMaxAttempts,
- wctx.namespace,
- wctx.ExecutionID(), // resource_id for lease check
- wctx.workerID, // worker_id for lease check
- now, // expires_at check
- )
- if createErr != nil {
- return zero, fmt.Errorf("failed to create step: %w", createErr)
- }
-
- // Check if the step creation actually happened (lease validation)
- rowsAffected, rowsErr := createResult.RowsAffected()
- if rowsErr != nil {
- return zero, fmt.Errorf("failed to check step creation result: %w", rowsErr)
- }
- if rowsAffected == 0 {
- return zero, fmt.Errorf("step creation failed: lease expired or invalid")
- }
-
- // Step created successfully
- span.SetAttributes(attribute.Bool("hydra.step.new", true))
- } else {
- // Update existing step to running status with lease validation
- now := time.Now().UnixMilli()
- updateResult, updateErr := wctx.db.ExecContext(wctx.ctx, `
- UPDATE workflow_steps
- SET status = ?, completed_at = ?, output_data = ?, error_message = ?
- WHERE workflow_steps.namespace = ? AND execution_id = ? AND step_name = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )`,
- store.WorkflowStepsStatusRunning,
- sql.NullInt64{Int64: 0, Valid: false},
- []byte{},
- sql.NullString{String: "", Valid: false},
- wctx.namespace,
- wctx.ExecutionID(),
- stepName,
- wctx.ExecutionID(), // resource_id for lease check
- wctx.workerID, // worker_id for lease check
- now, // expires_at check
- )
- if updateErr != nil {
- return zero, fmt.Errorf("failed to update step status: %w", updateErr)
- }
-
- // Check if the update actually happened (lease validation)
- rowsAffected, rowsErr := updateResult.RowsAffected()
- if rowsErr != nil {
- return zero, fmt.Errorf("failed to check step update result: %w", rowsErr)
- }
- if rowsAffected == 0 {
- return zero, fmt.Errorf("step update failed: lease expired or invalid")
- }
-
- // Get the step after successful update
- stepResult, getErr := store.Query.GetStep(wctx.ctx, wctx.db, store.GetStepParams{
- Namespace: wctx.namespace,
- ExecutionID: wctx.ExecutionID(),
- StepName: stepName,
- })
- stepToUse = &stepResult
- if getErr != nil {
- return zero, fmt.Errorf("failed to retrieve updated step: %w", getErr)
- }
-
- // Record step retry
- if stepToUse.RemainingAttempts < stepToUse.MaxAttempts {
- metrics.StepsRetriedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName).Inc()
- span.SetAttributes(attribute.Bool("hydra.step.retry", true))
- }
- span.SetAttributes(attribute.Bool("hydra.step.new", false))
- }
-
- response, err := fn(stepCtx)
- if err != nil {
- tracing.RecordError(span, err)
- wctx.logger.Error("step execution failed", "error", err.Error())
- span.SetAttributes(attribute.String("hydra.step.status", "failed"))
-
- if markErr := wctx.markStepFailed(stepName, err.Error()); markErr != nil {
- metrics.RecordError(wctx.namespace, "step", "mark_step_failed_error")
- }
- metrics.ObserveStepDuration(wctx.namespace, wctx.workflowName, stepName, "failed", stepStartTime)
- metrics.StepsExecutedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName, "failed").Inc()
- return zero, fmt.Errorf("step execution failed: %w", err)
- }
-
- respData, err := wctx.marshaller.Marshal(response)
- if err != nil {
- tracing.RecordError(span, err)
- span.SetAttributes(attribute.String("hydra.step.status", "failed"))
-
- if markErr := wctx.markStepFailed(stepName, err.Error()); markErr != nil {
- metrics.RecordError(wctx.namespace, "step", "mark_step_failed_error")
- }
- metrics.ObserveStepDuration(wctx.namespace, wctx.workflowName, stepName, "failed", stepStartTime)
- metrics.StepsExecutedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName, "failed").Inc()
- metrics.RecordError(wctx.namespace, "step", "marshal_response_failed")
- return zero, fmt.Errorf("failed to marshal response: %w", err)
- }
-
- err = wctx.markStepCompleted(stepName, respData)
- if err != nil {
- tracing.RecordError(span, err)
- span.SetAttributes(attribute.String("hydra.step.status", "failed"))
- metrics.RecordError(wctx.namespace, "step", "mark_completed_failed")
- return zero, fmt.Errorf("failed to mark step completed: %w", err)
- }
-
- span.SetAttributes(attribute.String("hydra.step.status", "completed"))
- metrics.ObserveStepDuration(wctx.namespace, wctx.workflowName, stepName, "completed", stepStartTime)
- metrics.StepsExecutedTotal.WithLabelValues(wctx.namespace, wctx.workflowName, stepName, "completed").Inc()
-
- return response, nil
-}
-
-// StepVoid executes a named step within a workflow that performs side effects but doesn't return a value.
-//
-// This is a convenience wrapper around Step for functions that only return an error.
-// It's perfect for steps that perform database updates, send notifications, or other
-// side effects where the result itself isn't needed by subsequent steps.
-//
-// Parameters:
-// - ctx: The workflow context from the workflow's Run() method
-// - stepName: A unique name for this step within the workflow
-// - fn: The function to execute, which should be idempotent and only return an error
-//
-// Example usage:
-//
-// // Database update step
-// err := hydra.StepVoid(ctx, "update-user-status", func(stepCtx context.Context) error {
-// return userService.UpdateStatus(stepCtx, userID, "active")
-// })
-//
-// // Notification step
-// err := hydra.StepVoid(ctx, "send-email", func(stepCtx context.Context) error {
-// return emailService.SendWelcomeEmail(stepCtx, userEmail)
-// })
-//
-// Returns only an error if the step execution fails.
-func StepVoid(ctx WorkflowContext, stepName string, fn func(context.Context) error) error {
- _, err := Step(ctx, stepName, func(stepCtx context.Context) (*struct{}, error) {
- if err := fn(stepCtx); err != nil {
- return nil, err
- }
- return &struct{}{}, nil
- })
- return err
-}
diff --git a/go/pkg/hydra/step_atomicity_test.go b/go/pkg/hydra/step_atomicity_test.go
deleted file mode 100644
index e76b68634c..0000000000
--- a/go/pkg/hydra/step_atomicity_test.go
+++ /dev/null
@@ -1,217 +0,0 @@
-package hydra
-
-import (
- "context"
- "sync/atomic"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/uid"
-)
-
-// TestStepExecutionAtomicity ensures that step execution is atomic:
-// either a step fully completes (executes + status update) or it doesn't execute at all.
-// This prevents duplicate side effects when status updates fail after step execution.
-func TestStepExecutionAtomicity(t *testing.T) {
- // Arrange: Create engine with test clock and a workflow that tracks execution attempts
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- var stepExecutionCount int64
- var sideEffectsCount int64 // Track side effects that should only happen once
-
- // Create a workflow with a step that has side effects
- workflow := &atomicityTestWorkflow{
- engine: engine,
- name: "atomicity-test-workflow",
- stepFunc: func(ctx context.Context) (string, error) {
- // This represents the step execution with side effects
- _ = atomic.AddInt64(&stepExecutionCount, 1)
-
- // Simulate important side effects (e.g., sending email, charging payment, etc.)
- atomic.AddInt64(&sideEffectsCount, 1)
-
- return "step-result", nil
- },
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
-
- // Act: Start workflow
- executionID, err := workflow.Start(ctx, struct{}{})
- require.NoError(t, err)
-
- // Start worker
- worker, err := NewWorker(engine, WorkerConfig{
- Concurrency: 1,
- PollInterval: 100 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second,
- ClaimTimeout: 5 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, workflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- // Trigger workflow execution
- require.Eventually(t, func() bool {
- testClock.Tick(200 * time.Millisecond)
- time.Sleep(10 * time.Millisecond)
-
- // Check if workflow completed
- currentStatus, getErr := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- if getErr != nil {
- return false
- }
- return currentStatus.Status == store.WorkflowExecutionsStatusCompleted
- }, 5*time.Second, 50*time.Millisecond, "Workflow should complete")
-
- // Assert: Step should execute exactly once despite any potential failures
- finalExecutionCount := atomic.LoadInt64(&stepExecutionCount)
- finalSideEffectsCount := atomic.LoadInt64(&sideEffectsCount)
-
- require.Equal(t, int64(1), finalExecutionCount,
- "ATOMICITY VIOLATION: Step executed %d times instead of 1. "+
- "This indicates non-atomic step execution where the step ran multiple times.", finalExecutionCount)
-
- require.Equal(t, int64(1), finalSideEffectsCount,
- "SIDE EFFECT DUPLICATION: Side effects occurred %d times instead of 1. "+
- "This could mean duplicate emails sent, multiple payments charged, etc.", finalSideEffectsCount)
-
- // Verify the workflow completed successfully
- finalWorkflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- require.NoError(t, err)
- require.Equal(t, store.WorkflowExecutionsStatusCompleted, finalWorkflow.Status,
- "Workflow should complete successfully")
-
-}
-
-// TestStepExecutionAtomicityWithFailures tests atomicity when database operations fail
-func TestStepExecutionAtomicityWithFailures(t *testing.T) {
- // This test would be more complex and would require mocking the store
- // to simulate failures during status updates after step execution.
- // For now, we'll focus on the basic atomicity test above.
- t.Skip("TODO: Implement test with simulated database failures during status updates")
-}
-
-// TestConcurrentStepExecution tests that multiple workers don't execute the same step
-func TestConcurrentStepExecution(t *testing.T) {
- // Arrange: Create engine with test clock
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- var stepExecutionCount int64
-
- // Create a workflow with a step that takes time to execute
- workflow := &atomicityTestWorkflow{
- engine: engine,
- name: "concurrent-test-workflow",
- stepFunc: func(ctx context.Context) (string, error) {
- _ = atomic.AddInt64(&stepExecutionCount, 1)
-
- // Simulate some work time
- time.Sleep(100 * time.Millisecond)
-
- return "concurrent-result", nil
- },
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
-
- // Start workflow
- executionID, err := workflow.Start(ctx, struct{}{})
- require.NoError(t, err)
-
- // Start multiple workers that might try to process the same workflow
- worker1ID := uid.New(uid.WorkerPrefix)
- worker2ID := uid.New(uid.WorkerPrefix)
-
- worker1, err := NewWorker(engine, WorkerConfig{
- WorkerID: worker1ID,
- Concurrency: 1,
- PollInterval: 50 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second,
- ClaimTimeout: 5 * time.Second,
- })
- require.NoError(t, err)
-
- worker2, err := NewWorker(engine, WorkerConfig{
- WorkerID: worker2ID,
- Concurrency: 1,
- PollInterval: 50 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second,
- ClaimTimeout: 5 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker1, workflow)
- require.NoError(t, err)
- err = RegisterWorkflow(worker2, workflow)
- require.NoError(t, err)
-
- err = worker1.Start(ctx)
- require.NoError(t, err)
- defer worker1.Shutdown(ctx)
-
- err = worker2.Start(ctx)
- require.NoError(t, err)
- defer worker2.Shutdown(ctx)
-
- // Trigger both workers to poll simultaneously
- require.Eventually(t, func() bool {
- testClock.Tick(100 * time.Millisecond)
- time.Sleep(20 * time.Millisecond)
-
- // Check if workflow completed
- currentStatus, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- if err != nil {
- return false
- }
- return currentStatus.Status == store.WorkflowExecutionsStatusCompleted
- }, 5*time.Second, 50*time.Millisecond, "Workflow should complete with concurrent workers")
-
- // Assert: Step should execute exactly once even with multiple workers
- finalExecutionCount := atomic.LoadInt64(&stepExecutionCount)
- require.Equal(t, int64(1), finalExecutionCount,
- "CONCURRENCY VIOLATION: Step executed %d times instead of 1. "+
- "Multiple workers executed the same step, violating exactly-once guarantees.", finalExecutionCount)
-
-}
-
-// atomicityTestWorkflow is a test workflow for testing step execution atomicity
-type atomicityTestWorkflow struct {
- engine *Engine
- name string
- stepFunc func(ctx context.Context) (string, error)
-}
-
-func (w *atomicityTestWorkflow) Name() string {
- return w.name
-}
-
-func (w *atomicityTestWorkflow) Run(ctx WorkflowContext, req any) error {
- _, err := Step(ctx, "atomic-step", w.stepFunc)
- return err
-}
-
-func (w *atomicityTestWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/step_idempotency_test.go b/go/pkg/hydra/step_idempotency_test.go
deleted file mode 100644
index e58113d926..0000000000
--- a/go/pkg/hydra/step_idempotency_test.go
+++ /dev/null
@@ -1,168 +0,0 @@
-package hydra
-
-import (
- "context"
- "sync/atomic"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
-)
-
-// TestStepIdempotencyDuringWorkerFailure guarantees that workflow steps are idempotent
-// and execute exactly once, even when workers fail and other workers resume the workflow.
-//
-// This prevents duplicate side effects like sending emails twice, processing payments
-// multiple times, or creating duplicate database records during worker handoffs.
-func TestStepIdempotencyDuringWorkerFailure(t *testing.T) {
- // Arrange: Create engine with test clock for deterministic timing
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- var stepExecutionCount int64
-
- // Create a workflow with one step that takes time to execute
- workflow := &testWorkflow{
- engine: engine,
- name: "idempotency-test-workflow",
- stepFunc: func(ctx context.Context) (string, error) {
- // This should only execute once, but the bug causes it to execute multiple times
- atomic.AddInt64(&stepExecutionCount, 1)
-
- // Step executes instantly - we'll control timing via test clock and worker coordination
- return "step-completed", nil
- },
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
-
- // Act: Start workflow using the preferred pattern
- executionID, err := workflow.Start(ctx, struct{}{})
- require.NoError(t, err)
-
- // Start first worker to begin processing
- worker1, err := NewWorker(engine, WorkerConfig{
- Concurrency: 1,
- PollInterval: 50 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second, // Short heartbeat for faster cleanup
- ClaimTimeout: 2 * time.Second, // Short timeout to simulate crash
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker1, workflow)
- require.NoError(t, err)
-
- err = worker1.Start(ctx)
- require.NoError(t, err)
-
- // Let worker1 start processing - advance test clock to trigger polling
- testClock.Tick(100 * time.Millisecond) // Trigger initial poll
-
- // Give a brief moment for worker1 to process (this is unavoidable for goroutine coordination)
- time.Sleep(50 * time.Millisecond)
-
- // Keep triggering polls until workflow is picked up
- for i := 0; i < 10; i++ {
- testClock.Tick(100 * time.Millisecond)
- time.Sleep(10 * time.Millisecond)
-
- // Check if workflow has been picked up
- currentStatus, getErr := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- require.NoError(t, getErr)
- if currentStatus.Status != store.WorkflowExecutionsStatusPending {
- break
- }
- }
-
- // Check that workflow is being processed
- _, err = store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- require.NoError(t, err)
-
- // Simulate worker1 crash by shutting it down
- err = worker1.Shutdown(context.Background())
- require.NoError(t, err)
-
- // Advance time to expire the lease and trigger cleanup
- // Leases expire after ClaimTimeout (2 seconds), cleanup runs every HeartbeatInterval * 2 (2 seconds)
- testClock.Tick(3 * time.Second) // Advance past lease expiration + cleanup interval
-
- // Start worker2 to take over the workflow
- worker2, err := NewWorker(engine, WorkerConfig{
- Concurrency: 1,
- PollInterval: 50 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second, // Short heartbeat for faster cleanup
- ClaimTimeout: 5 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker2, workflow)
- require.NoError(t, err)
-
- err = worker2.Start(ctx)
- require.NoError(t, err)
- defer worker2.Shutdown(ctx)
-
- // Advance time to trigger worker2 polling and cleanup detection
- testClock.Tick(200 * time.Millisecond) // Trigger worker2 polling
-
- // Keep triggering polls and cleanup until workflow is picked up by worker2
- for i := 0; i < 20; i++ {
- testClock.Tick(200 * time.Millisecond) // Trigger polling and cleanup
- time.Sleep(10 * time.Millisecond)
-
- // Check if workflow has been picked up
- currentStatus, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- require.NoError(t, err)
- if currentStatus.Status != store.WorkflowExecutionsStatusPending {
- break
- }
- }
-
- // Wait for worker2 to complete the workflow
- finalResult := waitForWorkflowCompletion(t, engine, executionID, 3*time.Second)
- // Check final step execution count
- finalCount := atomic.LoadInt64(&stepExecutionCount)
-
- // Assert: Step idempotency - should execute exactly once despite worker failure
- require.Equal(t, int64(1), finalCount,
- "STEP IDEMPOTENCY VIOLATION: Step executed %d times instead of 1. "+
- "This could cause duplicate side effects like sending emails twice, "+
- "processing payments multiple times, or creating duplicate records.", finalCount)
-
- // Verify workflow completed successfully
- require.Equal(t, store.WorkflowExecutionsStatusCompleted, finalResult.Status, "Workflow should complete successfully despite worker crash")
-}
-
-// testWorkflow is a minimal workflow for testing step idempotency
-type testWorkflow struct {
- engine *Engine
- name string
- stepFunc func(ctx context.Context) (string, error)
-}
-
-func (w *testWorkflow) Name() string {
- return w.name
-}
-
-func (w *testWorkflow) Run(ctx WorkflowContext, req any) error {
- _, err := Step(ctx, "test-step", w.stepFunc)
- return err
-}
-
-// Start is a convenience method that starts this workflow using the embedded engine
-// This encourages a cleaner API pattern: workflow.Start() instead of engine.StartWorkflow()
-func (w *testWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/store/db.go b/go/pkg/hydra/store/db.go
deleted file mode 100644
index a52622d0b3..0000000000
--- a/go/pkg/hydra/store/db.go
+++ /dev/null
@@ -1,27 +0,0 @@
-package store
-
-import (
- "context"
- "database/sql"
-)
-
-type DBTX interface {
- ExecContext(context.Context, string, ...interface{}) (sql.Result, error)
- PrepareContext(context.Context, string) (*sql.Stmt, error)
- QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error)
- QueryRowContext(context.Context, string, ...interface{}) *sql.Row
-}
-
-func New(db DBTX) *Queries {
- return &Queries{db: db}
-}
-
-type Queries struct {
- db DBTX
-}
-
-func (q *Queries) WithTx(tx *sql.Tx) *Queries {
- return &Queries{
- db: tx,
- }
-}
diff --git a/go/pkg/hydra/store/generate.go b/go/pkg/hydra/store/generate.go
deleted file mode 100644
index 712573026b..0000000000
--- a/go/pkg/hydra/store/generate.go
+++ /dev/null
@@ -1,6 +0,0 @@
-package store
-
-//go:generate sqlc generate -f sqlc.json
-// we copy all of the relevant bits into queries.go and don't want the default
-// exports that get generated
-//go:generate rm delete_me.go
diff --git a/go/pkg/hydra/store/models.go b/go/pkg/hydra/store/models.go
deleted file mode 100644
index e948928a3d..0000000000
--- a/go/pkg/hydra/store/models.go
+++ /dev/null
@@ -1,245 +0,0 @@
-// Code generated by sqlc. DO NOT EDIT.
-// versions:
-// sqlc v1.29.0
-
-package store
-
-import (
- "database/sql"
- "database/sql/driver"
- "fmt"
-)
-
-type LeasesKind string
-
-const (
- LeasesKindWorkflow LeasesKind = "workflow"
- LeasesKindStep LeasesKind = "step"
- LeasesKindCronJob LeasesKind = "cron_job"
-)
-
-func (e *LeasesKind) Scan(src interface{}) error {
- switch s := src.(type) {
- case []byte:
- *e = LeasesKind(s)
- case string:
- *e = LeasesKind(s)
- default:
- return fmt.Errorf("unsupported scan type for LeasesKind: %T", src)
- }
- return nil
-}
-
-type NullLeasesKind struct {
- LeasesKind LeasesKind `json:"leases_kind"`
- Valid bool `json:"valid"` // Valid is true if LeasesKind is not NULL
-}
-
-// Scan implements the Scanner interface.
-func (ns *NullLeasesKind) Scan(value interface{}) error {
- if value == nil {
- ns.LeasesKind, ns.Valid = "", false
- return nil
- }
- ns.Valid = true
- return ns.LeasesKind.Scan(value)
-}
-
-// Value implements the driver Valuer interface.
-func (ns NullLeasesKind) Value() (driver.Value, error) {
- if !ns.Valid {
- return nil, nil
- }
- return string(ns.LeasesKind), nil
-}
-
-type WorkflowExecutionsStatus string
-
-const (
- WorkflowExecutionsStatusPending WorkflowExecutionsStatus = "pending"
- WorkflowExecutionsStatusRunning WorkflowExecutionsStatus = "running"
- WorkflowExecutionsStatusSleeping WorkflowExecutionsStatus = "sleeping"
- WorkflowExecutionsStatusCompleted WorkflowExecutionsStatus = "completed"
- WorkflowExecutionsStatusFailed WorkflowExecutionsStatus = "failed"
-)
-
-func (e *WorkflowExecutionsStatus) Scan(src interface{}) error {
- switch s := src.(type) {
- case []byte:
- *e = WorkflowExecutionsStatus(s)
- case string:
- *e = WorkflowExecutionsStatus(s)
- default:
- return fmt.Errorf("unsupported scan type for WorkflowExecutionsStatus: %T", src)
- }
- return nil
-}
-
-type NullWorkflowExecutionsStatus struct {
- WorkflowExecutionsStatus WorkflowExecutionsStatus `json:"workflow_executions_status"`
- Valid bool `json:"valid"` // Valid is true if WorkflowExecutionsStatus is not NULL
-}
-
-// Scan implements the Scanner interface.
-func (ns *NullWorkflowExecutionsStatus) Scan(value interface{}) error {
- if value == nil {
- ns.WorkflowExecutionsStatus, ns.Valid = "", false
- return nil
- }
- ns.Valid = true
- return ns.WorkflowExecutionsStatus.Scan(value)
-}
-
-// Value implements the driver Valuer interface.
-func (ns NullWorkflowExecutionsStatus) Value() (driver.Value, error) {
- if !ns.Valid {
- return nil, nil
- }
- return string(ns.WorkflowExecutionsStatus), nil
-}
-
-type WorkflowExecutionsTriggerType string
-
-const (
- WorkflowExecutionsTriggerTypeManual WorkflowExecutionsTriggerType = "manual"
- WorkflowExecutionsTriggerTypeCron WorkflowExecutionsTriggerType = "cron"
- WorkflowExecutionsTriggerTypeEvent WorkflowExecutionsTriggerType = "event"
- WorkflowExecutionsTriggerTypeApi WorkflowExecutionsTriggerType = "api"
-)
-
-func (e *WorkflowExecutionsTriggerType) Scan(src interface{}) error {
- switch s := src.(type) {
- case []byte:
- *e = WorkflowExecutionsTriggerType(s)
- case string:
- *e = WorkflowExecutionsTriggerType(s)
- default:
- return fmt.Errorf("unsupported scan type for WorkflowExecutionsTriggerType: %T", src)
- }
- return nil
-}
-
-type NullWorkflowExecutionsTriggerType struct {
- WorkflowExecutionsTriggerType WorkflowExecutionsTriggerType `json:"workflow_executions_trigger_type"`
- Valid bool `json:"valid"` // Valid is true if WorkflowExecutionsTriggerType is not NULL
-}
-
-// Scan implements the Scanner interface.
-func (ns *NullWorkflowExecutionsTriggerType) Scan(value interface{}) error {
- if value == nil {
- ns.WorkflowExecutionsTriggerType, ns.Valid = "", false
- return nil
- }
- ns.Valid = true
- return ns.WorkflowExecutionsTriggerType.Scan(value)
-}
-
-// Value implements the driver Valuer interface.
-func (ns NullWorkflowExecutionsTriggerType) Value() (driver.Value, error) {
- if !ns.Valid {
- return nil, nil
- }
- return string(ns.WorkflowExecutionsTriggerType), nil
-}
-
-type WorkflowStepsStatus string
-
-const (
- WorkflowStepsStatusPending WorkflowStepsStatus = "pending"
- WorkflowStepsStatusRunning WorkflowStepsStatus = "running"
- WorkflowStepsStatusCompleted WorkflowStepsStatus = "completed"
- WorkflowStepsStatusFailed WorkflowStepsStatus = "failed"
-)
-
-func (e *WorkflowStepsStatus) Scan(src interface{}) error {
- switch s := src.(type) {
- case []byte:
- *e = WorkflowStepsStatus(s)
- case string:
- *e = WorkflowStepsStatus(s)
- default:
- return fmt.Errorf("unsupported scan type for WorkflowStepsStatus: %T", src)
- }
- return nil
-}
-
-type NullWorkflowStepsStatus struct {
- WorkflowStepsStatus WorkflowStepsStatus `json:"workflow_steps_status"`
- Valid bool `json:"valid"` // Valid is true if WorkflowStepsStatus is not NULL
-}
-
-// Scan implements the Scanner interface.
-func (ns *NullWorkflowStepsStatus) Scan(value interface{}) error {
- if value == nil {
- ns.WorkflowStepsStatus, ns.Valid = "", false
- return nil
- }
- ns.Valid = true
- return ns.WorkflowStepsStatus.Scan(value)
-}
-
-// Value implements the driver Valuer interface.
-func (ns NullWorkflowStepsStatus) Value() (driver.Value, error) {
- if !ns.Valid {
- return nil, nil
- }
- return string(ns.WorkflowStepsStatus), nil
-}
-
-type CronJob struct {
- ID string `db:"id" json:"id"`
- Name string `db:"name" json:"name"`
- CronSpec string `db:"cron_spec" json:"cron_spec"`
- Namespace string `db:"namespace" json:"namespace"`
- WorkflowName sql.NullString `db:"workflow_name" json:"workflow_name"`
- Enabled bool `db:"enabled" json:"enabled"`
- CreatedAt int64 `db:"created_at" json:"created_at"`
- UpdatedAt int64 `db:"updated_at" json:"updated_at"`
- LastRunAt sql.NullInt64 `db:"last_run_at" json:"last_run_at"`
- NextRunAt int64 `db:"next_run_at" json:"next_run_at"`
-}
-
-type Lease struct {
- ResourceID string `db:"resource_id" json:"resource_id"`
- Kind LeasesKind `db:"kind" json:"kind"`
- Namespace string `db:"namespace" json:"namespace"`
- WorkerID string `db:"worker_id" json:"worker_id"`
- AcquiredAt int64 `db:"acquired_at" json:"acquired_at"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
- HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"`
-}
-
-type WorkflowExecution struct {
- ID string `db:"id" json:"id"`
- WorkflowName string `db:"workflow_name" json:"workflow_name"`
- Status WorkflowExecutionsStatus `db:"status" json:"status"`
- InputData []byte `db:"input_data" json:"input_data"`
- OutputData []byte `db:"output_data" json:"output_data"`
- ErrorMessage sql.NullString `db:"error_message" json:"error_message"`
- CreatedAt int64 `db:"created_at" json:"created_at"`
- StartedAt sql.NullInt64 `db:"started_at" json:"started_at"`
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- MaxAttempts int32 `db:"max_attempts" json:"max_attempts"`
- RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"`
- NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"`
- Namespace string `db:"namespace" json:"namespace"`
- TriggerType NullWorkflowExecutionsTriggerType `db:"trigger_type" json:"trigger_type"`
- TriggerSource sql.NullString `db:"trigger_source" json:"trigger_source"`
- SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"`
- TraceID sql.NullString `db:"trace_id" json:"trace_id"`
- SpanID sql.NullString `db:"span_id" json:"span_id"`
-}
-
-type WorkflowStep struct {
- ID string `db:"id" json:"id"`
- ExecutionID string `db:"execution_id" json:"execution_id"`
- StepName string `db:"step_name" json:"step_name"`
- Status WorkflowStepsStatus `db:"status" json:"status"`
- OutputData []byte `db:"output_data" json:"output_data"`
- ErrorMessage sql.NullString `db:"error_message" json:"error_message"`
- StartedAt sql.NullInt64 `db:"started_at" json:"started_at"`
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- MaxAttempts int32 `db:"max_attempts" json:"max_attempts"`
- RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"`
- Namespace string `db:"namespace" json:"namespace"`
-}
diff --git a/go/pkg/hydra/store/querier.go b/go/pkg/hydra/store/querier.go
deleted file mode 100644
index a9a208732a..0000000000
--- a/go/pkg/hydra/store/querier.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Code generated by sqlc. DO NOT EDIT.
-// versions:
-// sqlc v1.29.0
-
-package store
-
-import (
- "context"
-)
-
-type Querier interface {
- CleanupExpiredLeases(ctx context.Context, db DBTX, arg CleanupExpiredLeasesParams) error
- CompleteWorkflow(ctx context.Context, db DBTX, arg CompleteWorkflowParams) error
- CreateCronJob(ctx context.Context, db DBTX, arg CreateCronJobParams) error
- CreateLease(ctx context.Context, db DBTX, arg CreateLeaseParams) error
- CreateStep(ctx context.Context, db DBTX, arg CreateStepParams) error
- CreateWorkflow(ctx context.Context, db DBTX, arg CreateWorkflowParams) error
- GetCompletedStep(ctx context.Context, db DBTX, arg GetCompletedStepParams) (WorkflowStep, error)
- GetCronJob(ctx context.Context, db DBTX, arg GetCronJobParams) (CronJob, error)
- GetCronJobs(ctx context.Context, db DBTX, namespace string) ([]CronJob, error)
- GetDueCronJobs(ctx context.Context, db DBTX, arg GetDueCronJobsParams) ([]CronJob, error)
- GetLease(ctx context.Context, db DBTX, arg GetLeaseParams) (Lease, error)
- GetPendingWorkflows(ctx context.Context, db DBTX, arg GetPendingWorkflowsParams) ([]WorkflowExecution, error)
- GetPendingWorkflowsFiltered(ctx context.Context, db DBTX, arg GetPendingWorkflowsFilteredParams) ([]WorkflowExecution, error)
- GetSleepingWorkflows(ctx context.Context, db DBTX, arg GetSleepingWorkflowsParams) ([]WorkflowExecution, error)
- GetStep(ctx context.Context, db DBTX, arg GetStepParams) (WorkflowStep, error)
- GetWorkflow(ctx context.Context, db DBTX, arg GetWorkflowParams) (WorkflowExecution, error)
- HeartbeatLease(ctx context.Context, db DBTX, arg HeartbeatLeaseParams) error
- ReleaseLease(ctx context.Context, db DBTX, arg ReleaseLeaseParams) error
- ResetOrphanedWorkflows(ctx context.Context, db DBTX, arg ResetOrphanedWorkflowsParams) error
- SleepWorkflow(ctx context.Context, db DBTX, arg SleepWorkflowParams) error
- UpdateCronJob(ctx context.Context, db DBTX, arg UpdateCronJobParams) error
- UpdateCronJobLastRun(ctx context.Context, db DBTX, arg UpdateCronJobLastRunParams) error
- UpdateLease(ctx context.Context, db DBTX, arg UpdateLeaseParams) error
- UpdateStepStatus(ctx context.Context, db DBTX, arg UpdateStepStatusParams) error
- UpdateStepStatusWithLease(ctx context.Context, db DBTX, arg UpdateStepStatusWithLeaseParams) error
- UpdateWorkflowFields(ctx context.Context, db DBTX, arg UpdateWorkflowFieldsParams) error
- UpdateWorkflowToRunning(ctx context.Context, db DBTX, arg UpdateWorkflowToRunningParams) error
-}
-
-var _ Querier = (*Queries)(nil)
diff --git a/go/pkg/hydra/store/queries.go b/go/pkg/hydra/store/queries.go
deleted file mode 100644
index f7208632a4..0000000000
--- a/go/pkg/hydra/store/queries.go
+++ /dev/null
@@ -1,22 +0,0 @@
-package store
-
-// Query provides access to the generated database queries defined in the SQL files
-//
-// Example usage:
-//
-// import (
-// "context"
-// "database/sql"
-// "github.com/unkeyed/unkey/go/pkg/hydra/store"
-// )
-//
-// func GetWorkflow(ctx context.Context, db *sql.DB, namespace, id string) (store.WorkflowExecution, error) {
-// return store.Query.GetWorkflow(ctx, db, store.GetWorkflowParams{
-// ID: id,
-// Namespace: namespace,
-// })
-// }
-//
-// The Query object contains all the database operations defined in the SQL files
-// and automatically generated by sqlc.
-var Query Querier = &Queries{db: nil}
diff --git a/go/pkg/hydra/store/queries/workflows.sql b/go/pkg/hydra/store/queries/workflows.sql
deleted file mode 100644
index dde1870bfa..0000000000
--- a/go/pkg/hydra/store/queries/workflows.sql
+++ /dev/null
@@ -1,189 +0,0 @@
--- name: GetWorkflow :one
-SELECT * FROM workflow_executions
-WHERE id = ? AND namespace = ?;
-
--- name: CreateWorkflow :exec
-INSERT INTO workflow_executions (
- id, workflow_name, status, input_data, output_data, error_message,
- created_at, started_at, completed_at, max_attempts, remaining_attempts,
- next_retry_at, namespace, trigger_type, trigger_source, sleep_until,
- trace_id, span_id
-) VALUES (
- ?, ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?,
- ?, ?
-);
-
--- name: GetPendingWorkflows :many
-SELECT * FROM workflow_executions
-WHERE namespace = ?
- AND (
- status = 'pending'
- OR (status = 'failed' AND next_retry_at <= ?)
- OR (status = 'sleeping' AND sleep_until <= ?)
- )
-ORDER BY created_at ASC
-LIMIT ?;
-
--- name: GetPendingWorkflowsFiltered :many
-SELECT * FROM workflow_executions
-WHERE namespace = ?
- AND (
- status = 'pending'
- OR (status = 'failed' AND next_retry_at <= ?)
- OR (status = 'sleeping' AND sleep_until <= ?)
- )
- AND workflow_name IN (/*SLICE:workflow_names*/?)
-ORDER BY created_at ASC
-LIMIT ?;
-
--- name: UpdateWorkflowFields :exec
-UPDATE workflow_executions
-SET
- status = COALESCE(?, status),
- error_message = COALESCE(?, error_message),
- completed_at = COALESCE(?, completed_at),
- started_at = COALESCE(?, started_at),
- output_data = COALESCE(?, output_data),
- remaining_attempts = COALESCE(?, remaining_attempts),
- next_retry_at = COALESCE(?, next_retry_at),
- sleep_until = COALESCE(?, sleep_until)
-WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- );
-
--- name: UpdateStepStatus :exec
-UPDATE workflow_steps
-SET status = ?, completed_at = ?, output_data = ?, error_message = ?
-WHERE namespace = ? AND execution_id = ? AND step_name = ?;
-
--- name: SleepWorkflow :exec
-UPDATE workflow_executions
-SET status = 'sleeping', sleep_until = ?
-WHERE id = ? AND namespace = ?;
-
--- name: CreateStep :exec
-INSERT INTO workflow_steps (
- id, execution_id, step_name, status, output_data, error_message,
- started_at, completed_at, max_attempts, remaining_attempts, namespace
-) VALUES (
- ?, ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?
-);
-
--- name: GetStep :one
-SELECT * FROM workflow_steps
-WHERE namespace = ? AND execution_id = ? AND step_name = ?;
-
--- name: GetCompletedStep :one
-SELECT * FROM workflow_steps
-WHERE namespace = ? AND execution_id = ? AND step_name = ? AND status = 'completed';
-
--- name: UpdateStepStatusWithLease :exec
-UPDATE workflow_steps
-SET status = ?, completed_at = ?, output_data = ?, error_message = ?
-WHERE workflow_steps.namespace = ? AND execution_id = ? AND step_name = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- );
-
--- name: GetLease :one
-SELECT * FROM leases
-WHERE resource_id = ? AND kind = ?;
-
--- name: CreateLease :exec
-INSERT INTO leases (
- resource_id, kind, namespace, worker_id, acquired_at, expires_at, heartbeat_at
-) VALUES (
- ?, ?, ?, ?, ?, ?, ?
-);
-
--- name: UpdateLease :exec
-UPDATE leases
-SET worker_id = ?, acquired_at = ?, expires_at = ?, heartbeat_at = ?
-WHERE resource_id = ? AND kind = ? AND expires_at < ?;
-
--- name: UpdateWorkflowToRunning :exec
-UPDATE workflow_executions
-SET status = 'running',
- started_at = CASE WHEN started_at IS NULL THEN ? ELSE started_at END,
- sleep_until = NULL
-WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- );
-
--- name: CompleteWorkflow :exec
-UPDATE workflow_executions
-SET status = 'completed', completed_at = ?, output_data = ?
-WHERE id = ? AND namespace = ?;
-
--- name: HeartbeatLease :exec
-UPDATE leases
-SET heartbeat_at = ?, expires_at = ?
-WHERE resource_id = ? AND worker_id = ?;
-
--- name: ReleaseLease :exec
-DELETE FROM leases
-WHERE resource_id = ? AND worker_id = ?;
-
--- name: GetSleepingWorkflows :many
-SELECT * FROM workflow_executions
-WHERE namespace = ? AND status = 'sleeping' AND sleep_until <= ?
-ORDER BY sleep_until ASC;
-
--- name: GetCronJob :one
-SELECT * FROM cron_jobs
-WHERE namespace = ? AND name = ?;
-
--- name: GetCronJobs :many
-SELECT * FROM cron_jobs
-WHERE namespace = ? AND enabled = true;
-
--- name: GetDueCronJobs :many
-SELECT * FROM cron_jobs
-WHERE namespace = ? AND enabled = true AND next_run_at <= ?;
-
--- name: CreateCronJob :exec
-INSERT INTO cron_jobs (
- id, name, cron_spec, namespace, workflow_name, enabled,
- created_at, updated_at, last_run_at, next_run_at
-) VALUES (
- ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
-) ON DUPLICATE KEY UPDATE
- cron_spec = sqlc.arg('cron_spec'), enabled = sqlc.arg('enabled'), updated_at = sqlc.arg('updated_at'), next_run_at = sqlc.arg('next_run_at'), last_run_at = sqlc.arg('last_run_at'), next_run_at = sqlc.arg('next_run_at');
-
--- name: UpdateCronJob :exec
-UPDATE cron_jobs
-SET cron_spec = ?, workflow_name = ?, enabled = ?, updated_at = ?, next_run_at = ?
-WHERE id = ? AND namespace = ?;
-
--- name: UpdateCronJobLastRun :exec
-UPDATE cron_jobs
-SET last_run_at = ?, next_run_at = ?, updated_at = ?
-WHERE id = ? AND namespace = ?;
-
--- name: CleanupExpiredLeases :exec
-DELETE FROM leases
-WHERE namespace = ? AND expires_at < ?;
-
-
--- name: ResetOrphanedWorkflows :exec
-UPDATE workflow_executions
-SET status = 'pending'
-WHERE workflow_executions.namespace = ?
- AND workflow_executions.status = 'running'
- AND workflow_executions.id NOT IN (
- SELECT resource_id
- FROM leases
- WHERE kind = 'workflow' AND leases.namespace = ?
- );
-
diff --git a/go/pkg/hydra/store/schema.sql b/go/pkg/hydra/store/schema.sql
deleted file mode 100644
index ff2c054eb1..0000000000
--- a/go/pkg/hydra/store/schema.sql
+++ /dev/null
@@ -1,72 +0,0 @@
-CREATE DATABASE IF NOT EXISTS `hydra`;
-USE `hydra`;
-
-CREATE TABLE IF NOT EXISTS workflow_executions (
- id VARCHAR(255) PRIMARY KEY,
- workflow_name VARCHAR(255) NOT NULL,
- status ENUM('pending', 'running', 'sleeping', 'completed', 'failed') NOT NULL,
- input_data LONGBLOB, -- Large binary data for workflow inputs
- output_data MEDIUMBLOB, -- Medium binary data for workflow outputs
- error_message TEXT,
-
- created_at BIGINT NOT NULL,
- started_at BIGINT,
- completed_at BIGINT,
- max_attempts INT NOT NULL,
- remaining_attempts INT NOT NULL,
- next_retry_at BIGINT,
-
- namespace VARCHAR(255) NOT NULL,
-
- trigger_type ENUM('manual', 'cron', 'event', 'api'),
- trigger_source VARCHAR(255),
-
- sleep_until BIGINT,
-
- trace_id VARCHAR(255),
- span_id VARCHAR(255)
-);
-
-CREATE TABLE IF NOT EXISTS workflow_steps (
- id VARCHAR(255) PRIMARY KEY,
- execution_id VARCHAR(255) NOT NULL,
- step_name VARCHAR(255) NOT NULL,
- status ENUM('pending', 'running', 'completed', 'failed') NOT NULL,
- output_data LONGBLOB,
- error_message TEXT,
-
- started_at BIGINT,
- completed_at BIGINT,
-
- max_attempts INT NOT NULL,
- remaining_attempts INT NOT NULL,
-
- namespace VARCHAR(255) NOT NULL
-);
-
--- Cron Jobs Table
-CREATE TABLE IF NOT EXISTS `cron_jobs` (
- `id` varchar(255) NOT NULL,
- `name` varchar(255) NOT NULL,
- `cron_spec` varchar(255) NOT NULL,
- `namespace` varchar(255) NOT NULL,
- `workflow_name` varchar(255) DEFAULT NULL,
- `enabled` tinyint(1) NOT NULL DEFAULT '1',
- `created_at` bigint NOT NULL,
- `updated_at` bigint NOT NULL,
- `last_run_at` bigint DEFAULT NULL,
- `next_run_at` bigint NOT NULL,
- PRIMARY KEY (`id`),
- UNIQUE KEY `cron_jobs_name_namespace_idx` (`name`,`namespace`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
-
--- Leases Table (step kind included for GORM compatibility, though unused)
-CREATE TABLE IF NOT EXISTS leases (
- resource_id VARCHAR(255) PRIMARY KEY,
- kind ENUM('workflow', 'step', 'cron_job') NOT NULL,
- namespace VARCHAR(255) NOT NULL,
- worker_id VARCHAR(255) NOT NULL,
- acquired_at BIGINT NOT NULL,
- expires_at BIGINT NOT NULL,
- heartbeat_at BIGINT NOT NULL
-);
diff --git a/go/pkg/hydra/store/sqlc.json b/go/pkg/hydra/store/sqlc.json
deleted file mode 100644
index 7d861e85f8..0000000000
--- a/go/pkg/hydra/store/sqlc.json
+++ /dev/null
@@ -1,44 +0,0 @@
-{
- "version": "2",
- "sql": [
- {
- "engine": "mysql",
- "queries": "queries/",
- "schema": "schema.sql",
- "gen": {
- "go": {
- "package": "store",
- "out": ".",
- "emit_json_tags": true,
- "emit_db_tags": true,
- "emit_prepared_queries": false,
- "emit_interface": true,
- "emit_exact_table_names": false,
- "emit_empty_slices": true,
- "emit_methods_with_db_argument": true,
- "output_db_file_name": "delete_me",
- "overrides": [
- {
- "column": "workflow_executions.input_data",
- "go_type": {
- "type": "[]byte"
- }
- },
- {
- "column": "workflow_executions.output_data",
- "go_type": {
- "type": "[]byte"
- }
- },
- {
- "column": "workflow_steps.output_data",
- "go_type": {
- "type": "[]byte"
- }
- }
- ]
- }
- }
- }
- ]
-}
diff --git a/go/pkg/hydra/store/workflows.sql.go b/go/pkg/hydra/store/workflows.sql.go
deleted file mode 100644
index 40bbe1dfed..0000000000
--- a/go/pkg/hydra/store/workflows.sql.go
+++ /dev/null
@@ -1,962 +0,0 @@
-// Code generated by sqlc. DO NOT EDIT.
-// versions:
-// sqlc v1.29.0
-// source: workflows.sql
-
-package store
-
-import (
- "context"
- "database/sql"
-)
-
-const cleanupExpiredLeases = `-- name: CleanupExpiredLeases :exec
-DELETE FROM leases
-WHERE namespace = ? AND expires_at < ?
-`
-
-type CleanupExpiredLeasesParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
-}
-
-func (q *Queries) CleanupExpiredLeases(ctx context.Context, db DBTX, arg CleanupExpiredLeasesParams) error {
- _, err := db.ExecContext(ctx, cleanupExpiredLeases, arg.Namespace, arg.ExpiresAt)
- return err
-}
-
-const completeWorkflow = `-- name: CompleteWorkflow :exec
-UPDATE workflow_executions
-SET status = 'completed', completed_at = ?, output_data = ?
-WHERE id = ? AND namespace = ?
-`
-
-type CompleteWorkflowParams struct {
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- OutputData []byte `db:"output_data" json:"output_data"`
- ID string `db:"id" json:"id"`
- Namespace string `db:"namespace" json:"namespace"`
-}
-
-func (q *Queries) CompleteWorkflow(ctx context.Context, db DBTX, arg CompleteWorkflowParams) error {
- _, err := db.ExecContext(ctx, completeWorkflow,
- arg.CompletedAt,
- arg.OutputData,
- arg.ID,
- arg.Namespace,
- )
- return err
-}
-
-const createCronJob = `-- name: CreateCronJob :exec
-INSERT INTO cron_jobs (
- id, name, cron_spec, namespace, workflow_name, enabled,
- created_at, updated_at, last_run_at, next_run_at
-) VALUES (
- ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
-) ON DUPLICATE KEY UPDATE
- cron_spec = ?, enabled = ?, updated_at = ?, next_run_at = ?, last_run_at = ?, next_run_at = ?
-`
-
-type CreateCronJobParams struct {
- ID string `db:"id" json:"id"`
- Name string `db:"name" json:"name"`
- CronSpec string `db:"cron_spec" json:"cron_spec"`
- Namespace string `db:"namespace" json:"namespace"`
- WorkflowName sql.NullString `db:"workflow_name" json:"workflow_name"`
- Enabled bool `db:"enabled" json:"enabled"`
- CreatedAt int64 `db:"created_at" json:"created_at"`
- UpdatedAt int64 `db:"updated_at" json:"updated_at"`
- LastRunAt sql.NullInt64 `db:"last_run_at" json:"last_run_at"`
- NextRunAt int64 `db:"next_run_at" json:"next_run_at"`
-}
-
-func (q *Queries) CreateCronJob(ctx context.Context, db DBTX, arg CreateCronJobParams) error {
- _, err := db.ExecContext(ctx, createCronJob,
- arg.ID,
- arg.Name,
- arg.CronSpec,
- arg.Namespace,
- arg.WorkflowName,
- arg.Enabled,
- arg.CreatedAt,
- arg.UpdatedAt,
- arg.LastRunAt,
- arg.NextRunAt,
- arg.CronSpec,
- arg.Enabled,
- arg.UpdatedAt,
- arg.NextRunAt,
- arg.LastRunAt,
- arg.NextRunAt,
- )
- return err
-}
-
-const createLease = `-- name: CreateLease :exec
-INSERT INTO leases (
- resource_id, kind, namespace, worker_id, acquired_at, expires_at, heartbeat_at
-) VALUES (
- ?, ?, ?, ?, ?, ?, ?
-)
-`
-
-type CreateLeaseParams struct {
- ResourceID string `db:"resource_id" json:"resource_id"`
- Kind LeasesKind `db:"kind" json:"kind"`
- Namespace string `db:"namespace" json:"namespace"`
- WorkerID string `db:"worker_id" json:"worker_id"`
- AcquiredAt int64 `db:"acquired_at" json:"acquired_at"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
- HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"`
-}
-
-func (q *Queries) CreateLease(ctx context.Context, db DBTX, arg CreateLeaseParams) error {
- _, err := db.ExecContext(ctx, createLease,
- arg.ResourceID,
- arg.Kind,
- arg.Namespace,
- arg.WorkerID,
- arg.AcquiredAt,
- arg.ExpiresAt,
- arg.HeartbeatAt,
- )
- return err
-}
-
-const createStep = `-- name: CreateStep :exec
-INSERT INTO workflow_steps (
- id, execution_id, step_name, status, output_data, error_message,
- started_at, completed_at, max_attempts, remaining_attempts, namespace
-) VALUES (
- ?, ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?
-)
-`
-
-type CreateStepParams struct {
- ID string `db:"id" json:"id"`
- ExecutionID string `db:"execution_id" json:"execution_id"`
- StepName string `db:"step_name" json:"step_name"`
- Status WorkflowStepsStatus `db:"status" json:"status"`
- OutputData []byte `db:"output_data" json:"output_data"`
- ErrorMessage sql.NullString `db:"error_message" json:"error_message"`
- StartedAt sql.NullInt64 `db:"started_at" json:"started_at"`
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- MaxAttempts int32 `db:"max_attempts" json:"max_attempts"`
- RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"`
- Namespace string `db:"namespace" json:"namespace"`
-}
-
-func (q *Queries) CreateStep(ctx context.Context, db DBTX, arg CreateStepParams) error {
- _, err := db.ExecContext(ctx, createStep,
- arg.ID,
- arg.ExecutionID,
- arg.StepName,
- arg.Status,
- arg.OutputData,
- arg.ErrorMessage,
- arg.StartedAt,
- arg.CompletedAt,
- arg.MaxAttempts,
- arg.RemainingAttempts,
- arg.Namespace,
- )
- return err
-}
-
-const createWorkflow = `-- name: CreateWorkflow :exec
-INSERT INTO workflow_executions (
- id, workflow_name, status, input_data, output_data, error_message,
- created_at, started_at, completed_at, max_attempts, remaining_attempts,
- next_retry_at, namespace, trigger_type, trigger_source, sleep_until,
- trace_id, span_id
-) VALUES (
- ?, ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?,
- ?, ?, ?, ?, ?,
- ?, ?
-)
-`
-
-type CreateWorkflowParams struct {
- ID string `db:"id" json:"id"`
- WorkflowName string `db:"workflow_name" json:"workflow_name"`
- Status WorkflowExecutionsStatus `db:"status" json:"status"`
- InputData []byte `db:"input_data" json:"input_data"`
- OutputData []byte `db:"output_data" json:"output_data"`
- ErrorMessage sql.NullString `db:"error_message" json:"error_message"`
- CreatedAt int64 `db:"created_at" json:"created_at"`
- StartedAt sql.NullInt64 `db:"started_at" json:"started_at"`
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- MaxAttempts int32 `db:"max_attempts" json:"max_attempts"`
- RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"`
- NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"`
- Namespace string `db:"namespace" json:"namespace"`
- TriggerType NullWorkflowExecutionsTriggerType `db:"trigger_type" json:"trigger_type"`
- TriggerSource sql.NullString `db:"trigger_source" json:"trigger_source"`
- SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"`
- TraceID sql.NullString `db:"trace_id" json:"trace_id"`
- SpanID sql.NullString `db:"span_id" json:"span_id"`
-}
-
-func (q *Queries) CreateWorkflow(ctx context.Context, db DBTX, arg CreateWorkflowParams) error {
- _, err := db.ExecContext(ctx, createWorkflow,
- arg.ID,
- arg.WorkflowName,
- arg.Status,
- arg.InputData,
- arg.OutputData,
- arg.ErrorMessage,
- arg.CreatedAt,
- arg.StartedAt,
- arg.CompletedAt,
- arg.MaxAttempts,
- arg.RemainingAttempts,
- arg.NextRetryAt,
- arg.Namespace,
- arg.TriggerType,
- arg.TriggerSource,
- arg.SleepUntil,
- arg.TraceID,
- arg.SpanID,
- )
- return err
-}
-
-const getCompletedStep = `-- name: GetCompletedStep :one
-SELECT id, execution_id, step_name, status, output_data, error_message, started_at, completed_at, max_attempts, remaining_attempts, namespace FROM workflow_steps
-WHERE namespace = ? AND execution_id = ? AND step_name = ? AND status = 'completed'
-`
-
-type GetCompletedStepParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- ExecutionID string `db:"execution_id" json:"execution_id"`
- StepName string `db:"step_name" json:"step_name"`
-}
-
-func (q *Queries) GetCompletedStep(ctx context.Context, db DBTX, arg GetCompletedStepParams) (WorkflowStep, error) {
- row := db.QueryRowContext(ctx, getCompletedStep, arg.Namespace, arg.ExecutionID, arg.StepName)
- var i WorkflowStep
- err := row.Scan(
- &i.ID,
- &i.ExecutionID,
- &i.StepName,
- &i.Status,
- &i.OutputData,
- &i.ErrorMessage,
- &i.StartedAt,
- &i.CompletedAt,
- &i.MaxAttempts,
- &i.RemainingAttempts,
- &i.Namespace,
- )
- return i, err
-}
-
-const getCronJob = `-- name: GetCronJob :one
-SELECT id, name, cron_spec, namespace, workflow_name, enabled, created_at, updated_at, last_run_at, next_run_at FROM cron_jobs
-WHERE namespace = ? AND name = ?
-`
-
-type GetCronJobParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- Name string `db:"name" json:"name"`
-}
-
-func (q *Queries) GetCronJob(ctx context.Context, db DBTX, arg GetCronJobParams) (CronJob, error) {
- row := db.QueryRowContext(ctx, getCronJob, arg.Namespace, arg.Name)
- var i CronJob
- err := row.Scan(
- &i.ID,
- &i.Name,
- &i.CronSpec,
- &i.Namespace,
- &i.WorkflowName,
- &i.Enabled,
- &i.CreatedAt,
- &i.UpdatedAt,
- &i.LastRunAt,
- &i.NextRunAt,
- )
- return i, err
-}
-
-const getCronJobs = `-- name: GetCronJobs :many
-SELECT id, name, cron_spec, namespace, workflow_name, enabled, created_at, updated_at, last_run_at, next_run_at FROM cron_jobs
-WHERE namespace = ? AND enabled = true
-`
-
-func (q *Queries) GetCronJobs(ctx context.Context, db DBTX, namespace string) ([]CronJob, error) {
- rows, err := db.QueryContext(ctx, getCronJobs, namespace)
- if err != nil {
- return nil, err
- }
- defer rows.Close()
- items := []CronJob{}
- for rows.Next() {
- var i CronJob
- if err := rows.Scan(
- &i.ID,
- &i.Name,
- &i.CronSpec,
- &i.Namespace,
- &i.WorkflowName,
- &i.Enabled,
- &i.CreatedAt,
- &i.UpdatedAt,
- &i.LastRunAt,
- &i.NextRunAt,
- ); err != nil {
- return nil, err
- }
- items = append(items, i)
- }
- if err := rows.Close(); err != nil {
- return nil, err
- }
- if err := rows.Err(); err != nil {
- return nil, err
- }
- return items, nil
-}
-
-const getDueCronJobs = `-- name: GetDueCronJobs :many
-SELECT id, name, cron_spec, namespace, workflow_name, enabled, created_at, updated_at, last_run_at, next_run_at FROM cron_jobs
-WHERE namespace = ? AND enabled = true AND next_run_at <= ?
-`
-
-type GetDueCronJobsParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- NextRunAt int64 `db:"next_run_at" json:"next_run_at"`
-}
-
-func (q *Queries) GetDueCronJobs(ctx context.Context, db DBTX, arg GetDueCronJobsParams) ([]CronJob, error) {
- rows, err := db.QueryContext(ctx, getDueCronJobs, arg.Namespace, arg.NextRunAt)
- if err != nil {
- return nil, err
- }
- defer rows.Close()
- items := []CronJob{}
- for rows.Next() {
- var i CronJob
- if err := rows.Scan(
- &i.ID,
- &i.Name,
- &i.CronSpec,
- &i.Namespace,
- &i.WorkflowName,
- &i.Enabled,
- &i.CreatedAt,
- &i.UpdatedAt,
- &i.LastRunAt,
- &i.NextRunAt,
- ); err != nil {
- return nil, err
- }
- items = append(items, i)
- }
- if err := rows.Close(); err != nil {
- return nil, err
- }
- if err := rows.Err(); err != nil {
- return nil, err
- }
- return items, nil
-}
-
-const getLease = `-- name: GetLease :one
-SELECT resource_id, kind, namespace, worker_id, acquired_at, expires_at, heartbeat_at FROM leases
-WHERE resource_id = ? AND kind = ?
-`
-
-type GetLeaseParams struct {
- ResourceID string `db:"resource_id" json:"resource_id"`
- Kind LeasesKind `db:"kind" json:"kind"`
-}
-
-func (q *Queries) GetLease(ctx context.Context, db DBTX, arg GetLeaseParams) (Lease, error) {
- row := db.QueryRowContext(ctx, getLease, arg.ResourceID, arg.Kind)
- var i Lease
- err := row.Scan(
- &i.ResourceID,
- &i.Kind,
- &i.Namespace,
- &i.WorkerID,
- &i.AcquiredAt,
- &i.ExpiresAt,
- &i.HeartbeatAt,
- )
- return i, err
-}
-
-const getPendingWorkflows = `-- name: GetPendingWorkflows :many
-SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions
-WHERE namespace = ?
- AND (
- status = 'pending'
- OR (status = 'failed' AND next_retry_at <= ?)
- OR (status = 'sleeping' AND sleep_until <= ?)
- )
-ORDER BY created_at ASC
-LIMIT ?
-`
-
-type GetPendingWorkflowsParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"`
- SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"`
- Limit int32 `db:"limit" json:"limit"`
-}
-
-func (q *Queries) GetPendingWorkflows(ctx context.Context, db DBTX, arg GetPendingWorkflowsParams) ([]WorkflowExecution, error) {
- rows, err := db.QueryContext(ctx, getPendingWorkflows,
- arg.Namespace,
- arg.NextRetryAt,
- arg.SleepUntil,
- arg.Limit,
- )
- if err != nil {
- return nil, err
- }
- defer rows.Close()
- items := []WorkflowExecution{}
- for rows.Next() {
- var i WorkflowExecution
- if err := rows.Scan(
- &i.ID,
- &i.WorkflowName,
- &i.Status,
- &i.InputData,
- &i.OutputData,
- &i.ErrorMessage,
- &i.CreatedAt,
- &i.StartedAt,
- &i.CompletedAt,
- &i.MaxAttempts,
- &i.RemainingAttempts,
- &i.NextRetryAt,
- &i.Namespace,
- &i.TriggerType,
- &i.TriggerSource,
- &i.SleepUntil,
- &i.TraceID,
- &i.SpanID,
- ); err != nil {
- return nil, err
- }
- items = append(items, i)
- }
- if err := rows.Close(); err != nil {
- return nil, err
- }
- if err := rows.Err(); err != nil {
- return nil, err
- }
- return items, nil
-}
-
-const getPendingWorkflowsFiltered = `-- name: GetPendingWorkflowsFiltered :many
-SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions
-WHERE namespace = ?
- AND (
- status = 'pending'
- OR (status = 'failed' AND next_retry_at <= ?)
- OR (status = 'sleeping' AND sleep_until <= ?)
- )
- AND workflow_name IN (/*SLICE:workflow_names*/?)
-ORDER BY created_at ASC
-LIMIT ?
-`
-
-type GetPendingWorkflowsFilteredParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"`
- SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"`
- WorkflowName string `db:"workflow_name" json:"workflow_name"`
- Limit int32 `db:"limit" json:"limit"`
-}
-
-func (q *Queries) GetPendingWorkflowsFiltered(ctx context.Context, db DBTX, arg GetPendingWorkflowsFilteredParams) ([]WorkflowExecution, error) {
- rows, err := db.QueryContext(ctx, getPendingWorkflowsFiltered,
- arg.Namespace,
- arg.NextRetryAt,
- arg.SleepUntil,
- arg.WorkflowName,
- arg.Limit,
- )
- if err != nil {
- return nil, err
- }
- defer rows.Close()
- items := []WorkflowExecution{}
- for rows.Next() {
- var i WorkflowExecution
- if err := rows.Scan(
- &i.ID,
- &i.WorkflowName,
- &i.Status,
- &i.InputData,
- &i.OutputData,
- &i.ErrorMessage,
- &i.CreatedAt,
- &i.StartedAt,
- &i.CompletedAt,
- &i.MaxAttempts,
- &i.RemainingAttempts,
- &i.NextRetryAt,
- &i.Namespace,
- &i.TriggerType,
- &i.TriggerSource,
- &i.SleepUntil,
- &i.TraceID,
- &i.SpanID,
- ); err != nil {
- return nil, err
- }
- items = append(items, i)
- }
- if err := rows.Close(); err != nil {
- return nil, err
- }
- if err := rows.Err(); err != nil {
- return nil, err
- }
- return items, nil
-}
-
-const getSleepingWorkflows = `-- name: GetSleepingWorkflows :many
-SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions
-WHERE namespace = ? AND status = 'sleeping' AND sleep_until <= ?
-ORDER BY sleep_until ASC
-`
-
-type GetSleepingWorkflowsParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"`
-}
-
-func (q *Queries) GetSleepingWorkflows(ctx context.Context, db DBTX, arg GetSleepingWorkflowsParams) ([]WorkflowExecution, error) {
- rows, err := db.QueryContext(ctx, getSleepingWorkflows, arg.Namespace, arg.SleepUntil)
- if err != nil {
- return nil, err
- }
- defer rows.Close()
- items := []WorkflowExecution{}
- for rows.Next() {
- var i WorkflowExecution
- if err := rows.Scan(
- &i.ID,
- &i.WorkflowName,
- &i.Status,
- &i.InputData,
- &i.OutputData,
- &i.ErrorMessage,
- &i.CreatedAt,
- &i.StartedAt,
- &i.CompletedAt,
- &i.MaxAttempts,
- &i.RemainingAttempts,
- &i.NextRetryAt,
- &i.Namespace,
- &i.TriggerType,
- &i.TriggerSource,
- &i.SleepUntil,
- &i.TraceID,
- &i.SpanID,
- ); err != nil {
- return nil, err
- }
- items = append(items, i)
- }
- if err := rows.Close(); err != nil {
- return nil, err
- }
- if err := rows.Err(); err != nil {
- return nil, err
- }
- return items, nil
-}
-
-const getStep = `-- name: GetStep :one
-SELECT id, execution_id, step_name, status, output_data, error_message, started_at, completed_at, max_attempts, remaining_attempts, namespace FROM workflow_steps
-WHERE namespace = ? AND execution_id = ? AND step_name = ?
-`
-
-type GetStepParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- ExecutionID string `db:"execution_id" json:"execution_id"`
- StepName string `db:"step_name" json:"step_name"`
-}
-
-func (q *Queries) GetStep(ctx context.Context, db DBTX, arg GetStepParams) (WorkflowStep, error) {
- row := db.QueryRowContext(ctx, getStep, arg.Namespace, arg.ExecutionID, arg.StepName)
- var i WorkflowStep
- err := row.Scan(
- &i.ID,
- &i.ExecutionID,
- &i.StepName,
- &i.Status,
- &i.OutputData,
- &i.ErrorMessage,
- &i.StartedAt,
- &i.CompletedAt,
- &i.MaxAttempts,
- &i.RemainingAttempts,
- &i.Namespace,
- )
- return i, err
-}
-
-const getWorkflow = `-- name: GetWorkflow :one
-SELECT id, workflow_name, status, input_data, output_data, error_message, created_at, started_at, completed_at, max_attempts, remaining_attempts, next_retry_at, namespace, trigger_type, trigger_source, sleep_until, trace_id, span_id FROM workflow_executions
-WHERE id = ? AND namespace = ?
-`
-
-type GetWorkflowParams struct {
- ID string `db:"id" json:"id"`
- Namespace string `db:"namespace" json:"namespace"`
-}
-
-func (q *Queries) GetWorkflow(ctx context.Context, db DBTX, arg GetWorkflowParams) (WorkflowExecution, error) {
- row := db.QueryRowContext(ctx, getWorkflow, arg.ID, arg.Namespace)
- var i WorkflowExecution
- err := row.Scan(
- &i.ID,
- &i.WorkflowName,
- &i.Status,
- &i.InputData,
- &i.OutputData,
- &i.ErrorMessage,
- &i.CreatedAt,
- &i.StartedAt,
- &i.CompletedAt,
- &i.MaxAttempts,
- &i.RemainingAttempts,
- &i.NextRetryAt,
- &i.Namespace,
- &i.TriggerType,
- &i.TriggerSource,
- &i.SleepUntil,
- &i.TraceID,
- &i.SpanID,
- )
- return i, err
-}
-
-const heartbeatLease = `-- name: HeartbeatLease :exec
-UPDATE leases
-SET heartbeat_at = ?, expires_at = ?
-WHERE resource_id = ? AND worker_id = ?
-`
-
-type HeartbeatLeaseParams struct {
- HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
- ResourceID string `db:"resource_id" json:"resource_id"`
- WorkerID string `db:"worker_id" json:"worker_id"`
-}
-
-func (q *Queries) HeartbeatLease(ctx context.Context, db DBTX, arg HeartbeatLeaseParams) error {
- _, err := db.ExecContext(ctx, heartbeatLease,
- arg.HeartbeatAt,
- arg.ExpiresAt,
- arg.ResourceID,
- arg.WorkerID,
- )
- return err
-}
-
-const releaseLease = `-- name: ReleaseLease :exec
-DELETE FROM leases
-WHERE resource_id = ? AND worker_id = ?
-`
-
-type ReleaseLeaseParams struct {
- ResourceID string `db:"resource_id" json:"resource_id"`
- WorkerID string `db:"worker_id" json:"worker_id"`
-}
-
-func (q *Queries) ReleaseLease(ctx context.Context, db DBTX, arg ReleaseLeaseParams) error {
- _, err := db.ExecContext(ctx, releaseLease, arg.ResourceID, arg.WorkerID)
- return err
-}
-
-const resetOrphanedWorkflows = `-- name: ResetOrphanedWorkflows :exec
-UPDATE workflow_executions
-SET status = 'pending'
-WHERE workflow_executions.namespace = ?
- AND workflow_executions.status = 'running'
- AND workflow_executions.id NOT IN (
- SELECT resource_id
- FROM leases
- WHERE kind = 'workflow' AND leases.namespace = ?
- )
-`
-
-type ResetOrphanedWorkflowsParams struct {
- Namespace string `db:"namespace" json:"namespace"`
- Namespace_2 string `db:"namespace_2" json:"namespace_2"`
-}
-
-func (q *Queries) ResetOrphanedWorkflows(ctx context.Context, db DBTX, arg ResetOrphanedWorkflowsParams) error {
- _, err := db.ExecContext(ctx, resetOrphanedWorkflows, arg.Namespace, arg.Namespace_2)
- return err
-}
-
-const sleepWorkflow = `-- name: SleepWorkflow :exec
-UPDATE workflow_executions
-SET status = 'sleeping', sleep_until = ?
-WHERE id = ? AND namespace = ?
-`
-
-type SleepWorkflowParams struct {
- SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"`
- ID string `db:"id" json:"id"`
- Namespace string `db:"namespace" json:"namespace"`
-}
-
-func (q *Queries) SleepWorkflow(ctx context.Context, db DBTX, arg SleepWorkflowParams) error {
- _, err := db.ExecContext(ctx, sleepWorkflow, arg.SleepUntil, arg.ID, arg.Namespace)
- return err
-}
-
-const updateCronJob = `-- name: UpdateCronJob :exec
-UPDATE cron_jobs
-SET cron_spec = ?, workflow_name = ?, enabled = ?, updated_at = ?, next_run_at = ?
-WHERE id = ? AND namespace = ?
-`
-
-type UpdateCronJobParams struct {
- CronSpec string `db:"cron_spec" json:"cron_spec"`
- WorkflowName sql.NullString `db:"workflow_name" json:"workflow_name"`
- Enabled bool `db:"enabled" json:"enabled"`
- UpdatedAt int64 `db:"updated_at" json:"updated_at"`
- NextRunAt int64 `db:"next_run_at" json:"next_run_at"`
- ID string `db:"id" json:"id"`
- Namespace string `db:"namespace" json:"namespace"`
-}
-
-func (q *Queries) UpdateCronJob(ctx context.Context, db DBTX, arg UpdateCronJobParams) error {
- _, err := db.ExecContext(ctx, updateCronJob,
- arg.CronSpec,
- arg.WorkflowName,
- arg.Enabled,
- arg.UpdatedAt,
- arg.NextRunAt,
- arg.ID,
- arg.Namespace,
- )
- return err
-}
-
-const updateCronJobLastRun = `-- name: UpdateCronJobLastRun :exec
-UPDATE cron_jobs
-SET last_run_at = ?, next_run_at = ?, updated_at = ?
-WHERE id = ? AND namespace = ?
-`
-
-type UpdateCronJobLastRunParams struct {
- LastRunAt sql.NullInt64 `db:"last_run_at" json:"last_run_at"`
- NextRunAt int64 `db:"next_run_at" json:"next_run_at"`
- UpdatedAt int64 `db:"updated_at" json:"updated_at"`
- ID string `db:"id" json:"id"`
- Namespace string `db:"namespace" json:"namespace"`
-}
-
-func (q *Queries) UpdateCronJobLastRun(ctx context.Context, db DBTX, arg UpdateCronJobLastRunParams) error {
- _, err := db.ExecContext(ctx, updateCronJobLastRun,
- arg.LastRunAt,
- arg.NextRunAt,
- arg.UpdatedAt,
- arg.ID,
- arg.Namespace,
- )
- return err
-}
-
-const updateLease = `-- name: UpdateLease :exec
-UPDATE leases
-SET worker_id = ?, acquired_at = ?, expires_at = ?, heartbeat_at = ?
-WHERE resource_id = ? AND kind = ? AND expires_at < ?
-`
-
-type UpdateLeaseParams struct {
- WorkerID string `db:"worker_id" json:"worker_id"`
- AcquiredAt int64 `db:"acquired_at" json:"acquired_at"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
- HeartbeatAt int64 `db:"heartbeat_at" json:"heartbeat_at"`
- ResourceID string `db:"resource_id" json:"resource_id"`
- Kind LeasesKind `db:"kind" json:"kind"`
- ExpiresAt_2 int64 `db:"expires_at_2" json:"expires_at_2"`
-}
-
-func (q *Queries) UpdateLease(ctx context.Context, db DBTX, arg UpdateLeaseParams) error {
- _, err := db.ExecContext(ctx, updateLease,
- arg.WorkerID,
- arg.AcquiredAt,
- arg.ExpiresAt,
- arg.HeartbeatAt,
- arg.ResourceID,
- arg.Kind,
- arg.ExpiresAt_2,
- )
- return err
-}
-
-const updateStepStatus = `-- name: UpdateStepStatus :exec
-UPDATE workflow_steps
-SET status = ?, completed_at = ?, output_data = ?, error_message = ?
-WHERE namespace = ? AND execution_id = ? AND step_name = ?
-`
-
-type UpdateStepStatusParams struct {
- Status WorkflowStepsStatus `db:"status" json:"status"`
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- OutputData []byte `db:"output_data" json:"output_data"`
- ErrorMessage sql.NullString `db:"error_message" json:"error_message"`
- Namespace string `db:"namespace" json:"namespace"`
- ExecutionID string `db:"execution_id" json:"execution_id"`
- StepName string `db:"step_name" json:"step_name"`
-}
-
-func (q *Queries) UpdateStepStatus(ctx context.Context, db DBTX, arg UpdateStepStatusParams) error {
- _, err := db.ExecContext(ctx, updateStepStatus,
- arg.Status,
- arg.CompletedAt,
- arg.OutputData,
- arg.ErrorMessage,
- arg.Namespace,
- arg.ExecutionID,
- arg.StepName,
- )
- return err
-}
-
-const updateStepStatusWithLease = `-- name: UpdateStepStatusWithLease :exec
-UPDATE workflow_steps
-SET status = ?, completed_at = ?, output_data = ?, error_message = ?
-WHERE workflow_steps.namespace = ? AND execution_id = ? AND step_name = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )
-`
-
-type UpdateStepStatusWithLeaseParams struct {
- Status WorkflowStepsStatus `db:"status" json:"status"`
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- OutputData []byte `db:"output_data" json:"output_data"`
- ErrorMessage sql.NullString `db:"error_message" json:"error_message"`
- Namespace string `db:"namespace" json:"namespace"`
- ExecutionID string `db:"execution_id" json:"execution_id"`
- StepName string `db:"step_name" json:"step_name"`
- ResourceID string `db:"resource_id" json:"resource_id"`
- WorkerID string `db:"worker_id" json:"worker_id"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
-}
-
-func (q *Queries) UpdateStepStatusWithLease(ctx context.Context, db DBTX, arg UpdateStepStatusWithLeaseParams) error {
- _, err := db.ExecContext(ctx, updateStepStatusWithLease,
- arg.Status,
- arg.CompletedAt,
- arg.OutputData,
- arg.ErrorMessage,
- arg.Namespace,
- arg.ExecutionID,
- arg.StepName,
- arg.ResourceID,
- arg.WorkerID,
- arg.ExpiresAt,
- )
- return err
-}
-
-const updateWorkflowFields = `-- name: UpdateWorkflowFields :exec
-UPDATE workflow_executions
-SET
- status = COALESCE(?, status),
- error_message = COALESCE(?, error_message),
- completed_at = COALESCE(?, completed_at),
- started_at = COALESCE(?, started_at),
- output_data = COALESCE(?, output_data),
- remaining_attempts = COALESCE(?, remaining_attempts),
- next_retry_at = COALESCE(?, next_retry_at),
- sleep_until = COALESCE(?, sleep_until)
-WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )
-`
-
-type UpdateWorkflowFieldsParams struct {
- Status WorkflowExecutionsStatus `db:"status" json:"status"`
- ErrorMessage sql.NullString `db:"error_message" json:"error_message"`
- CompletedAt sql.NullInt64 `db:"completed_at" json:"completed_at"`
- StartedAt sql.NullInt64 `db:"started_at" json:"started_at"`
- OutputData []byte `db:"output_data" json:"output_data"`
- RemainingAttempts int32 `db:"remaining_attempts" json:"remaining_attempts"`
- NextRetryAt sql.NullInt64 `db:"next_retry_at" json:"next_retry_at"`
- SleepUntil sql.NullInt64 `db:"sleep_until" json:"sleep_until"`
- ID string `db:"id" json:"id"`
- Namespace string `db:"namespace" json:"namespace"`
- ResourceID string `db:"resource_id" json:"resource_id"`
- WorkerID string `db:"worker_id" json:"worker_id"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
-}
-
-func (q *Queries) UpdateWorkflowFields(ctx context.Context, db DBTX, arg UpdateWorkflowFieldsParams) error {
- _, err := db.ExecContext(ctx, updateWorkflowFields,
- arg.Status,
- arg.ErrorMessage,
- arg.CompletedAt,
- arg.StartedAt,
- arg.OutputData,
- arg.RemainingAttempts,
- arg.NextRetryAt,
- arg.SleepUntil,
- arg.ID,
- arg.Namespace,
- arg.ResourceID,
- arg.WorkerID,
- arg.ExpiresAt,
- )
- return err
-}
-
-const updateWorkflowToRunning = `-- name: UpdateWorkflowToRunning :exec
-UPDATE workflow_executions
-SET status = 'running',
- started_at = CASE WHEN started_at IS NULL THEN ? ELSE started_at END,
- sleep_until = NULL
-WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )
-`
-
-type UpdateWorkflowToRunningParams struct {
- StartedAt sql.NullInt64 `db:"started_at" json:"started_at"`
- ID string `db:"id" json:"id"`
- Namespace string `db:"namespace" json:"namespace"`
- ResourceID string `db:"resource_id" json:"resource_id"`
- WorkerID string `db:"worker_id" json:"worker_id"`
- ExpiresAt int64 `db:"expires_at" json:"expires_at"`
-}
-
-func (q *Queries) UpdateWorkflowToRunning(ctx context.Context, db DBTX, arg UpdateWorkflowToRunningParams) error {
- _, err := db.ExecContext(ctx, updateWorkflowToRunning,
- arg.StartedAt,
- arg.ID,
- arg.Namespace,
- arg.ResourceID,
- arg.WorkerID,
- arg.ExpiresAt,
- )
- return err
-}
diff --git a/go/pkg/hydra/store_coverage_test.go b/go/pkg/hydra/store_coverage_test.go
deleted file mode 100644
index e53a8181c2..0000000000
--- a/go/pkg/hydra/store_coverage_test.go
+++ /dev/null
@@ -1,184 +0,0 @@
-package hydra
-
-import (
- "context"
- "database/sql"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/uid"
-)
-
-func TestSQLCQueryCoverage(t *testing.T) {
- // Test that basic SQLC Query operations work
- engine := newTestEngine(t)
- ctx := context.Background()
- namespace := engine.GetNamespace()
-
- t.Run("WorkflowOperations", func(t *testing.T) {
- // Test CreateWorkflow using Query pattern
- workflowID := uid.New(uid.WorkflowPrefix)
- err := store.Query.CreateWorkflow(ctx, engine.GetDB(), store.CreateWorkflowParams{
- ID: workflowID,
- WorkflowName: "test-workflow",
- Status: store.WorkflowExecutionsStatusPending,
- InputData: []byte(`{"test": "data"}`),
- OutputData: []byte{},
- ErrorMessage: sql.NullString{Valid: false},
- CreatedAt: time.Now().UnixMilli(),
- StartedAt: sql.NullInt64{Valid: false},
- CompletedAt: sql.NullInt64{Valid: false},
- MaxAttempts: 3,
- RemainingAttempts: 3,
- NextRetryAt: sql.NullInt64{Valid: false},
- Namespace: namespace,
- TriggerType: store.NullWorkflowExecutionsTriggerType{Valid: false},
- TriggerSource: sql.NullString{Valid: false},
- SleepUntil: sql.NullInt64{Valid: false},
- TraceID: sql.NullString{Valid: false},
- SpanID: sql.NullString{Valid: false},
- })
- require.NoError(t, err, "CreateWorkflow should work")
-
- // Test GetWorkflow using Query pattern
- workflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: workflowID,
- Namespace: namespace,
- })
- require.NoError(t, err, "GetWorkflow should work")
- require.Equal(t, workflowID, workflow.ID)
- require.Equal(t, "test-workflow", workflow.WorkflowName)
- require.Equal(t, store.WorkflowExecutionsStatusPending, workflow.Status)
-
- // GetAllWorkflows was removed - test individual workflow retrieval instead
- retrievedWorkflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: workflowID,
- Namespace: namespace,
- })
- require.NoError(t, err, "GetWorkflow should work")
- require.Equal(t, workflowID, retrievedWorkflow.ID)
-
- // Test GetPendingWorkflows using Query pattern
- pendingWorkflows, err := store.Query.GetPendingWorkflows(ctx, engine.GetDB(), store.GetPendingWorkflowsParams{
- Namespace: namespace,
- NextRetryAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- SleepUntil: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- Limit: 10,
- })
- require.NoError(t, err, "GetPendingWorkflows should work")
- require.Len(t, pendingWorkflows, 1)
-
- // Test UpdateWorkflowFields (replacement for UpdateWorkflowStatus)
- // Note: This will fail due to lease validation, which is expected in tests
- now := time.Now().UnixMilli()
- _ = store.Query.UpdateWorkflowFields(ctx, engine.GetDB(), store.UpdateWorkflowFieldsParams{
- Status: store.WorkflowExecutionsStatusRunning,
- ErrorMessage: sql.NullString{Valid: false},
- CompletedAt: sql.NullInt64{Valid: false},
- StartedAt: sql.NullInt64{Valid: false},
- OutputData: nil,
- RemainingAttempts: 0,
- NextRetryAt: sql.NullInt64{Valid: false},
- SleepUntil: sql.NullInt64{Valid: false},
- ID: workflowID,
- Namespace: namespace,
- ResourceID: workflowID,
- WorkerID: "test-worker",
- ExpiresAt: now,
- })
- // Ignore error due to missing lease
-
- // Test CompleteWorkflow using restored simple query
- err = store.Query.CompleteWorkflow(ctx, engine.GetDB(), store.CompleteWorkflowParams{
- CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- OutputData: []byte(`{"result": "success"}`),
- ID: workflowID,
- Namespace: namespace,
- })
- require.NoError(t, err, "CompleteWorkflow should work")
-
- // Verify final state
- finalWorkflow, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: workflowID,
- Namespace: namespace,
- })
- require.NoError(t, err)
- require.Equal(t, store.WorkflowExecutionsStatusCompleted, finalWorkflow.Status)
- })
-
- t.Run("StepOperations", func(t *testing.T) {
- // Create a workflow first
- workflowID := uid.New(uid.WorkflowPrefix)
- err := store.Query.CreateWorkflow(ctx, engine.GetDB(), store.CreateWorkflowParams{
- ID: workflowID,
- WorkflowName: "test-workflow-with-steps",
- Status: store.WorkflowExecutionsStatusRunning,
- InputData: []byte(`{"test": "data"}`),
- OutputData: []byte{},
- ErrorMessage: sql.NullString{Valid: false},
- CreatedAt: time.Now().UnixMilli(),
- StartedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- CompletedAt: sql.NullInt64{Valid: false},
- MaxAttempts: 3,
- RemainingAttempts: 3,
- NextRetryAt: sql.NullInt64{Valid: false},
- Namespace: namespace,
- TriggerType: store.NullWorkflowExecutionsTriggerType{Valid: false},
- TriggerSource: sql.NullString{Valid: false},
- SleepUntil: sql.NullInt64{Valid: false},
- TraceID: sql.NullString{Valid: false},
- SpanID: sql.NullString{Valid: false},
- })
- require.NoError(t, err)
-
- // Test CreateStep using Query pattern
- stepID := uid.New(uid.StepPrefix)
- err = store.Query.CreateStep(ctx, engine.GetDB(), store.CreateStepParams{
- ID: stepID,
- ExecutionID: workflowID,
- StepName: "test-step",
- Status: store.WorkflowStepsStatusRunning,
- OutputData: []byte{},
- ErrorMessage: sql.NullString{Valid: false},
- StartedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- CompletedAt: sql.NullInt64{Valid: false},
- MaxAttempts: 3,
- RemainingAttempts: 3,
- Namespace: namespace,
- })
- require.NoError(t, err, "CreateStep should work")
-
- // Test GetStep using Query pattern
- step, err := store.Query.GetStep(ctx, engine.GetDB(), store.GetStepParams{
- Namespace: namespace,
- ExecutionID: workflowID,
- StepName: "test-step",
- })
- require.NoError(t, err, "GetStep should work")
- require.Equal(t, stepID, step.ID)
- require.Equal(t, "test-step", step.StepName)
-
- // Test UpdateStepStatus using restored simple query
- err = store.Query.UpdateStepStatus(ctx, engine.GetDB(), store.UpdateStepStatusParams{
- Status: store.WorkflowStepsStatusCompleted,
- CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- OutputData: []byte(`{"step_result": "success"}`),
- ErrorMessage: sql.NullString{Valid: false},
- Namespace: namespace,
- ExecutionID: workflowID,
- StepName: "test-step",
- })
- require.NoError(t, err, "UpdateStepStatus should work")
-
- // GetAllSteps was removed - test individual step retrieval instead
- retrievedStep, err := store.Query.GetStep(ctx, engine.GetDB(), store.GetStepParams{
- Namespace: namespace,
- ExecutionID: workflowID,
- StepName: "test-step",
- })
- require.NoError(t, err, "GetStep should work")
- require.Equal(t, "test-step", retrievedStep.StepName)
- })
-}
diff --git a/go/pkg/hydra/test_helpers.go b/go/pkg/hydra/test_helpers.go
deleted file mode 100644
index 183fec0f89..0000000000
--- a/go/pkg/hydra/test_helpers.go
+++ /dev/null
@@ -1,76 +0,0 @@
-package hydra
-
-import (
- "context"
- "database/sql"
- "fmt"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/otel/logging"
- "github.com/unkeyed/unkey/go/pkg/testutil/containers"
- "github.com/unkeyed/unkey/go/pkg/uid"
-)
-
-// newTestEngineWithClock creates a test engine with the specified clock
-func newTestEngineWithClock(t *testing.T, clk clock.Clock) *Engine {
- t.Helper()
-
- // Use testcontainers for MySQL
- mysqlCfg := containers.MySQL(t)
- mysqlCfg.DBName = "hydra"
- hydraDsn := mysqlCfg.FormatDSN()
-
- // Load the hydra schema into the database
- db, err := sql.Open("mysql", hydraDsn)
- require.NoError(t, err)
- defer db.Close()
-
- // Create a unique namespace for this test to avoid data pollution
- testNamespace := fmt.Sprintf("test_%s_%s", t.Name(), uid.New(uid.Prefix("test")))
-
- // Create the engine with the properly configured database
- engine, err := New(Config{
- DSN: hydraDsn,
- Namespace: testNamespace,
- Clock: clk,
- Logger: logging.NewNoop(),
- Marshaller: NewJSONMarshaller(),
- })
- if err != nil {
- t.Fatalf("Failed to create test engine: %v", err)
- }
-
- return engine
-}
-
-// newTestEngine creates a test engine with default clock
-func newTestEngine(t *testing.T) *Engine {
- return newTestEngineWithClock(t, clock.New())
-}
-
-// waitForWorkflowCompletion waits for a workflow to complete and returns the final workflow state
-func waitForWorkflowCompletion(t *testing.T, engine *Engine, workflowID string, timeout time.Duration) *store.WorkflowExecution {
- t.Helper()
-
- var workflow store.WorkflowExecution
- var err error
-
- require.Eventually(t, func() bool {
- workflow, err = store.Query.GetWorkflow(context.Background(), engine.GetDB(), store.GetWorkflowParams{
- ID: workflowID,
- Namespace: engine.GetNamespace(),
- })
- if err != nil {
- return false
- }
- return workflow.Status == store.WorkflowExecutionsStatusCompleted ||
- workflow.Status == store.WorkflowExecutionsStatusFailed
- }, timeout, 100*time.Millisecond, "Workflow should complete within timeout")
-
- require.NoError(t, err)
- return &workflow
-}
diff --git a/go/pkg/hydra/testharness/events.go b/go/pkg/hydra/testharness/events.go
deleted file mode 100644
index 922286d6af..0000000000
--- a/go/pkg/hydra/testharness/events.go
+++ /dev/null
@@ -1,179 +0,0 @@
-package testharness
-
-import (
- "sync"
- "time"
-)
-
-// WorkflowContext interface for extracting metadata (avoid import cycle)
-type WorkflowContext interface {
- ExecutionID() string
- WorkflowName() string
-}
-
-// EventType represents the type of event that occurred
-type EventType string
-
-const (
- WorkflowStarted EventType = "workflow_started"
- WorkflowCompleted EventType = "workflow_completed"
- WorkflowFailed EventType = "workflow_failed"
- StepExecuting EventType = "step_executing"
- StepExecuted EventType = "step_executed"
- StepFailed EventType = "step_failed"
-)
-
-// EventRecord represents something that happened during test execution
-type EventRecord struct {
- Type EventType `json:"type"`
- Message string `json:"message"`
- Timestamp time.Time `json:"timestamp"`
- Data map[string]interface{} `json:"data"`
-}
-
-// EventCollector captures events during test execution
-type EventCollector struct {
- mu sync.RWMutex
- events []EventRecord
-}
-
-// NewEventCollector creates a new event collector
-func NewEventCollector() *EventCollector {
- return &EventCollector{
- mu: sync.RWMutex{},
- events: make([]EventRecord, 0),
- }
-}
-
-// Emit records an event with workflow context metadata automatically included
-func (e *EventCollector) Emit(ctx WorkflowContext, eventType EventType, message string, extraData ...interface{}) {
- e.mu.Lock()
- defer e.mu.Unlock()
-
- // Start with context metadata
- data := map[string]interface{}{
- "execution_id": ctx.ExecutionID(),
- "workflow_name": ctx.WorkflowName(),
- }
-
- // Add extra data as key-value pairs
- for i := 0; i < len(extraData); i += 2 {
- if i+1 < len(extraData) {
- if key, ok := extraData[i].(string); ok {
- data[key] = extraData[i+1]
- }
- }
- }
-
- event := EventRecord{
- Type: eventType,
- Message: message,
- Timestamp: time.Now(),
- Data: data,
- }
-
- e.events = append(e.events, event)
-}
-
-// Events returns all collected events
-func (e *EventCollector) Events() []EventRecord {
- e.mu.RLock()
- defer e.mu.RUnlock()
-
- // Return a copy to prevent race conditions
- events := make([]EventRecord, len(e.events))
- copy(events, e.events)
- return events
-}
-
-// Filter returns events that match the given criteria
-func (e *EventCollector) Filter(eventType EventType) []EventRecord {
- e.mu.RLock()
- defer e.mu.RUnlock()
-
- var filtered []EventRecord
- for _, event := range e.events {
- if event.Type == eventType {
- filtered = append(filtered, event)
- }
- }
- return filtered
-}
-
-// FilterWithData returns events that match the type and have specific data values
-func (e *EventCollector) FilterWithData(eventType EventType, key string, value interface{}) []EventRecord {
- e.mu.RLock()
- defer e.mu.RUnlock()
-
- var filtered []EventRecord
- for _, event := range e.events {
- if event.Type == eventType {
- if eventValue, exists := event.Data[key]; exists && eventValue == value {
- filtered = append(filtered, event)
- }
- }
- }
- return filtered
-}
-
-// Count returns the number of events of a specific type
-func (e *EventCollector) Count(eventType EventType) int {
- return len(e.Filter(eventType))
-}
-
-// CountWithData returns the number of events that match type and data criteria
-func (e *EventCollector) CountWithData(eventType EventType, key string, value interface{}) int {
- return len(e.FilterWithData(eventType, key, value))
-}
-
-// Clear removes all collected events
-func (e *EventCollector) Clear() {
- e.mu.Lock()
- defer e.mu.Unlock()
- e.events = e.events[:0]
-}
-
-// GetLatest returns the most recent event of a given type, or nil if none found
-func (e *EventCollector) GetLatest(eventType EventType) *EventRecord {
- events := e.Filter(eventType)
- if len(events) == 0 {
- return nil
- }
- return &events[len(events)-1]
-}
-
-// GetFirst returns the first event of a given type, or nil if none found
-func (e *EventCollector) GetFirst(eventType EventType) *EventRecord {
- events := e.Filter(eventType)
- if len(events) == 0 {
- return nil
- }
- return &events[0]
-}
-
-// EventsBetween returns events that occurred between start and end times (inclusive)
-func (e *EventCollector) EventsBetween(start, end time.Time) []EventRecord {
- e.mu.RLock()
- defer e.mu.RUnlock()
-
- var filtered []EventRecord
- for _, event := range e.events {
- if (event.Timestamp.Equal(start) || event.Timestamp.After(start)) &&
- (event.Timestamp.Equal(end) || event.Timestamp.Before(end)) {
- filtered = append(filtered, event)
- }
- }
- return filtered
-}
-
-// Summary returns a summary of all event types and their counts
-func (e *EventCollector) Summary() map[string]int {
- e.mu.RLock()
- defer e.mu.RUnlock()
-
- summary := make(map[string]int)
- for _, event := range e.events {
- summary[string(event.Type)]++
- }
- return summary
-}
diff --git a/go/pkg/hydra/worker.go b/go/pkg/hydra/worker.go
deleted file mode 100644
index b0650d446d..0000000000
--- a/go/pkg/hydra/worker.go
+++ /dev/null
@@ -1,1003 +0,0 @@
-package hydra
-
-import (
- "context"
- "database/sql"
- "fmt"
- "strconv"
- "sync"
- "time"
-
- "github.com/unkeyed/unkey/go/pkg/circuitbreaker"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/db"
- "github.com/unkeyed/unkey/go/pkg/hydra/metrics"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/otel/tracing"
- "github.com/unkeyed/unkey/go/pkg/uid"
- "go.opentelemetry.io/otel/attribute"
- "go.opentelemetry.io/otel/trace"
-)
-
-// Worker represents a workflow worker that can start, run, and shutdown.
-//
-// Workers are responsible for:
-// - Polling the database for pending workflows
-// - Acquiring exclusive leases on workflows to prevent duplicate execution
-// - Executing workflow logic by calling registered workflow handlers
-// - Sending periodic heartbeats to maintain lease ownership
-// - Processing scheduled cron jobs
-// - Recording metrics for observability
-//
-// Workers are designed to be run as long-lived processes and can safely
-// handle network failures, database outages, and graceful shutdowns.
-type Worker interface {
- // Start begins the worker's main execution loop.
- // This method blocks until the context is cancelled or an error occurs.
- Start(ctx context.Context) error
-
- // Shutdown gracefully stops the worker and waits for active workflows to complete.
- // This method should be called during application shutdown to ensure clean termination.
- Shutdown(ctx context.Context) error
-}
-
-// WorkerConfig holds the configuration for a worker instance.
-//
-// All fields are optional and will use sensible defaults if not specified.
-type WorkerConfig struct {
- // WorkerID uniquely identifies this worker instance.
- // If not provided, a random ID will be generated.
- WorkerID string
-
- // Concurrency controls how many workflows can execute simultaneously.
- // Defaults to 10 if not specified.
- Concurrency int
-
- // PollInterval controls how frequently the worker checks for new workflows.
- // Shorter intervals provide lower latency but increase database load.
- // Defaults to 5 seconds if not specified.
- PollInterval time.Duration
-
- // HeartbeatInterval controls how frequently the worker sends lease heartbeats.
- // This should be significantly shorter than ClaimTimeout to prevent lease expiration.
- // Defaults to 30 seconds if not specified.
- HeartbeatInterval time.Duration
-
- // ClaimTimeout controls how long a worker can hold a workflow lease.
- // Expired leases are automatically released, allowing other workers to take over.
- // Defaults to 5 minutes if not specified.
- ClaimTimeout time.Duration
-
- // CronInterval controls how frequently the worker checks for due cron jobs.
- // Defaults to 1 minute if not specified.
- CronInterval time.Duration
-}
-
-type worker struct {
- engine *Engine
- config WorkerConfig
- workflows map[string]Workflow[any]
- clock clock.Clock
- shutdownC chan struct{}
- doneC chan struct{}
- wg sync.WaitGroup
- activeLeases map[string]bool // Track workflow IDs we have leases for
- activeLeasesM sync.RWMutex // Protect the activeLeases map
- queryCircuitBreaker circuitbreaker.CircuitBreaker[[]store.WorkflowExecution] // Protect query operations
- leaseCircuitBreaker circuitbreaker.CircuitBreaker[any] // Protect lease operations
- workflowQueue chan store.WorkflowExecution // Queue of workflows to process
-}
-
-// NewWorker creates a new worker instance with the provided configuration.
-//
-// The worker will be associated with the given engine and inherit its
-// namespace and storage configuration. Missing configuration values
-// will be populated with sensible defaults.
-//
-// The worker must have workflows registered using RegisterWorkflow()
-// before calling Start().
-//
-// Example:
-//
-// worker, err := hydra.NewWorker(engine, hydra.WorkerConfig{
-// WorkerID: "worker-1",
-// Concurrency: 20,
-// PollInterval: 100 * time.Millisecond,
-// HeartbeatInterval: 30 * time.Second,
-// ClaimTimeout: 5 * time.Minute,
-// })
-// if err != nil {
-// return err
-// }
-//
-// The worker includes built-in circuit breakers to protect against
-// database overload and automatic retry logic for transient failures.
-func NewWorker(e *Engine, config WorkerConfig) (Worker, error) {
- if config.WorkerID == "" {
- config.WorkerID = uid.New(uid.WorkerPrefix)
- }
- if config.Concurrency <= 0 {
- config.Concurrency = 10
- }
- if config.PollInterval <= 0 {
- config.PollInterval = 5 * time.Second
- }
- if config.HeartbeatInterval <= 0 {
- config.HeartbeatInterval = 30 * time.Second
- }
- if config.ClaimTimeout <= 0 {
- config.ClaimTimeout = 5 * time.Minute
- }
- if config.CronInterval <= 0 {
- config.CronInterval = 1 * time.Minute
- }
-
- // Initialize circuit breakers for different database operations
- queryCircuitBreaker := circuitbreaker.New[[]store.WorkflowExecution]("hydra-query")
- leaseCircuitBreaker := circuitbreaker.New[any]("hydra-lease")
-
- // Create workflow queue with capacity based on concurrency
- queueSize := config.Concurrency * 10
- if queueSize < 50 {
- queueSize = 50 // Minimum queue size
- }
-
- worker := &worker{
- engine: e,
- config: config,
- workflows: make(map[string]Workflow[any]),
- clock: e.clock,
- shutdownC: make(chan struct{}),
- doneC: make(chan struct{}),
- wg: sync.WaitGroup{},
- activeLeases: make(map[string]bool),
- activeLeasesM: sync.RWMutex{},
- queryCircuitBreaker: queryCircuitBreaker,
- leaseCircuitBreaker: leaseCircuitBreaker,
- workflowQueue: make(chan store.WorkflowExecution, queueSize),
- }
-
- return worker, nil
-}
-
-func (w *worker) run(ctx context.Context) {
- defer close(w.doneC)
-
- // Start workflow processors
- for i := 0; i < w.config.Concurrency; i++ {
- w.wg.Add(1)
- go w.processWorkflows(ctx)
- }
-
- w.wg.Add(4)
- go w.pollForWorkflows(ctx)
- go w.sendHeartbeats(ctx)
- go w.cleanupExpiredLeases(ctx)
- go w.processCronJobs(ctx)
-
- select {
- case <-w.shutdownC:
- case <-ctx.Done():
- }
-
- // Don't close the queue immediately - let processors drain it first
- w.wg.Wait()
-}
-
-func (w *worker) pollForWorkflows(ctx context.Context) {
- defer w.wg.Done()
-
- ticker := w.clock.NewTicker(w.config.PollInterval)
- defer ticker.Stop()
- tickerC := ticker.C()
-
- for {
- select {
- case <-tickerC:
- w.pollOnce(ctx)
-
- case <-w.shutdownC:
- return
-
- case <-ctx.Done():
- return
- }
- }
-}
-
-func (w *worker) pollOnce(ctx context.Context) {
- workflowNames := make([]string, 0, len(w.workflows))
- for name := range w.workflows {
- workflowNames = append(workflowNames, name)
- }
-
- // Use a more conservative fetch limit to reduce contention
- fetchLimit := w.config.Concurrency * 2 // Fetch less to reduce contention
- if fetchLimit < 10 {
- fetchLimit = 10 // Minimum fetch size
- }
- if fetchLimit > 1000 {
- fetchLimit = 1000 // Maximum reasonable fetch size
- }
-
- // Convert to int32 safely for gosec - using string conversion to avoid overflow warning
- fetchLimit32, _ := strconv.ParseInt(strconv.Itoa(fetchLimit), 10, 32)
-
- workflows, err := w.queryCircuitBreaker.Do(ctx, func(ctx context.Context) ([]store.WorkflowExecution, error) {
- // Use new Query pattern
- now := time.Now().UnixMilli()
- var workflows []store.WorkflowExecution
- var err error
-
- if len(workflowNames) > 0 {
- // Use filtered query - for now just use the first workflow name
- // Multiple workflow names support requires SQLC query enhancement
- workflows, err = store.Query.GetPendingWorkflowsFiltered(ctx, w.engine.GetDB(), store.GetPendingWorkflowsFilteredParams{
- Namespace: w.engine.namespace,
- NextRetryAt: sql.NullInt64{Int64: now, Valid: true},
- SleepUntil: sql.NullInt64{Int64: now, Valid: true},
- WorkflowName: workflowNames[0],
- Limit: int32(fetchLimit32), //nolint:gosec // G115: fetchLimit is bounded to [10, 1000]
- })
- } else {
- workflows, err = store.Query.GetPendingWorkflows(ctx, w.engine.GetDB(), store.GetPendingWorkflowsParams{
- Namespace: w.engine.namespace,
- NextRetryAt: sql.NullInt64{Int64: now, Valid: true},
- SleepUntil: sql.NullInt64{Int64: now, Valid: true},
- Limit: int32(fetchLimit32), //nolint:gosec // G115: fetchLimit is bounded to [10, 1000]
- })
- }
-
- if err != nil {
- return nil, err
- }
-
- // Return store types directly (no conversion needed)
- return workflows, nil
- })
-
- // Record polling metrics
- if err != nil {
- metrics.WorkerPollsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, "error").Inc()
- return
- }
-
- // Record successful poll with found work status
- status := "no_work"
- if len(workflows) > 0 {
- status = "found_work"
- }
- metrics.WorkerPollsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, status).Inc()
-
- // Queue workflows - let polling goroutine block if needed
- for _, workflow := range workflows {
- w.workflowQueue <- workflow
- }
-}
-
-func (w *worker) processWorkflows(ctx context.Context) {
- defer w.wg.Done()
-
- for {
- select {
- case workflow := <-w.workflowQueue:
- // Try to acquire lease using new Query pattern with transaction
- err := w.acquireWorkflowLease(ctx, workflow.ID, w.config.WorkerID)
- if err != nil {
- // Another worker got it or error, skip this workflow
- metrics.LeaseAcquisitionsTotal.WithLabelValues(w.config.WorkerID, "workflow", "failed").Inc()
- continue
- }
-
- // Record successful lease acquisition
- metrics.LeaseAcquisitionsTotal.WithLabelValues(w.config.WorkerID, "workflow", "success").Inc()
-
- // Track this lease for heartbeats
- w.addActiveLease(workflow.ID)
-
- // Update active workflows gauge
- metrics.WorkflowsActive.WithLabelValues(w.engine.namespace, w.config.WorkerID).Inc()
-
- // Execute the workflow
- w.executeWorkflow(ctx, &workflow)
-
- // Release the lease and stop tracking it
- // Use new Query pattern
- if err := store.Query.ReleaseLease(ctx, w.engine.GetDB(), store.ReleaseLeaseParams{
- ResourceID: workflow.ID,
- WorkerID: w.config.WorkerID,
- }); err != nil {
- w.engine.logger.Error("Failed to release workflow lease",
- "workflow_id", workflow.ID,
- "worker_id", w.config.WorkerID,
- "error", err.Error(),
- )
- }
- w.removeActiveLease(workflow.ID)
-
- // Update active workflows gauge
- metrics.WorkflowsActive.WithLabelValues(w.engine.namespace, w.config.WorkerID).Dec()
-
- case <-w.shutdownC:
- return
- case <-ctx.Done():
- return
- }
- }
-}
-
-func (w *worker) executeWorkflow(ctx context.Context, e *store.WorkflowExecution) {
- startTime := w.clock.Now()
-
- // Start tracing span for workflow execution
- var span trace.Span
-
- if e.TraceID.Valid && e.SpanID.Valid && e.TraceID.String != "" && e.SpanID.String != "" {
- // Reconstruct the exact trace context from stored trace ID and span ID
- traceID, traceErr := trace.TraceIDFromHex(e.TraceID.String)
- spanID, spanErr := trace.SpanIDFromHex(e.SpanID.String)
-
- if traceErr == nil && spanErr == nil {
- // Create the exact span context from the original workflow creation
- originalSpanCtx := trace.NewSpanContext(trace.SpanContextConfig{
- TraceID: traceID,
- SpanID: spanID,
- TraceFlags: trace.FlagsSampled,
- TraceState: trace.TraceState{},
- Remote: false,
- })
-
- // Set this context as the parent for the execution span
- ctx = trace.ContextWithSpanContext(ctx, originalSpanCtx)
- }
- }
-
- ctx, span = tracing.Start(ctx, fmt.Sprintf("hydra.worker.executeWorkflow.%s", e.WorkflowName))
- defer span.End()
-
- spanAttributes := []attribute.KeyValue{
- attribute.String("hydra.workflow.name", e.WorkflowName),
- attribute.String("hydra.execution.id", e.ID),
- attribute.String("hydra.namespace", e.Namespace),
- attribute.String("hydra.worker.id", w.config.WorkerID),
- }
-
- if e.TraceID.Valid && e.TraceID.String != "" {
- spanAttributes = append(spanAttributes, attribute.String("hydra.original_trace_id", e.TraceID.String))
- }
- if e.SpanID.Valid && e.SpanID.String != "" {
- spanAttributes = append(spanAttributes, attribute.String("hydra.original_span_id", e.SpanID.String))
- }
-
- span.SetAttributes(spanAttributes...)
-
- // Calculate queue time (time from creation to execution start)
- queueTime := time.Duration(startTime.UnixMilli()-e.CreatedAt) * time.Millisecond
- metrics.WorkflowQueueTimeSeconds.WithLabelValues(e.Namespace, e.WorkflowName).Observe(queueTime.Seconds())
-
- // Update workflow to running status with lease validation
- now := time.Now().UnixMilli()
- err := store.Query.UpdateWorkflowToRunning(ctx, w.engine.GetDB(), store.UpdateWorkflowToRunningParams{
- StartedAt: sql.NullInt64{Int64: startTime.UnixMilli(), Valid: true},
- ID: e.ID,
- Namespace: e.Namespace,
- ResourceID: e.ID,
- WorkerID: w.config.WorkerID,
- ExpiresAt: now,
- })
- if err != nil {
- metrics.RecordError(e.Namespace, "worker", "status_update_failed")
- tracing.RecordError(span, err)
- span.SetAttributes(attribute.String("hydra.workflow.status", "failed"))
- return
- }
-
- wf, exists := w.workflows[e.WorkflowName]
- if !exists {
- noHandlerErr := fmt.Errorf("no handler registered for workflow %s", e.WorkflowName)
- tracing.RecordError(span, noHandlerErr)
- span.SetAttributes(attribute.String("hydra.workflow.status", "failed"))
-
- // Use lease-validated failure to ensure correctness
- failureTime := w.clock.Now().UnixMilli()
- result, failErr := w.engine.GetDB().ExecContext(ctx, `
- UPDATE workflow_executions
- SET status = 'failed', error_message = ?, remaining_attempts = remaining_attempts - 1, completed_at = ?, next_retry_at = NULL
- WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )`,
- sql.NullString{String: noHandlerErr.Error(), Valid: true},
- sql.NullInt64{Int64: failureTime, Valid: true},
- e.ID,
- e.Namespace,
- e.ID, // resource_id for lease check
- w.config.WorkerID, // worker_id for lease check
- failureTime, // expires_at check
- )
- if failErr != nil {
- w.engine.logger.Error("Failed to mark workflow as failed",
- "workflow_id", e.ID,
- "workflow_name", e.WorkflowName,
- "namespace", e.Namespace,
- "error", failErr.Error(),
- )
- } else {
- // Check if the failure actually happened (lease validation)
- if rowsAffected, checkErr := result.RowsAffected(); checkErr != nil {
- w.engine.logger.Error("Failed to check workflow failure result",
- "workflow_id", e.ID,
- "error", checkErr.Error(),
- )
- } else if rowsAffected == 0 {
- w.engine.logger.Warn("Workflow failure failed: lease expired or invalid",
- "workflow_id", e.ID,
- "worker_id", w.config.WorkerID,
- )
- }
- }
- metrics.ObserveWorkflowDuration(e.Namespace, e.WorkflowName, "failed", startTime)
- metrics.WorkflowsCompletedTotal.WithLabelValues(e.Namespace, e.WorkflowName, "failed").Inc()
- metrics.RecordError(e.Namespace, "worker", "no_handler_registered")
- return
- }
-
- payload := &RawPayload{Data: e.InputData}
-
- wctx := &workflowContext{
- ctx: ctx, // This is the traced context from the worker span
- executionID: e.ID,
- workflowName: e.WorkflowName,
- namespace: e.Namespace,
- workerID: w.config.WorkerID,
- db: w.engine.GetDB(),
- marshaller: w.engine.marshaller,
- logger: w.engine.logger.With("execution_id", e.ID, "namespace", e.Namespace, "workflow_name", e.WorkflowName),
- stepTimeout: 5 * time.Minute, // Default step timeout
- stepMaxAttempts: 3, // Default step max attempts
- }
-
- err = wf.Run(wctx, payload)
-
- if err != nil {
- tracing.RecordError(span, err)
-
- if suspendErr, ok := err.(*WorkflowSuspendedError); ok {
- span.SetAttributes(attribute.String("hydra.workflow.status", "suspended"))
-
- // Use simple sleep workflow since we have the lease
- if sleepErr := store.Query.SleepWorkflow(ctx, w.engine.GetDB(), store.SleepWorkflowParams{
- SleepUntil: sql.NullInt64{Int64: suspendErr.ResumeTime, Valid: true},
- ID: e.ID,
- Namespace: e.Namespace,
- }); sleepErr != nil {
- w.engine.logger.Error("Failed to suspend workflow",
- "workflow_id", e.ID,
- "workflow_name", e.WorkflowName,
- "namespace", e.Namespace,
- "resume_time", suspendErr.ResumeTime,
- "error", sleepErr.Error(),
- )
- }
- metrics.SleepsStartedTotal.WithLabelValues(e.Namespace, e.WorkflowName).Inc()
- return
- }
-
- isFinal := e.RemainingAttempts <= 1
- span.SetAttributes(attribute.String("hydra.workflow.status", "failed"))
-
- // Use lease-validated failure to ensure correctness
- finalFailureTime := w.clock.Now().UnixMilli()
- var result sql.Result
- var failErr error
-
- if isFinal {
- // Final failure - no more retries
- result, failErr = w.engine.GetDB().ExecContext(ctx, `
- UPDATE workflow_executions
- SET status = 'failed', error_message = ?, remaining_attempts = remaining_attempts - 1, completed_at = ?, next_retry_at = NULL
- WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )`,
- sql.NullString{String: err.Error(), Valid: true},
- sql.NullInt64{Int64: finalFailureTime, Valid: true},
- e.ID,
- e.Namespace,
- e.ID, // resource_id for lease check
- w.config.WorkerID, // worker_id for lease check
- finalFailureTime, // expires_at check
- )
- } else {
- // Failure with retry - calculate next retry time
- nextRetryAt := w.clock.Now().Add(time.Duration(e.MaxAttempts-e.RemainingAttempts+1) * time.Second).UnixMilli()
- result, failErr = w.engine.GetDB().ExecContext(ctx, `
- UPDATE workflow_executions
- SET status = 'failed', error_message = ?, remaining_attempts = remaining_attempts - 1, next_retry_at = ?
- WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )`,
- sql.NullString{String: err.Error(), Valid: true},
- sql.NullInt64{Int64: nextRetryAt, Valid: true},
- e.ID,
- e.Namespace,
- e.ID, // resource_id for lease check
- w.config.WorkerID, // worker_id for lease check
- finalFailureTime, // expires_at check
- )
- }
- if failErr != nil {
- w.engine.logger.Error("Failed to mark workflow as failed",
- "workflow_id", e.ID,
- "workflow_name", e.WorkflowName,
- "namespace", e.Namespace,
- "is_final", isFinal,
- "original_error", err.Error(),
- "fail_error", failErr.Error(),
- )
- } else {
- // Check if the failure actually happened (lease validation)
- if rowsAffected, checkErr := result.RowsAffected(); checkErr != nil {
- w.engine.logger.Error("Failed to check workflow failure result",
- "workflow_id", e.ID,
- "error", checkErr.Error(),
- )
- } else if rowsAffected == 0 {
- w.engine.logger.Warn("Workflow failure failed: lease expired or invalid",
- "workflow_id", e.ID,
- "worker_id", w.config.WorkerID,
- "is_final", isFinal,
- )
- }
- }
-
- if !isFinal {
- metrics.WorkflowsRetriedTotal.WithLabelValues(e.Namespace, e.WorkflowName, fmt.Sprintf("%d", e.MaxAttempts-e.RemainingAttempts+1)).Inc()
- }
-
- metrics.ObserveWorkflowDuration(e.Namespace, e.WorkflowName, "failed", startTime)
- metrics.WorkflowsCompletedTotal.WithLabelValues(e.Namespace, e.WorkflowName, "failed").Inc()
- return
- }
-
- span.SetAttributes(attribute.String("hydra.workflow.status", "completed"))
-
- // Use lease-validated completion to ensure correctness
- now = w.clock.Now().UnixMilli()
- result, err := w.engine.GetDB().ExecContext(ctx, `
- UPDATE workflow_executions
- SET status = 'completed', completed_at = ?, output_data = ?
- WHERE id = ? AND workflow_executions.namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'workflow'
- AND worker_id = ? AND expires_at > ?
- )`,
- sql.NullInt64{Int64: now, Valid: true},
- []byte{}, // No output data for now
- e.ID,
- e.Namespace,
- e.ID, // resource_id for lease check
- w.config.WorkerID, // worker_id for lease check
- now, // expires_at check
- )
- if err != nil {
- tracing.RecordError(span, err)
- w.engine.logger.Error("Failed to mark workflow as completed",
- "workflow_id", e.ID,
- "workflow_name", e.WorkflowName,
- "namespace", e.Namespace,
- "error", err.Error(),
- )
- return
- }
-
- // Check if the completion actually happened (lease validation)
- rowsAffected, checkErr := result.RowsAffected()
- if checkErr != nil {
- w.engine.logger.Error("Failed to check workflow completion result",
- "workflow_id", e.ID,
- "error", checkErr.Error(),
- )
- return
- }
- if rowsAffected == 0 {
- w.engine.logger.Warn("Workflow completion failed: lease expired or invalid",
- "workflow_id", e.ID,
- "worker_id", w.config.WorkerID,
- )
- return
- }
-
- metrics.ObserveWorkflowDuration(e.Namespace, e.WorkflowName, "completed", startTime)
- metrics.WorkflowsCompletedTotal.WithLabelValues(e.Namespace, e.WorkflowName, "completed").Inc()
-}
-
-func (w *worker) sendHeartbeats(ctx context.Context) {
- defer w.wg.Done()
-
- ticker := w.clock.NewTicker(w.config.HeartbeatInterval)
- defer ticker.Stop()
- tickerC := ticker.C()
-
- for {
- select {
- case <-tickerC:
- w.sendHeartbeatsForActiveLeases(ctx)
-
- case <-w.shutdownC:
- return
- case <-ctx.Done():
- return
- }
- }
-}
-
-// addActiveLease tracks a workflow lease for heartbeat sending
-func (w *worker) addActiveLease(workflowID string) {
- w.activeLeasesM.Lock()
- defer w.activeLeasesM.Unlock()
- w.activeLeases[workflowID] = true
-}
-
-// removeActiveLease stops tracking a workflow lease
-func (w *worker) removeActiveLease(workflowID string) {
- w.activeLeasesM.Lock()
- defer w.activeLeasesM.Unlock()
- delete(w.activeLeases, workflowID)
-}
-
-// sendHeartbeatsForActiveLeases sends heartbeats for all workflows this worker has leases for
-func (w *worker) sendHeartbeatsForActiveLeases(ctx context.Context) {
- w.activeLeasesM.RLock()
- // Copy the map to avoid holding the lock while sending heartbeats
- leaseIDs := make([]string, 0, len(w.activeLeases))
- for workflowID := range w.activeLeases {
- leaseIDs = append(leaseIDs, workflowID)
- }
- w.activeLeasesM.RUnlock()
-
- // Send heartbeats for each active lease
- now := w.clock.Now().UnixMilli()
- newExpiresAt := now + w.config.ClaimTimeout.Milliseconds()
-
- for _, workflowID := range leaseIDs {
- // Protect heartbeat with circuit breaker
- _, err := w.leaseCircuitBreaker.Do(ctx, func(ctx context.Context) (any, error) {
- // Use new Query pattern
- return nil, store.Query.HeartbeatLease(ctx, w.engine.GetDB(), store.HeartbeatLeaseParams{
- HeartbeatAt: now,
- ExpiresAt: newExpiresAt,
- ResourceID: workflowID,
- WorkerID: w.config.WorkerID,
- })
- })
- if err != nil {
- // Record failed heartbeat
- metrics.WorkerHeartbeatsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, "failed").Inc()
- continue
- }
-
- // Record successful heartbeat
- metrics.WorkerHeartbeatsTotal.WithLabelValues(w.config.WorkerID, w.engine.namespace, "success").Inc()
- }
-}
-
-func (w *worker) cleanupExpiredLeases(ctx context.Context) {
- defer w.wg.Done()
-
- ticker := w.clock.NewTicker(w.config.HeartbeatInterval * 2) // Clean up less frequently than heartbeats
- defer ticker.Stop()
- tickerC := ticker.C()
-
- for {
- select {
- case <-tickerC:
- // Clean up expired leases first
- now := w.clock.Now().UnixMilli()
- err := store.Query.CleanupExpiredLeases(ctx, w.engine.GetDB(), store.CleanupExpiredLeasesParams{
- Namespace: w.engine.namespace,
- ExpiresAt: now,
- })
- if err != nil {
- w.engine.logger.Warn("Failed to cleanup expired leases", "error", err.Error())
- }
-
- // Then reset orphaned workflows back to pending so they can be picked up again
- err = store.Query.ResetOrphanedWorkflows(ctx, w.engine.GetDB(), store.ResetOrphanedWorkflowsParams{
- Namespace: w.engine.namespace,
- Namespace_2: w.engine.namespace,
- })
- if err != nil {
- w.engine.logger.Warn("Failed to reset orphaned workflows", "error", err.Error())
- }
-
- case <-w.shutdownC:
- return
- case <-ctx.Done():
- return
- }
- }
-}
-
-func (w *worker) processCronJobs(ctx context.Context) {
- defer w.wg.Done()
-
- ticker := w.clock.NewTicker(w.config.CronInterval)
- defer ticker.Stop()
- tickerC := ticker.C()
-
- for {
- select {
- case <-tickerC:
- w.processDueCronJobs(ctx)
-
- case <-w.shutdownC:
- return
- case <-ctx.Done():
- return
- }
- }
-}
-
-func (w *worker) processDueCronJobs(ctx context.Context) {
- now := w.engine.clock.Now().UnixMilli()
-
- dueCrons, err := store.Query.GetDueCronJobs(ctx, w.engine.GetDB(), store.GetDueCronJobsParams{
- Namespace: w.engine.namespace,
- NextRunAt: now,
- })
- if err != nil {
- return
- }
-
- if len(dueCrons) == 0 {
- return
- }
-
- for _, cronJob := range dueCrons {
- var canHandle bool
- if cronJob.WorkflowName.Valid && cronJob.WorkflowName.String != "" {
- _, canHandle = w.workflows[cronJob.WorkflowName.String]
- } else {
- _, canHandle = w.engine.cronHandlers[cronJob.Name]
- }
-
- if !canHandle {
- continue
- }
-
- err := store.Query.CreateLease(ctx, w.engine.GetDB(), store.CreateLeaseParams{
- ResourceID: cronJob.ID,
- Kind: store.LeasesKindCronJob,
- Namespace: w.engine.namespace,
- WorkerID: w.config.WorkerID,
- AcquiredAt: now,
- ExpiresAt: now + (5 * time.Minute).Milliseconds(), // 5 minute lease for cron execution
- HeartbeatAt: now,
- })
- if err != nil {
- continue
- }
-
- w.executeCronJob(ctx, cronJob)
-
- if err := store.Query.ReleaseLease(ctx, w.engine.GetDB(), store.ReleaseLeaseParams{
- ResourceID: cronJob.ID,
- WorkerID: w.config.WorkerID,
- }); err != nil {
- w.engine.logger.Error("Failed to release cron job lease",
- "cron_job_id", cronJob.ID,
- "cron_name", cronJob.Name,
- "worker_id", w.config.WorkerID,
- "error", err.Error(),
- )
- }
- }
-}
-
-func (w *worker) executeCronJob(ctx context.Context, cronJob store.CronJob) {
-
- now := w.engine.clock.Now().UnixMilli()
-
- payload := &CronPayload{
- CronJobID: cronJob.ID,
- CronName: cronJob.Name,
- ScheduledAt: cronJob.NextRunAt,
- ActualRunAt: now,
- Namespace: cronJob.Namespace,
- }
-
- handler, exists := w.engine.cronHandlers[cronJob.Name]
- if !exists {
- return
- }
-
- // Execute cron handler with panic recovery
- func() {
- defer func() {
- if r := recover(); r != nil {
- w.engine.logger.Error("Cron handler panicked",
- "cron_job_id", cronJob.ID,
- "cron_name", cronJob.Name,
- "panic", r,
- )
- }
- }()
- if err := handler(ctx, *payload); err != nil {
- w.engine.logger.Error("Cron handler execution failed",
- "cron_job_id", cronJob.ID,
- "cron_name", cronJob.Name,
- "error", err.Error(),
- )
- }
- }()
-
- // Update cron job with lease validation - only if worker holds valid cron lease
- nextRun := calculateNextRun(cronJob.CronSpec, w.engine.clock.Now())
- updateTime := w.engine.clock.Now().UnixMilli()
- result, err := w.engine.GetDB().ExecContext(ctx, `
- UPDATE cron_jobs
- SET last_run_at = ?, next_run_at = ?, updated_at = ?
- WHERE id = ? AND namespace = ?
- AND EXISTS (
- SELECT 1 FROM leases
- WHERE resource_id = ? AND kind = 'cron_job'
- AND worker_id = ? AND expires_at > ?
- )`,
- sql.NullInt64{Int64: now, Valid: true},
- nextRun,
- updateTime,
- cronJob.ID,
- w.engine.namespace,
- cronJob.ID, // resource_id for lease check
- w.config.WorkerID, // worker_id for lease check
- updateTime, // expires_at check
- )
- if err != nil {
- w.engine.logger.Error("Failed to update cron job last run time",
- "cron_job_id", cronJob.ID,
- "cron_name", cronJob.Name,
- "namespace", w.engine.namespace,
- "last_run", now,
- "next_run", nextRun,
- "error", err.Error(),
- )
- } else {
- // Check if the update actually happened (lease validation)
- if rowsAffected, checkErr := result.RowsAffected(); checkErr != nil {
- w.engine.logger.Error("Failed to check cron job update result",
- "cron_job_id", cronJob.ID,
- "error", checkErr.Error(),
- )
- } else if rowsAffected == 0 {
- w.engine.logger.Warn("Cron job update failed: lease expired or invalid",
- "cron_job_id", cronJob.ID,
- "worker_id", w.config.WorkerID,
- )
- }
- }
-
-}
-
-// acquireWorkflowLease implements workflow lease acquisition using new Query pattern
-func (w *worker) acquireWorkflowLease(ctx context.Context, workflowID, workerID string) error {
- now := w.clock.Now().UnixMilli()
- expiresAt := now + w.config.ClaimTimeout.Milliseconds()
-
- // Begin transaction
- tx, err := w.engine.GetDB().BeginTx(ctx, nil)
- if err != nil {
- return err
- }
- defer func() {
- if rollbackErr := tx.Rollback(); rollbackErr != nil && rollbackErr != sql.ErrTxDone {
- w.engine.logger.Error("failed to rollback transaction", "error", rollbackErr)
- }
- }()
-
- // First, check if workflow is still available for leasing
- workflow, err := store.Query.GetWorkflow(ctx, tx, store.GetWorkflowParams{
- ID: workflowID,
- Namespace: w.engine.namespace,
- })
- if err != nil {
- if db.IsNotFound(err) {
- return fmt.Errorf("workflow not found")
- }
- return err
- }
-
- // Check if workflow is in a valid state for execution
- if workflow.Status != store.WorkflowExecutionsStatusPending &&
- workflow.Status != store.WorkflowExecutionsStatusFailed &&
- workflow.Status != store.WorkflowExecutionsStatusSleeping {
- return fmt.Errorf("workflow not available for execution, status: %s", workflow.Status)
- }
-
- // Check for retry timing if it's a failed workflow
- if workflow.Status == store.WorkflowExecutionsStatusFailed &&
- workflow.NextRetryAt.Valid && workflow.NextRetryAt.Int64 > now {
- return fmt.Errorf("workflow retry not yet due")
- }
-
- // Check for sleep timing if it's a sleeping workflow
- if workflow.Status == store.WorkflowExecutionsStatusSleeping &&
- workflow.SleepUntil.Valid && workflow.SleepUntil.Int64 > now {
- return fmt.Errorf("workflow still sleeping")
- }
-
- // Try to create the lease
- err = store.Query.CreateLease(ctx, tx, store.CreateLeaseParams{
- ResourceID: workflowID,
- Kind: store.LeasesKindWorkflow,
- Namespace: w.engine.namespace,
- WorkerID: workerID,
- AcquiredAt: now,
- ExpiresAt: expiresAt,
- HeartbeatAt: now,
- })
- if err != nil {
- // If lease creation failed, try to take over ONLY expired leases
- leaseResult, leaseErr := tx.ExecContext(ctx, `
- UPDATE leases
- SET worker_id = ?, acquired_at = ?, expires_at = ?, heartbeat_at = ?
- WHERE resource_id = ? AND kind = ? AND expires_at < ?`,
- workerID, now, expiresAt, now, workflowID, store.LeasesKindWorkflow, now)
- if leaseErr != nil {
- return fmt.Errorf("failed to check for expired lease: %w", leaseErr)
- }
-
- // Check if we actually took over an expired lease
- rowsAffected, rowsErr := leaseResult.RowsAffected()
- if rowsErr != nil {
- return fmt.Errorf("failed to check lease takeover result: %w", rowsErr)
- }
- if rowsAffected == 0 {
- return fmt.Errorf("workflow is already leased by another worker")
- }
- }
-
- // Update workflow to running status
- err = store.Query.UpdateWorkflowToRunning(ctx, tx, store.UpdateWorkflowToRunningParams{
- StartedAt: sql.NullInt64{Int64: now, Valid: true},
- ID: workflowID,
- Namespace: w.engine.namespace,
- ResourceID: workflowID,
- WorkerID: w.config.WorkerID,
- ExpiresAt: now,
- })
- if err != nil {
- return fmt.Errorf("failed to update workflow status: %w", err)
- }
-
- // Commit the transaction
- return tx.Commit()
-}
-
-func (w *worker) Start(ctx context.Context) error {
- go w.run(ctx)
- return nil
-}
-
-func (w *worker) Shutdown(ctx context.Context) error {
- select {
- case <-w.shutdownC:
- default:
- close(w.shutdownC)
- }
-
- select {
- case <-w.doneC:
- return nil
- case <-ctx.Done():
- return ctx.Err()
- }
-}
diff --git a/go/pkg/hydra/worker_heartbeat_test.go b/go/pkg/hydra/worker_heartbeat_test.go
deleted file mode 100644
index 49aecd076f..0000000000
--- a/go/pkg/hydra/worker_heartbeat_test.go
+++ /dev/null
@@ -1,138 +0,0 @@
-package hydra
-
-import (
- "context"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/uid"
-)
-
-// TestWorkerHeartbeatFunctionality ensures that workers send heartbeats to maintain their leases
-// and prevent workflows from being incorrectly marked as orphaned when workers are healthy.
-func TestWorkerHeartbeatFunctionality(t *testing.T) {
- // Arrange: Create engine with test clock for deterministic timing
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- // Create a workflow that will run for a while to give us time to test heartbeats
- workflow := &longRunningWorkflow{
- engine: engine,
- name: "heartbeat-test-workflow",
- executeTime: 5 * time.Second, // Run longer than heartbeat interval
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
- defer cancel()
-
- // Start workflow
- executionID, err := workflow.Start(ctx, struct{}{})
- require.NoError(t, err)
-
- // Start worker with short heartbeat interval for faster testing
- workerID := uid.New(uid.WorkerPrefix)
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: workerID,
- Concurrency: 1,
- PollInterval: 100 * time.Millisecond,
- HeartbeatInterval: 1 * time.Second, // Send heartbeats frequently
- ClaimTimeout: 10 * time.Second, // Long enough for multiple heartbeats
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, workflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- // Act: Let worker pick up workflow and start sending heartbeats
- testClock.Tick(200 * time.Millisecond) // Trigger initial poll
- time.Sleep(50 * time.Millisecond) // Let worker pick up the workflow
-
- // Keep triggering polls until workflow is picked up
- require.Eventually(t, func() bool {
- testClock.Tick(200 * time.Millisecond)
- time.Sleep(10 * time.Millisecond)
-
- // Check if workflow has been picked up
- currentStatus, getErr := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- if getErr != nil {
- return false
- }
- return currentStatus.Status != store.WorkflowExecutionsStatusPending
- }, 3*time.Second, 50*time.Millisecond, "Worker should pick up workflow within timeout")
-
- // Verify workflow is being processed
- workflowStatus, err := store.Query.GetWorkflow(ctx, engine.GetDB(), store.GetWorkflowParams{
- ID: executionID,
- Namespace: engine.GetNamespace(),
- })
- require.NoError(t, err)
- require.Equal(t, store.WorkflowExecutionsStatusRunning, workflowStatus.Status, "Workflow should be running")
-
- // Get initial lease
- lease, err := store.Query.GetLease(ctx, engine.GetDB(), store.GetLeaseParams{
- ResourceID: executionID,
- Kind: store.LeasesKindWorkflow,
- })
- require.NoError(t, err)
- require.Equal(t, workerID, lease.WorkerID, "Lease should be held by our worker")
-
- initialExpiresAt := lease.ExpiresAt
-
- // Advance time to trigger first heartbeat
- testClock.Tick(1500 * time.Millisecond) // Past first heartbeat interval
- time.Sleep(50 * time.Millisecond) // Let heartbeat be processed
-
- // Verify heartbeat extended the lease
- updatedLease, err := store.Query.GetLease(ctx, engine.GetDB(), store.GetLeaseParams{
- ResourceID: executionID,
- Kind: store.LeasesKindWorkflow,
- })
- require.NoError(t, err)
- require.Equal(t, workerID, updatedLease.WorkerID, "Lease should still be held by our worker")
- require.Greater(t, updatedLease.ExpiresAt, initialExpiresAt,
- "HEARTBEAT FAILURE: Lease expiration should be extended after heartbeat. "+
- "Initial: %d, Updated: %d. This means the worker is not sending heartbeats properly, "+
- "which could cause healthy workers to lose their leases prematurely.",
- initialExpiresAt, updatedLease.ExpiresAt)
- require.Greater(t, updatedLease.HeartbeatAt, lease.HeartbeatAt,
- "HeartbeatAt timestamp should be updated")
-
- // The key test: verify heartbeat actually extended the lease
- extensionAmount := updatedLease.ExpiresAt - initialExpiresAt
- require.Greater(t, extensionAmount, int64(0),
- "HEARTBEAT SUCCESS: Lease was extended by %d ms. Heartbeats are working correctly.", extensionAmount)
-
-}
-
-// longRunningWorkflow simulates a workflow that takes time to execute,
-// giving us opportunity to test heartbeat behavior during execution
-type longRunningWorkflow struct {
- engine *Engine
- name string
- executeTime time.Duration
-}
-
-func (w *longRunningWorkflow) Name() string {
- return w.name
-}
-
-func (w *longRunningWorkflow) Run(ctx WorkflowContext, req any) error {
- // Simulate long-running work by sleeping
- // In a real test, this would be actual work that takes time
- time.Sleep(w.executeTime)
- return nil
-}
-
-func (w *longRunningWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
diff --git a/go/pkg/hydra/worker_polling_test.go b/go/pkg/hydra/worker_polling_test.go
deleted file mode 100644
index 1117822682..0000000000
--- a/go/pkg/hydra/worker_polling_test.go
+++ /dev/null
@@ -1,314 +0,0 @@
-package hydra
-
-import (
- "context"
- "fmt"
- "sync"
- "sync/atomic"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
-)
-
-// TestWorkerPollingEfficiency verifies that workers can handle concurrent load
-// without excessive database contention or resource exhaustion
-func TestWorkerPollingEfficiency(t *testing.T) {
- engine := newTestEngine(t)
-
- const (
- numWorkers = 10
- numWorkflows = 50
- testDuration = 5 * time.Second
- )
-
- var completedWorkflows atomic.Int64
-
- // Create workflow that tracks completion
- pollingWorkflow := &pollingTestWorkflow{
- engine: engine,
- name: "polling-test-workflow",
- onPoll: func() {
- completedWorkflows.Add(1)
- },
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), testDuration)
- defer cancel()
-
- // Start workers
- var wg sync.WaitGroup
- for i := 0; i < numWorkers; i++ {
- wg.Add(1)
- go func(workerID int) {
- defer wg.Done()
-
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: fmt.Sprintf("polling-worker-%d", workerID),
- Concurrency: 5, // Multiple workflows per worker
- PollInterval: 100 * time.Millisecond,
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, pollingWorkflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- <-ctx.Done()
- }(i)
- }
-
- // Submit workflows for processing
- for i := 0; i < numWorkflows; i++ {
- _, err := pollingWorkflow.Start(ctx, fmt.Sprintf("poll-test-%d", i))
- require.NoError(t, err)
- }
-
- // Wait for completion or timeout
- require.Eventually(t, func() bool {
- return completedWorkflows.Load() >= int64(numWorkflows)
- }, testDuration, 100*time.Millisecond,
- "Should complete %d workflows within %v", numWorkflows, testDuration)
-
- wg.Wait()
-
- // Verify all workflows were processed
- finalCompleted := completedWorkflows.Load()
- require.GreaterOrEqual(t, finalCompleted, int64(numWorkflows),
- "Should have completed at least %d workflows, got %d", numWorkflows, finalCompleted)
-}
-
-// TestWorkerPollingAccuracy tests that workers actually poll at the configured interval
-func TestWorkerPollingAccuracy(t *testing.T) {
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- const pollInterval = 200 * time.Millisecond
- const tolerance = 50 * time.Millisecond // 25% tolerance
-
- var pollTimes []time.Time
- var mu sync.Mutex
-
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: "accuracy-test-worker",
- Concurrency: 1,
- PollInterval: pollInterval,
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- require.NoError(t, err)
-
- pollingWorkflow := &pollingTestWorkflow{
- engine: engine,
- name: "accuracy-test-workflow",
- onPoll: func() {
- mu.Lock()
- pollTimes = append(pollTimes, testClock.Now())
- mu.Unlock()
- },
- }
-
- err = RegisterWorkflow(worker, pollingWorkflow)
- require.NoError(t, err)
-
- ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
- defer cancel()
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- // Advance clock to trigger multiple polls
- for i := 0; i < 10; i++ {
- testClock.Tick(pollInterval)
- time.Sleep(10 * time.Millisecond) // Allow processing
- }
-
- // Analyze interval accuracy
- mu.Lock()
- if len(pollTimes) < 2 {
- mu.Unlock()
- t.Skip("Not enough poll events to analyze intervals")
- return
- }
-
- actualIntervals := make([]time.Duration, len(pollTimes)-1)
- for i := 1; i < len(pollTimes); i++ {
- actualIntervals[i-1] = pollTimes[i].Sub(pollTimes[i-1])
- }
- mu.Unlock()
-
- // Check each interval is within tolerance
- accurateIntervals := 0
- for _, interval := range actualIntervals {
- diff := interval - pollInterval
- if diff < 0 {
- diff = -diff
- }
-
- isAccurate := diff <= tolerance
- if isAccurate {
- accurateIntervals++
- }
-
- }
-
- accuracy := float64(accurateIntervals) / float64(len(actualIntervals)) * 100
-
- // Performance assertions
- require.GreaterOrEqual(t, accuracy, 80.0,
- "At least 80%% of polling intervals should be accurate, got %.1f%%", accuracy)
-
-}
-
-// TestThunderingHerdPrevention ensures that when many workers start at the same time,
-// they don't all poll the database simultaneously causing performance issues
-func TestThunderingHerdPrevention(t *testing.T) {
- testClock := clock.NewTestClock()
- engine := newTestEngineWithClock(t, testClock)
-
- const (
- numWorkers = 50 // Large number to stress test
- pollInterval = 100 * time.Millisecond
- )
-
- // Track when each worker polls
- pollEvents := make(chan time.Time, 1000)
-
- ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
- defer cancel()
-
- // Start all workers simultaneously
- var wg sync.WaitGroup
- for i := 0; i < numWorkers; i++ {
- wg.Add(1)
- go func(workerID int) {
- defer wg.Done()
-
- worker, err := NewWorker(engine, WorkerConfig{
- WorkerID: fmt.Sprintf("herd-worker-%d", workerID),
- Concurrency: 1,
- PollInterval: pollInterval,
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- require.NoError(t, err)
-
- pollingWorkflow := &pollingTestWorkflow{
- engine: engine,
- name: "herd-test-workflow",
- onPoll: func() {
- select {
- case pollEvents <- testClock.Now():
- default:
- // Channel full, skip
- }
- },
- }
-
- err = RegisterWorkflow(worker, pollingWorkflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- <-ctx.Done()
- }(i)
- }
-
- // Advance time to trigger polling
- go func() {
- for {
- select {
- case <-ctx.Done():
- return
- default:
- testClock.Tick(pollInterval / 4)
- time.Sleep(5 * time.Millisecond)
- }
- }
- }()
-
- wg.Wait()
- close(pollEvents)
-
- // Analyze thundering herd behavior
- pollTimes := make([]time.Time, 0)
- for pollTime := range pollEvents {
- pollTimes = append(pollTimes, pollTime)
- }
-
- // Check for clustering (thundering herd indicator)
- clustering := analyzePollingClustering(pollTimes, pollInterval)
-
- // Performance assertion
- require.Less(t, clustering, 0.5,
- "Polling clustering should be low to prevent thundering herd, got %.2f", clustering)
-
-}
-
-// pollingTestWorkflow is a minimal workflow that tracks when it's polled for
-type pollingTestWorkflow struct {
- engine *Engine
- name string
- onPoll func()
-}
-
-func (w *pollingTestWorkflow) Name() string {
- return w.name
-}
-
-func (w *pollingTestWorkflow) Run(ctx WorkflowContext, req any) error {
- // This is called when the workflow is actually executed
- // We use onPoll to track when workers check for pending work
- if w.onPoll != nil {
- w.onPoll()
- }
-
- _, err := Step(ctx, "polling-step", func(context.Context) (string, error) {
- return "polled", nil
- })
- return err
-}
-
-func (w *pollingTestWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// Helper function to analyze polling clustering (thundering herd detection)
-func analyzePollingClustering(pollTimes []time.Time, pollInterval time.Duration) float64 {
- if len(pollTimes) < 2 {
- return 0
- }
-
- // Group polls by time windows
- windowSize := pollInterval / 10 // 10% of poll interval
- timeWindows := make(map[int64]int)
-
- baseTime := pollTimes[0]
- for _, pollTime := range pollTimes {
- windowIndex := pollTime.Sub(baseTime).Nanoseconds() / windowSize.Nanoseconds()
- timeWindows[windowIndex]++
- }
-
- // Calculate clustering factor (higher = more clustered)
- totalPolls := len(pollTimes)
- maxWindowCount := 0
-
- for _, count := range timeWindows {
- if count > maxWindowCount {
- maxWindowCount = count
- }
- }
-
- clustering := float64(maxWindowCount) / float64(totalPolls)
-
- return clustering
-}
diff --git a/go/pkg/hydra/workflow.go b/go/pkg/hydra/workflow.go
deleted file mode 100644
index c00ad43e80..0000000000
--- a/go/pkg/hydra/workflow.go
+++ /dev/null
@@ -1,315 +0,0 @@
-package hydra
-
-import (
- "context"
- "database/sql"
- "fmt"
- "time"
-
- "github.com/unkeyed/unkey/go/pkg/hydra/store"
- "github.com/unkeyed/unkey/go/pkg/otel/logging"
- "github.com/unkeyed/unkey/go/pkg/otel/tracing"
- "go.opentelemetry.io/otel/attribute"
-)
-
-// Workflow defines the interface for typed workflows.
-//
-// Workflows are the core business logic containers in Hydra. They define
-// a series of steps to be executed reliably with exactly-once guarantees.
-//
-// Workflows must be stateless and deterministic - they can be executed
-// multiple times with the same input and produce the same result. State
-// is managed by the workflow engine and persisted automatically.
-//
-// Type parameter TReq defines the input payload type for the workflow.
-// Use 'any' for workflows that accept different payload types.
-//
-// Example implementation:
-//
-// type OrderWorkflow struct{}
-//
-// func (w *OrderWorkflow) Name() string {
-// return "order-processing"
-// }
-//
-// func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error {
-// // Execute steps using hydra.Step()
-// payment, err := hydra.Step(ctx, "validate-payment", func(stepCtx context.Context) (*Payment, error) {
-// return validatePayment(stepCtx, req.PaymentID)
-// })
-// if err != nil {
-// return err
-// }
-//
-// // Additional steps...
-// return nil
-// }
-type Workflow[TReq any] interface {
- // Name returns a unique identifier for this workflow type.
- // The name is used to route workflow executions to the correct handler
- // and must be consistent across deployments.
- Name() string
-
- // Run executes the workflow logic with the provided context and request.
- // This method should be deterministic and idempotent.
- //
- // The context provides access to workflow execution metadata and
- // the Step() function for creating durable execution units.
- //
- // Returning an error will mark the workflow as failed and trigger
- // retry logic if configured. Use hydra.Sleep() to suspend the
- // workflow for time-based coordination.
- Run(ctx WorkflowContext, req TReq) error
-}
-
-// GenericWorkflow is a type alias for workflows that accept any request type.
-// This is useful when registering workflows that handle different payload types
-// or when the payload type is not known at compile time.
-type GenericWorkflow = Workflow[any]
-
-// RawPayload represents raw workflow input data that needs to be unmarshalled
-type RawPayload struct {
- Data []byte
-}
-
-// WorkflowContext provides access to workflow execution context and utilities.
-//
-// The context is passed to workflow Run() methods and provides access to:
-// - The underlying Go context for cancellation and timeouts
-// - Workflow execution metadata like execution ID and name
-// - Step execution utilities through the Step() function
-//
-// Workflow contexts are created and managed by the workflow engine and
-// should not be created manually.
-type WorkflowContext interface {
- // Context returns the underlying Go context for this workflow execution.
- // This context will be cancelled if the workflow is cancelled or times out.
- Context() context.Context
-
- // ExecutionID returns the unique identifier for this workflow execution.
- // This ID can be used for logging, tracking, and debugging purposes.
- ExecutionID() string
-
- // WorkflowName returns the name of the workflow being executed.
- // This matches the value returned by the workflow's Name() method.
- WorkflowName() string
-}
-
-// workflowContext implements WorkflowContext and provides internal workflow utilities
-type workflowContext struct {
- ctx context.Context
- executionID string
- workflowName string
- namespace string
- workerID string
- db *sql.DB
- marshaller Marshaller
- logger logging.Logger
- stepTimeout time.Duration
- stepMaxAttempts int32
-}
-
-func (w *workflowContext) Context() context.Context {
- return w.ctx
-}
-
-func (w *workflowContext) ExecutionID() string {
- return w.executionID
-}
-
-func (w *workflowContext) WorkflowName() string {
- return w.workflowName
-}
-
-func (w *workflowContext) markStepCompleted(stepName string, outputData []byte) error {
- // Use simple step update - we're already in workflow execution context
- return store.Query.UpdateStepStatus(w.ctx, w.db, store.UpdateStepStatusParams{
- Status: store.WorkflowStepsStatusCompleted,
- CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- OutputData: outputData,
- ErrorMessage: sql.NullString{String: "", Valid: false},
- Namespace: w.namespace,
- ExecutionID: w.executionID,
- StepName: stepName,
- })
-}
-
-func (w *workflowContext) markStepFailed(stepName string, errorMsg string) error {
- // Use simple step update - we're already in workflow execution context
- return store.Query.UpdateStepStatus(w.ctx, w.db, store.UpdateStepStatusParams{
- Status: store.WorkflowStepsStatusFailed,
- CompletedAt: sql.NullInt64{Int64: time.Now().UnixMilli(), Valid: true},
- OutputData: []byte{},
- ErrorMessage: sql.NullString{String: errorMsg, Valid: errorMsg != ""},
- Namespace: w.namespace,
- ExecutionID: w.executionID,
- StepName: stepName,
- })
-}
-
-// RegisterWorkflow registers a typed workflow with a worker.
-//
-// This function associates a workflow implementation with a worker so that
-// the worker can execute workflows of this type. The workflow's Name() method
-// is used as the unique identifier for routing workflow executions.
-//
-// The function handles type conversion transparently, allowing strongly-typed
-// workflow implementations to be registered with the generic worker interface.
-//
-// Parameters:
-// - w: The worker that will execute this workflow type
-// - workflow: The workflow implementation to register
-//
-// Example:
-//
-// type OrderWorkflow struct{}
-//
-// func (w *OrderWorkflow) Name() string { return "order-processing" }
-// func (w *OrderWorkflow) Run(ctx hydra.WorkflowContext, req *OrderRequest) error {
-// // workflow implementation
-// return nil
-// }
-//
-// orderWorkflow := &OrderWorkflow{}
-// err := hydra.RegisterWorkflow(worker, orderWorkflow)
-// if err != nil {
-// return err
-// }
-//
-// Requirements:
-// - The workflow name must be unique within the worker
-// - The workflow must implement the Workflow[TReq] interface
-// - The worker must be started with Start() after registration
-//
-// Returns an error if:
-// - A workflow with the same name is already registered
-// - The worker type is invalid
-func RegisterWorkflow[TReq any](w Worker, workflow Workflow[TReq]) error {
- worker, ok := w.(*worker)
- if !ok {
- return fmt.Errorf("invalid worker type")
- }
-
- if _, exists := worker.workflows[workflow.Name()]; exists {
- return fmt.Errorf("workflow %q is already registered", workflow.Name())
- }
-
- // Create a wrapper that handles the type conversion
- genericWorkflow := &workflowWrapper[TReq]{
- wrapped: workflow,
- }
-
- worker.workflows[workflow.Name()] = genericWorkflow
- return nil
-}
-
-// workflowWrapper wraps a typed workflow to implement GenericWorkflow
-type workflowWrapper[TReq any] struct {
- wrapped Workflow[TReq]
-}
-
-func (w *workflowWrapper[TReq]) Name() string {
- return w.wrapped.Name()
-}
-
-func (w *workflowWrapper[TReq]) Run(ctx WorkflowContext, req any) error {
- wctx, ok := ctx.(*workflowContext)
- if !ok {
- return fmt.Errorf("invalid context type, expected *workflowContext")
- }
-
- // Start tracing span for workflow execution
- workflowCtx, span := tracing.Start(wctx.ctx, fmt.Sprintf("hydra.workflow.%s", w.wrapped.Name()))
- defer span.End()
-
- span.SetAttributes(
- attribute.String("hydra.workflow.name", w.wrapped.Name()),
- attribute.String("hydra.execution.id", wctx.executionID),
- attribute.String("hydra.namespace", wctx.namespace),
- attribute.String("hydra.worker.id", wctx.workerID),
- )
-
- // Update the workflow context to use the traced context
- wctx.ctx = workflowCtx
-
- // Extract the raw payload and unmarshal it to the correct type
- rawPayload, ok := req.(*RawPayload)
- if !ok {
- err := fmt.Errorf("expected RawPayload, got %T", req)
- tracing.RecordError(span, err)
- return err
- }
-
- var typedReq TReq
- if err := wctx.marshaller.Unmarshal(rawPayload.Data, &typedReq); err != nil {
- tracing.RecordError(span, err)
- return fmt.Errorf("failed to unmarshal workflow request: %w", err)
- }
-
- // Pass the updated workflow context (with traced context) to the workflow implementation
- err := w.wrapped.Run(wctx, typedReq)
- if err != nil {
- tracing.RecordError(span, err)
-
- span.SetAttributes(attribute.String("hydra.workflow.status", "failed"))
- } else {
- span.SetAttributes(attribute.String("hydra.workflow.status", "completed"))
- }
-
- return err
-}
-
-// WorkflowOption defines a function that configures workflow execution
-type WorkflowOption func(*WorkflowConfig)
-
-// WorkflowConfig holds the configuration for workflow execution
-type WorkflowConfig struct {
- MaxAttempts int32
-
- TimeoutDuration time.Duration
-
- RetryBackoff time.Duration
-
- TriggerType store.WorkflowExecutionsTriggerType
- TriggerSource *string
-}
-
-// WithMaxAttempts sets the maximum number of retry attempts for a workflow
-func WithMaxAttempts(attempts int32) WorkflowOption {
- return func(c *WorkflowConfig) {
- c.MaxAttempts = attempts
- }
-}
-
-// WithTimeout sets the timeout duration for a workflow
-func WithTimeout(timeout time.Duration) WorkflowOption {
- return func(c *WorkflowConfig) {
- c.TimeoutDuration = timeout
- }
-}
-
-// WithRetryBackoff sets the retry backoff duration for a workflow
-func WithRetryBackoff(backoff time.Duration) WorkflowOption {
- return func(c *WorkflowConfig) {
- c.RetryBackoff = backoff
- }
-}
-
-// WithTrigger sets the trigger type and source for a workflow
-func WithTrigger(triggerType store.WorkflowExecutionsTriggerType, triggerSource *string) WorkflowOption {
- return func(c *WorkflowConfig) {
- c.TriggerType = triggerType
- c.TriggerSource = triggerSource
- }
-}
-
-// WorkflowSuspendedError represents an error that suspends workflow execution until a specific time
-type WorkflowSuspendedError struct {
- Reason string
-
- ResumeTime int64
-}
-
-func (e *WorkflowSuspendedError) Error() string {
- return fmt.Sprintf("workflow suspended for %s until %d", e.Reason, e.ResumeTime)
-}
diff --git a/go/pkg/hydra/workflow_performance_test.go b/go/pkg/hydra/workflow_performance_test.go
deleted file mode 100644
index 148acb614a..0000000000
--- a/go/pkg/hydra/workflow_performance_test.go
+++ /dev/null
@@ -1,422 +0,0 @@
-package hydra
-
-import (
- "context"
- "fmt"
- "sync/atomic"
- "testing"
- "time"
-
- "github.com/stretchr/testify/require"
- "github.com/unkeyed/unkey/go/pkg/clock"
-)
-
-// TestWorkflowPickupLatencyBaseline measures the baseline latency for a single worker
-// to pick up and start executing a single workflow. This establishes our performance
-// baseline before testing the 5-second SLA requirement.
-func TestWorkflowPickupLatencyBaseline(t *testing.T) {
- // Arrange: Create engine with real clock for accurate timing
- realClock := clock.New()
- engine := newTestEngineWithClock(t, realClock)
-
- var workflowStartTime atomic.Int64
-
- // Create a workflow that records when it actually starts executing
- workflow := &latencyTestWorkflow{
- engine: engine,
- name: "baseline-latency-workflow",
- onStart: func() {
- workflowStartTime.Store(time.Now().UnixMilli())
- },
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
-
- // Start worker with production-like configuration
- worker, err := NewWorker(engine, WorkerConfig{
- Concurrency: 1,
- PollInterval: 100 * time.Millisecond, // Realistic poll interval
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- require.NoError(t, err)
-
- err = RegisterWorkflow(worker, workflow)
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- // Act: Record submission time and start workflow
- _, err = workflow.Start(ctx, struct{}{})
- require.NoError(t, err)
-
- // Wait for workflow to start executing
- require.Eventually(t, func() bool {
- return workflowStartTime.Load() != 0
- }, 5*time.Second, 10*time.Millisecond, "Workflow should start executing within 5 seconds")
-
- // Calculate pickup latency
- latency := time.Since(time.UnixMilli(workflowStartTime.Load()))
-
- require.Less(t, latency, 5*time.Second, "Pickup latency should be less than 5 seconds for baseline test")
-}
-
-// latencyTestWorkflow is a minimal workflow for testing pickup latency
-type latencyTestWorkflow struct {
- engine *Engine
- name string
- onStart func()
-}
-
-func (w *latencyTestWorkflow) Name() string {
- return w.name
-}
-
-func (w *latencyTestWorkflow) Run(ctx WorkflowContext, req any) error {
- // Record when workflow actually starts executing
- if w.onStart != nil {
- w.onStart()
- }
-
- // Minimal work to complete quickly
- _, err := Step(ctx, "timing-step", func(context.Context) (string, error) {
- return "completed", nil
- })
- return err
-}
-
-func (w *latencyTestWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// TestWorkflowPickupLatencyConcurrent verifies that ALL workflows are picked up within the 5-second SLA
-// under concurrent load. This tests the critical requirement that every workflow must be processed
-// within the SLA window, not just the average.
-func TestWorkflowPickupLatencyConcurrent(t *testing.T) {
- // Arrange: Create engine with real clock for accurate timing
- realClock := clock.New()
- engine := newTestEngineWithClock(t, realClock)
-
- const numWorkers = 5 // Multiple workers to test concurrent performance
- const numWorkflows = 50 // Realistic batch to stress test SLA compliance
-
- var completedCount atomic.Int64
- var maxLatency atomic.Int64
- var slaViolations atomic.Int64
-
- // Create workflow factory that records completion timing
- createWorkflow := func(id int) *concurrentLatencyWorkflow {
- return &concurrentLatencyWorkflow{
- engine: engine,
- name: "concurrent-latency-workflow",
- id: id,
- onComplete: func(latencyMs int64) {
- // Track maximum latency across all workflows
- for {
- current := maxLatency.Load()
- if latencyMs <= current || maxLatency.CompareAndSwap(current, latencyMs) {
- break
- }
- }
-
- // Count SLA violations (workflows taking >5s)
- if latencyMs > 5000 {
- slaViolations.Add(1)
- t.Errorf("SLA VIOLATION: Workflow %d took %dms (>5000ms) to be picked up", id, latencyMs)
- }
-
- completedCount.Add(1)
- },
- }
- }
-
- ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
- defer cancel()
-
- // Start multiple workers sharing the same database
- workers := make([]Worker, numWorkers)
- for i := 0; i < numWorkers; i++ {
- worker, err := NewWorker(engine, WorkerConfig{
- Concurrency: 10, // Reasonable concurrency per worker
- PollInterval: 50 * time.Millisecond, // Fast polling for concurrent load
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- require.NoError(t, err)
-
- // Register the workflow type with each worker
- err = RegisterWorkflow(worker, createWorkflow(0))
- require.NoError(t, err)
-
- err = worker.Start(ctx)
- require.NoError(t, err)
- defer worker.Shutdown(ctx)
-
- workers[i] = worker
- }
-
- // Act: Submit all workflows as quickly as possible
- submissionStart := time.Now()
- executionIDs := make([]string, numWorkflows)
-
- for i := 0; i < numWorkflows; i++ {
- workflow := createWorkflow(i)
- executionID, err := workflow.Start(ctx, submissionStart.UnixMilli())
- require.NoError(t, err)
- executionIDs[i] = executionID
- }
-
- _ = time.Since(submissionStart) // Submission timing not needed for SLA test
-
- // Wait for all workflows to complete
- require.Eventually(t, func() bool {
- return completedCount.Load() == numWorkflows
- }, 15*time.Second, 100*time.Millisecond,
- "All %d workflows should complete within timeout", numWorkflows)
-
- // Assert SLA compliance: ALL workflows must be picked up within 5 seconds
- finalSlaViolations := slaViolations.Load()
- finalMaxLatency := maxLatency.Load()
-
- require.Equal(t, int64(0), finalSlaViolations,
- "SLA VIOLATION: %d out of %d workflows took longer than 5 seconds to be picked up",
- finalSlaViolations, numWorkflows)
-
- require.Less(t, finalMaxLatency, int64(5000),
- "SLA VIOLATION: Maximum pickup latency was %dms, must be <5000ms for ALL workflows",
- finalMaxLatency)
-
-}
-
-// concurrentLatencyWorkflow tracks individual workflow latency in concurrent scenarios
-type concurrentLatencyWorkflow struct {
- engine *Engine
- name string
- id int
- onComplete func(latencyMs int64)
-}
-
-func (w *concurrentLatencyWorkflow) Name() string {
- return w.name
-}
-
-func (w *concurrentLatencyWorkflow) Run(ctx WorkflowContext, req any) error {
- var submissionTime int64
- switch v := req.(type) {
- case int64:
- submissionTime = v
- case float64:
- submissionTime = int64(v) // JSON unmarshaling converts numbers to float64
- default:
- return fmt.Errorf("expected int64 or float64 submission time, got %T", req)
- }
-
- // Calculate latency from submission to execution start
- latency := time.Now().UnixMilli() - submissionTime
-
- // Report completion with latency
- if w.onComplete != nil {
- w.onComplete(latency)
- }
-
- // Minimal work to complete quickly
- _, err := Step(ctx, "latency-step", func(context.Context) (string, error) {
- return "completed", nil
- })
-
- return err
-}
-
-func (w *concurrentLatencyWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// BenchmarkWorkflowSubmission measures the rate at which workflows can be submitted
-func BenchmarkWorkflowSubmission(b *testing.B) {
- engine := newTestEngineBench(b)
-
- workflow := &benchmarkWorkflow{
- engine: engine,
- name: "benchmark-workflow",
- }
-
- ctx := context.Background()
-
- b.ResetTimer()
- b.RunParallel(func(pb *testing.PB) {
- for pb.Next() {
- _, err := workflow.Start(ctx, struct{}{})
- if err != nil {
- b.Fatal(err)
- }
- }
- })
-}
-
-// BenchmarkWorkflowThroughput measures end-to-end workflow processing throughput
-func BenchmarkWorkflowThroughput(b *testing.B) {
- engine := newTestEngineBench(b)
-
- workflow := &benchmarkWorkflow{
- engine: engine,
- name: "throughput-workflow",
- }
-
- // Start a single worker
- worker, err := NewWorker(engine, WorkerConfig{
- Concurrency: 10, // Process multiple workflows concurrently
- PollInterval: 10 * time.Millisecond, // Fast polling for benchmarks
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- if err != nil {
- b.Fatal(err)
- }
-
- err = RegisterWorkflow(worker, workflow)
- if err != nil {
- b.Fatal(err)
- }
-
- ctx, cancel := context.WithCancel(context.Background())
- defer cancel()
-
- err = worker.Start(ctx)
- if err != nil {
- b.Fatal(err)
- }
- defer worker.Shutdown(ctx)
-
- // Give worker time to start
- time.Sleep(50 * time.Millisecond)
-
- b.ResetTimer()
-
- // Track completion
- var completed atomic.Int64
- workflow.onComplete = func() {
- completed.Add(1)
- }
-
- // Submit N workflows as fast as possible
- submissionStart := time.Now()
- for i := 0; i < b.N; i++ {
- _, err := workflow.Start(ctx, struct{}{})
- if err != nil {
- b.Fatal(err)
- }
- }
- submissionDuration := time.Since(submissionStart)
-
- // Wait for all workflows to complete
- for completed.Load() < int64(b.N) {
- time.Sleep(1 * time.Millisecond)
- }
-
- b.ReportMetric(float64(b.N)/submissionDuration.Seconds(), "submissions/sec")
- b.ReportMetric(float64(b.N)/b.Elapsed().Seconds(), "completions/sec")
-}
-
-// BenchmarkSingleWorkerLatency measures latency with a single worker processing one workflow at a time
-func BenchmarkSingleWorkerLatency(b *testing.B) {
- engine := newTestEngineBench(b)
-
- workflow := &benchmarkWorkflow{
- engine: engine,
- name: "latency-workflow",
- }
-
- worker, err := NewWorker(engine, WorkerConfig{
- Concurrency: 1, // Single workflow at a time
- PollInterval: 1 * time.Millisecond, // Very fast polling
- HeartbeatInterval: 5 * time.Second,
- ClaimTimeout: 30 * time.Second,
- })
- if err != nil {
- b.Fatal(err)
- }
-
- err = RegisterWorkflow(worker, workflow)
- if err != nil {
- b.Fatal(err)
- }
-
- ctx, cancel := context.WithCancel(context.Background())
- defer cancel()
-
- err = worker.Start(ctx)
- if err != nil {
- b.Fatal(err)
- }
- defer worker.Shutdown(ctx)
-
- time.Sleep(50 * time.Millisecond)
-
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- completed := make(chan struct{})
- workflow.onComplete = func() {
- close(completed)
- }
-
- start := time.Now()
- _, err := workflow.Start(ctx, struct{}{})
- if err != nil {
- b.Fatal(err)
- }
-
- <-completed
- latency := time.Since(start)
-
- // Report per-operation timing
- if i == 0 {
- b.ReportMetric(float64(latency.Nanoseconds()), "ns/workflow")
- }
- }
-}
-
-// benchmarkWorkflow is a minimal workflow for benchmarking
-type benchmarkWorkflow struct {
- engine *Engine
- name string
- onComplete func()
-}
-
-func (w *benchmarkWorkflow) Name() string {
- return w.name
-}
-
-func (w *benchmarkWorkflow) Run(ctx WorkflowContext, req any) error {
- // Minimal work - just complete a simple step
- _, err := Step(ctx, "benchmark-step", func(context.Context) (string, error) {
- return "done", nil
- })
-
- if w.onComplete != nil {
- w.onComplete()
- }
-
- return err
-}
-
-func (w *benchmarkWorkflow) Start(ctx context.Context, payload any) (string, error) {
- return w.engine.StartWorkflow(ctx, w.Name(), payload)
-}
-
-// Helper for benchmarks that need testing.TB interface
-func newTestEngineBench(tb testing.TB) *Engine {
- // Use MySQL container for benchmarks
- t, ok := tb.(*testing.T)
- if !ok {
- // For benchmarks, create a new testing.T
- t = &testing.T{}
- t.Helper()
- }
-
- // Use the unified test helper
- return newTestEngineWithClock(t, clock.New())
-}