Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion go/apps/ctrl/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,36 @@ type CloudflareConfig struct {
ApiToken string
}

// Route53Config holds the settings for solving ACME DNS-01 challenges
// through AWS Route53.
type Route53Config struct {
	// Enabled switches on DNS-01 challenges backed by AWS Route53.
	Enabled bool

	// AccessKeyID is the AWS access key ID.
	AccessKeyID string

	// SecretAccessKey is the AWS secret access key that pairs with AccessKeyID.
	SecretAccessKey string

	// Region names the AWS region, for example "us-east-1".
	Region string

	// HostedZoneID, when set, is used instead of zone auto-discovery. Set it
	// for domains whose CNAMEs (such as a wildcard CNAME pointing at a load
	// balancer) would otherwise confuse the zone lookup.
	HostedZoneID string
}

// AcmeConfig groups the ACME settings used for TLS certificates.
type AcmeConfig struct {
	// Enabled switches on ACME challenges for TLS certificates.
	Enabled bool

	// EmailDomain is the domain used for ACME account emails, for example
	// "unkey.com".
	EmailDomain string

	// Cloudflare configures DNS-01 challenges solved through Cloudflare.
	Cloudflare CloudflareConfig

	// Route53 configures DNS-01 challenges solved through AWS Route53.
	Route53 Route53Config
}

type RestateConfig struct {
Expand Down Expand Up @@ -206,6 +230,17 @@ func (c Config) Validate() error {
}
}

// Validate Route53 configuration if enabled
if c.Acme.Enabled && c.Acme.Route53.Enabled {
if err := assert.All(
assert.NotEmpty(c.Acme.Route53.AccessKeyID, "route53 access key ID is required when route53 is enabled"),
assert.NotEmpty(c.Acme.Route53.SecretAccessKey, "route53 secret access key is required when route53 is enabled"),
assert.NotEmpty(c.Acme.Route53.Region, "route53 region is required when route53 is enabled"),
); err != nil {
return err
}
}

if err := assert.NotEmpty(c.ClickhouseURL, "ClickhouseURL is required"); err != nil {
return err
}
Expand Down
58 changes: 58 additions & 0 deletions go/apps/ctrl/internal/caches/caches.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package caches

import (
"time"

"github.com/unkeyed/unkey/go/pkg/cache"
"github.com/unkeyed/unkey/go/pkg/clock"
"github.com/unkeyed/unkey/go/pkg/db"
"github.com/unkeyed/unkey/go/pkg/otel/logging"
)

// Caches bundles the cache instances shared across the ctrl application.
type Caches struct {
	// Domains caches db.CustomDomain values keyed by string.
	Domains cache.Cache[string, db.CustomDomain]

	// Challenges caches db.AcmeChallenge values keyed by string.
	Challenges cache.Cache[string, db.AcmeChallenge]
}

// Config carries the dependencies New uses to build the caches.
type Config struct {
	// Logger is passed to every cache that New creates.
	Logger logging.Logger

	// Clock is passed to every cache that New creates. When it is nil, New
	// falls back to clock.New().
	Clock clock.Clock
}

// New builds the shared caches for the ctrl application.
//
// A nil cfg.Clock is replaced with clock.New(). If either cache cannot be
// created, the error from cache.New is returned unchanged together with a
// nil *Caches.
func New(cfg Config) (*Caches, error) {
	// cfg is a copy, so defaulting the clock here never touches the caller's value.
	if cfg.Clock == nil {
		cfg.Clock = clock.New()
	}

	domainCache, domainErr := cache.New(cache.Config[string, db.CustomDomain]{
		Resource: "domains",
		MaxSize:  10000,
		Fresh:    5 * time.Minute,
		Stale:    10 * time.Minute,
		Logger:   cfg.Logger,
		Clock:    cfg.Clock,
	})
	if domainErr != nil {
		return nil, domainErr
	}

	// Challenges change while an ACME flow is in progress, so their entries
	// are kept for a much shorter time than domains.
	challengeCache, challengeErr := cache.New(cache.Config[string, db.AcmeChallenge]{
		Resource: "acme_challenges",
		MaxSize:  1000,
		Fresh:    10 * time.Second,
		Stale:    30 * time.Second,
		Logger:   cfg.Logger,
		Clock:    cfg.Clock,
	})
	if challengeErr != nil {
		return nil, challengeErr
	}

	bundle := &Caches{
		Domains:    domainCache,
		Challenges: challengeCache,
	}
	return bundle, nil
}
168 changes: 162 additions & 6 deletions go/apps/ctrl/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,22 @@ package ctrl
import (
"bytes"
"context"
"database/sql"
"fmt"
"log/slog"
"net/http"
"os"
"time"

"connectrpc.com/connect"
"github.com/go-acme/lego/v4/challenge"
restate "github.com/restatedev/sdk-go"
restateIngress "github.com/restatedev/sdk-go/ingress"
restateServer "github.com/restatedev/sdk-go/server"
ctrlCaches "github.com/unkeyed/unkey/go/apps/ctrl/internal/caches"
"github.com/unkeyed/unkey/go/apps/ctrl/middleware"
"github.com/unkeyed/unkey/go/apps/ctrl/services/acme"
"github.com/unkeyed/unkey/go/apps/ctrl/services/acme/providers"
"github.com/unkeyed/unkey/go/apps/ctrl/services/build/backend/depot"
"github.com/unkeyed/unkey/go/apps/ctrl/services/build/backend/docker"
buildStorage "github.com/unkeyed/unkey/go/apps/ctrl/services/build/storage"
Expand All @@ -29,11 +35,13 @@ import (
hydrav1 "github.com/unkeyed/unkey/go/gen/proto/hydra/v1"
"github.com/unkeyed/unkey/go/gen/proto/krane/v1/kranev1connect"
"github.com/unkeyed/unkey/go/pkg/clickhouse"
"github.com/unkeyed/unkey/go/pkg/clock"
"github.com/unkeyed/unkey/go/pkg/db"
"github.com/unkeyed/unkey/go/pkg/otel"
"github.com/unkeyed/unkey/go/pkg/otel/logging"
"github.com/unkeyed/unkey/go/pkg/retry"
"github.com/unkeyed/unkey/go/pkg/shutdown"
"github.com/unkeyed/unkey/go/pkg/uid"
"github.com/unkeyed/unkey/go/pkg/vault"
"github.com/unkeyed/unkey/go/pkg/vault/storage"
pkgversion "github.com/unkeyed/unkey/go/pkg/version"
Expand All @@ -47,6 +55,11 @@ func Run(ctx context.Context, cfg Config) error {
return fmt.Errorf("bad config: %w", err)
}

// Disable CNAME following in lego to prevent it from following wildcard CNAMEs
// (e.g., *.example.com -> loadbalancer.aws.com) and failing Route53 zone lookup.
// Must be set before creating any ACME DNS providers.
os.Setenv("LEGO_DISABLE_CNAME_SUPPORT", "true")
Comment on lines +58 to +61
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

cat -n go/apps/ctrl/run.go | head -80 | tail -30

Repository: unkeyed/unkey

Length of output: 1104


🏁 Script executed:

rg -n "LEGO_DISABLE_CNAME_SUPPORT" go/apps/ctrl/run.go -B 5 -A 5

Repository: unkeyed/unkey

Length of output: 466


🏁 Script executed:

fd "cloudflare" go/apps/ctrl --type f

Repository: unkeyed/unkey

Length of output: 116


🏁 Script executed:

sed -n '50,300p' go/apps/ctrl/run.go | head -150

Repository: unkeyed/unkey

Length of output: 4600


🏁 Script executed:

cat go/apps/ctrl/services/acme/providers/cloudflare_provider.go

Repository: unkeyed/unkey

Length of output: 1094


🏁 Script executed:

sed -n '61,400p' go/apps/ctrl/run.go | grep -n -A 5 -B 5 "cfg.Acme\|ACME" | head -100

Repository: unkeyed/unkey

Length of output: 3909


🏁 Script executed:

rg -n "cfg\.Acme\.Enabled|if.*Acme" go/apps/ctrl/run.go -B 2 -A 2

Repository: unkeyed/unkey

Length of output: 1728


🌐 Web query:

lego LEGO_DISABLE_CNAME_SUPPORT cloudflare DNS-01 challenge impact

💡 Result:

Short answer: LEGO_DISABLE_CNAME_SUPPORT controls whether the lego ACME client follows CNAMEs when resolving dns‑01 challenge names. If set (true), lego will NOT follow CNAME aliases; if unset (default) lego will follow them. [1][2]

Impact when using Cloudflare and dns‑01:

  • If you use CNAME aliases (e.g., delegating _acme-challenge.example.com to another zone or to acme-dns), disabling CNAME support stops lego from following that alias; lego will then look for and create TXT records in the original zone, causing validation failures. [1][4]
  • Cloudflare features (proxying / CNAME flattening or a proxied record) can already break dns‑01 verification because the TXT/CNAME may not resolve as expected; combining that with LEGO_DISABLE_CNAME_SUPPORT=true makes debugging harder, and validation is likely to fail if the TXT record actually lives under the aliased zone. See Cloudflare CNAME/CNAME-flattening notes. [3]
  • Some users/tools (e.g., Traefik integrations) mention toggling this env var when CNAME handling causes issues; often the root problem is DNS configuration or resolver choices rather than lego itself. Setting explicit DNS resolvers or fixing the CNAME/proxy settings on Cloudflare is another remedy. [5][4]

Recommendations:

  1. If you delegate _acme-challenge via CNAME to a different provider (acme-dns or another Cloudflare zone), leave CNAME support enabled (unset LEGO_DISABLE_CNAME_SUPPORT). [1][4]
  2. Ensure the CNAME record is DNS‑only (not proxied) and that Cloudflare CNAME flattening won’t interfere. [3]
  3. If you must disable CNAME following, make sure the TXT records are writable in the zone lego will query (or change your provider config so the TXT is created where lego expects). [1][4]
  4. If failures persist, check logs, confirm TXT resolution from public resolvers, or configure explicit resolvers for the DNS challenge. [5]

Sources:

  • lego CLI options (LEGO_DISABLE_CNAME_SUPPORT description). [1]
  • lego project features (CNAME support default). [2]
  • Cloudflare DNS docs on CNAME / flattening / proxied records. [3]
  • go-acme/lego GitHub issue discussing CNAME/dns‑01 delegation problems. [4]
  • Traefik community post referencing LEGO_DISABLE_CNAME_SUPPORT and resolver workarounds. [5]

Environment variable set globally before ACME providers are created; clarify Cloudflare DNS-01 impact.

The LEGO_DISABLE_CNAME_SUPPORT setting is correctly placed after validation but before any ACME DNS providers are instantiated. However, this global environment variable affects both Route53 and Cloudflare providers:

  • Route53: Disabling CNAME following prevents issues with wildcard CNAME records (e.g., *.example.com → loadbalancer.aws.com) that would fail zone lookups.
  • Cloudflare: Disabling CNAME support will break dns-01 challenges if the setup delegates _acme-challenge via CNAME to another zone or relies on CNAME flattening. This is only safe if using Cloudflare's standard direct zone record management (not CNAME-delegated challenges).

If Cloudflare users employ CNAME-based challenge delegation, this environment variable will cause dns-01 validation failures. Consider either:

  1. Documenting that Cloudflare integration requires direct zone records (not CNAME delegation), or
  2. Making this setting conditional per-provider (more complex but provider-specific).
🤖 Prompt for AI Agents
In go/apps/ctrl/run.go around lines 58-61 the global
os.Setenv("LEGO_DISABLE_CNAME_SUPPORT","true") unconditionally disables CNAME
following for all ACME DNS providers which breaks Cloudflare setups that
delegate _acme-challenge via CNAME; change this so the env var is only set when
using providers that require CNAME following disabled (e.g., Route53) or make it
configurable: detect the configured DNS provider(s) before creating providers
and either (a) set LEGO_DISABLE_CNAME_SUPPORT only if the provider is
Route53/other non-CNAME-delegated provider, or (b) add a config
flag/documentation indicating Cloudflare requires direct zone records and do not
set the env var when Cloudflare is in use. Ensure the check runs before provider
construction and keep the default behavior unchanged for existing Route53 users.


shutdowns := shutdown.New()

if cfg.OtelEnabled {
Expand Down Expand Up @@ -270,11 +283,73 @@ func Run(ctx context.Context, cfg Config) error {
DefaultDomain: cfg.DefaultDomain,
})))

restateSrv.Bind(hydrav1.NewCertificateServiceServer(certificate.New(certificate.Config{
// Initialize shared caches for ACME (needed for verification endpoint regardless of provider config)
caches, cacheErr := ctrlCaches.New(ctrlCaches.Config{
Logger: logger,
DB: database,
Vault: acmeVaultSvc,
})))
Clock: clock.New(),
})
if cacheErr != nil {
return fmt.Errorf("failed to create ACME caches: %w", cacheErr)
}

// Setup ACME challenge providers
var dnsProvider challenge.Provider
var httpProvider challenge.Provider
if cfg.Acme.Enabled {
// HTTP-01 provider for regular (non-wildcard) domains
httpProv, httpErr := providers.NewHTTPProvider(providers.HTTPConfig{
DB: database,
Logger: logger,
DomainCache: caches.Domains,
})
if httpErr != nil {
return fmt.Errorf("failed to create HTTP-01 provider: %w", httpErr)
}
httpProvider = httpProv
logger.Info("ACME HTTP-01 provider enabled")

// DNS-01 provider for wildcard domains (requires DNS provider config)
if cfg.Acme.Cloudflare.Enabled {
cfProvider, cfErr := providers.NewCloudflareProvider(providers.CloudflareConfig{
DB: database,
Logger: logger,
APIToken: cfg.Acme.Cloudflare.ApiToken,
DomainCache: caches.Domains,
})
if cfErr != nil {
return fmt.Errorf("failed to create Cloudflare DNS provider: %w", cfErr)
}
dnsProvider = cfProvider
logger.Info("ACME Cloudflare DNS-01 provider enabled for wildcard certs")
} else if cfg.Acme.Route53.Enabled {
r53Provider, r53Err := providers.NewRoute53Provider(providers.Route53Config{
DB: database,
Logger: logger,
AccessKeyID: cfg.Acme.Route53.AccessKeyID,
SecretAccessKey: cfg.Acme.Route53.SecretAccessKey,
Region: cfg.Acme.Route53.Region,
HostedZoneID: cfg.Acme.Route53.HostedZoneID,
DomainCache: caches.Domains,
})
if r53Err != nil {
return fmt.Errorf("failed to create Route53 DNS provider: %w", r53Err)
}
dnsProvider = r53Provider
logger.Info("ACME Route53 DNS-01 provider enabled for wildcard certs")
}
}

// Certificate service needs a longer timeout for ACME DNS-01 challenges
// which can take 5-10 minutes for DNS propagation
restateSrv.Bind(hydrav1.NewCertificateServiceServer(certificate.New(certificate.Config{
Logger: logger,
DB: database,
Vault: acmeVaultSvc,
EmailDomain: cfg.Acme.EmailDomain,
DefaultDomain: cfg.DefaultDomain,
DNSProvider: dnsProvider,
HTTPProvider: httpProvider,
}), restate.WithInactivityTimeout(15*time.Minute)))
restateSrv.Bind(hydrav1.NewProjectServiceServer(projectWorkflow.New(projectWorkflow.Config{
Logger: logger,
DB: database,
Expand Down Expand Up @@ -332,6 +407,29 @@ func Run(ctx context.Context, cfg Config) error {
logger.Error("failed to register with Restate after retries", "error", err.Error())
} else {
logger.Info("Successfully registered with Restate")

// Bootstrap wildcard certificate for default domain if ACME is enabled
if cfg.Acme.Enabled && dnsProvider != nil && cfg.DefaultDomain != "" {
bootstrapWildcardDomain(ctx, database, logger, cfg.DefaultDomain)
}

// Start the certificate renewal cron job if ACME is enabled
// Use Send with idempotency key so multiple restarts don't create duplicate crons
if cfg.Acme.Enabled && dnsProvider != nil {
certClient := hydrav1.NewCertificateServiceIngressClient(restateClient, "global")
_, startErr := certClient.RenewExpiringCertificates().Send(
ctx,
&hydrav1.RenewExpiringCertificatesRequest{
DaysBeforeExpiry: 30,
},
restate.WithIdempotencyKey("cert-renewal-cron-startup"),
)
if startErr != nil {
logger.Warn("failed to start certificate renewal cron", "error", startErr)
} else {
logger.Info("Certificate renewal cron job started")
}
}
}
}()
}
Expand Down Expand Up @@ -370,8 +468,10 @@ func Run(ctx context.Context, cfg Config) error {
}), connectOptions...))
mux.Handle(ctrlv1connect.NewOpenApiServiceHandler(openapi.New(database, logger), connectOptions...))
mux.Handle(ctrlv1connect.NewAcmeServiceHandler(acme.New(acme.Config{
DB: database,
Logger: logger,
DB: database,
Logger: logger,
DomainCache: caches.Domains,
ChallengeCache: caches.Challenges,
}), connectOptions...))

// Configure server
Expand Down Expand Up @@ -435,3 +535,59 @@ func Run(ctx context.Context, cfg Config) error {
logger.Info("Ctrl server shut down successfully")
return nil
}

// bootstrapWildcardDomain registers "*.<defaultDomain>" as a DNS-01 custom
// domain and inserts a matching ACME challenge in the waiting state, so the
// renewal cron can pick it up and issue the wildcard certificate.
//
// It does nothing when the wildcard domain is already present. Failures are
// logged and not returned: the function stops at the first failing step.
func bootstrapWildcardDomain(ctx context.Context, database db.Database, logger logging.Logger, defaultDomain string) {
	// Platform-managed resources are stored under the "unkey_internal" workspace.
	const platformWorkspace = "unkey_internal"

	wildcard := "*." + defaultDomain

	// A successful lookup means the domain is already registered; any failure
	// other than "not found" aborts the bootstrap.
	_, lookupErr := db.Query.FindCustomDomainByDomain(ctx, database.RO(), wildcard)
	switch {
	case lookupErr == nil:
		logger.Info("Wildcard domain already exists", "domain", wildcard)
		return
	case !db.IsNotFound(lookupErr):
		logger.Error("Failed to check for existing wildcard domain", "error", lookupErr, "domain", wildcard)
		return
	}

	newDomainID := uid.New(uid.DomainPrefix)
	createdAt := time.Now().UnixMilli()
	updatedAt := sql.NullInt64{Int64: createdAt, Valid: true}

	// Step 1: the custom domain record itself.
	domainParams := db.UpsertCustomDomainParams{
		ID:            newDomainID,
		WorkspaceID:   platformWorkspace,
		Domain:        wildcard,
		ChallengeType: db.CustomDomainsChallengeTypeDNS01,
		CreatedAt:     createdAt,
		UpdatedAt:     updatedAt,
	}
	if upsertErr := db.Query.UpsertCustomDomain(ctx, database.RW(), domainParams); upsertErr != nil {
		logger.Error("Failed to create wildcard domain", "error", upsertErr, "domain", wildcard)
		return
	}

	// Step 2: the ACME challenge, left in the waiting state with empty token
	// and authorization.
	challengeParams := db.InsertAcmeChallengeParams{
		WorkspaceID:   platformWorkspace,
		DomainID:      newDomainID,
		Token:         "",
		Authorization: "",
		Status:        db.AcmeChallengesStatusWaiting,
		ChallengeType: db.AcmeChallengesChallengeTypeDNS01,
		CreatedAt:     createdAt,
		UpdatedAt:     updatedAt,
		ExpiresAt:     0, // Filled in once the certificate is issued.
	}
	if insertErr := db.Query.InsertAcmeChallenge(ctx, database.RW(), challengeParams); insertErr != nil {
		logger.Error("Failed to create ACME challenge for wildcard domain", "error", insertErr, "domain", wildcard)
		return
	}

	logger.Info("Bootstrapped wildcard domain for certificate issuance", "domain", wildcard)
}
Loading