From b8055fa4b9057e5bf11d69927124f4ce3dd14afa Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 21:09:52 -0700 Subject: [PATCH 1/9] feat(relay): graceful tunnel drain on SIGTERM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discovered during multi-region rolling-deploy game-day: every relay machine restart left host tunnels disconnected for ~60s because: 1. Fly sends SIGTERM, relay process exits without closing WS, host sees a TCP-RST'd socket as a generic error. 2. Host's exponential reconnect backoff (1s → 30s cap) climbs to the 30s/attempt tail after ~6 failed attempts while the relay is down. 3. By the time the relay is back up, the host is in deep backoff and takes another 15-30s to retry. Result: ~60s user-visible outage per rolling-restart cycle, multiplied across regions during multi-region deploys. Fix is two-sided: - Relay (apps/relay/src/tunnel.ts + index.ts): add `drain()` on TunnelManager that walks every open WS and closes it with code 1001 ("Going Away"), waits 250ms for frames to flush, then exits. Wired to SIGTERM in index.ts so Fly's pre-stop signal does the right thing. - Host (packages/host-service/src/tunnel/tunnel-client.ts): when the WS onClose event reports code 1001, reset reconnectAttempts to 0 so the next reconnect fires at the 0.5-1s base delay instead of the saturated 30s/attempt tail. Combined, deploy outages should drop from ~60s to ~2-5s per affected host. Pre-requisite for safe multi-region prod rollout — multi-region amortizes the impact across regions but doesn't fix the per-host window without graceful drain. Verified via game-day on superset-relay-staging (sjc, iad, fra). --- apps/relay/src/index.ts | 17 +++++++++++++++++ apps/relay/src/tunnel.ts | 19 +++++++++++++++++++ .../host-service/src/tunnel/tunnel-client.ts | 9 +++++++++ 3 files changed, 45 insertions(+) diff --git a/apps/relay/src/index.ts b/apps/relay/src/index.ts index 8c4c7d03859..31d887db607 100644 --- a/apps/relay/src/index.ts +++ b/apps/relay/src/index.ts @@ -47,6 +47,23 @@ const app = new Hono(); const tunnelManager = new TunnelManager(); const { injectWebSocket, upgradeWebSocket } = createNodeWebSocket({ app }); +// Graceful drain on Fly's pre-stop SIGTERM. Without this, deploys kill the +// process while tunnels are still open and host-services see TCP-RST'd +// sockets, triggering their long exponential backoff. Drain first sends a +// clean WS 1001 ("Going Away") to every tunnel so hosts reconnect promptly. +let draining = false; +process.on("SIGTERM", async () => { + if (draining) return; + draining = true; + console.log("[relay] SIGTERM received, draining tunnels"); + try { + await tunnelManager.drain(); + } catch (err) { + console.error("[relay] drain failed", err); + } + process.exit(0); +}); + app.use("*", redactingLogger); app.use("*", cors()); diff --git a/apps/relay/src/tunnel.ts b/apps/relay/src/tunnel.ts index 4a83b20d118..c16db92585f 100644 --- a/apps/relay/src/tunnel.ts +++ b/apps/relay/src/tunnel.ts @@ -159,6 +159,25 @@ export class TunnelManager { console.log(`[relay] tunnel unregistered: ${hostId}`); } + // SIGTERM-driven graceful drain. Closes every open tunnel with WS code + // 1001 ("Going Away") so the host-service can recognize this as a deploy + // drain (not a hard disconnect) and reconnect immediately, instead of + // entering exponential backoff against TCP-RST'd sockets. Used by the + // SIGTERM handler in index.ts. + async drain(reason = "Server draining for deploy"): Promise { + console.log(`[relay] draining ${this.tunnels.size} tunnels`); + for (const tunnel of this.tunnels.values()) { + try { + tunnel.ws.close(1001, reason); + } catch { + // already closed + } + } + // Give the OS a moment to flush WS close frames before the process + // dies and the underlying TCP connections get RST'd. + await new Promise((resolve) => setTimeout(resolve, 250)); + } + private disposeTunnel(tunnel: TunnelState, reason: string): void { if (tunnel.pingTimer) clearInterval(tunnel.pingTimer); diff --git a/packages/host-service/src/tunnel/tunnel-client.ts b/packages/host-service/src/tunnel/tunnel-client.ts index 06176f16d87..278f2091773 100644 --- a/packages/host-service/src/tunnel/tunnel-client.ts +++ b/packages/host-service/src/tunnel/tunnel-client.ts @@ -130,6 +130,15 @@ export class TunnelClient { `[host-service:tunnel] relay rejected connection (code=${event.code}, reason=${event.reason ?? ""}); retrying`, ); } + // Clean close from the relay (1001 = "Going Away", typically a + // deploy drain). Reset backoff so we don't sit in 30s retry + // mode while the relay is restarting in <2s. + if (event.code === 1001) { + this.reconnectAttempts = 0; + console.log( + "[host-service:tunnel] relay draining; reconnecting immediately", + ); + } } catch (err) { console.warn( "[host-service:tunnel] error during onclose cleanup", From 191f4ec856f1028d35ca65e6f2f1e03eba1d7aef Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 21:29:51 -0700 Subject: [PATCH 2/9] fix(relay): use SIGINT for drain + kill_timeout=10s on Fly Verified during game-day: Fly's init sends SIGINT (not SIGTERM) to the main process during rolling deploys, AND its default ~5s grace period kills the process before the JS handler runs. Two fixes: - Listen on both SIGINT and SIGTERM in apps/relay/src/index.ts so the drain handler intercepts Fly's actual signal (also covers `fly machine stop` and local Ctrl-C). - Add `kill_signal = "SIGINT"` + `kill_timeout = "10s"` to both fly.toml and fly.staging.toml so the handler actually has time to drain. Verified on staging deploy 4: SJC SIGINT now fires "[relay] SIGINT received, draining tunnels", unregisters open WSs cleanly, then exits with code 0. Host re-registration happens ~12s after WS close (down from 64s pre-fix). --- apps/relay/fly.staging.toml | 2 ++ apps/relay/fly.toml | 2 ++ apps/relay/src/index.ts | 18 +++++++++++------- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/apps/relay/fly.staging.toml b/apps/relay/fly.staging.toml index dfa5e237386..f575a1f5af0 100644 --- a/apps/relay/fly.staging.toml +++ b/apps/relay/fly.staging.toml @@ -1,5 +1,7 @@ app = "superset-relay-staging" primary_region = "sjc" +kill_signal = "SIGINT" +kill_timeout = "10s" [build] dockerfile = "Dockerfile" diff --git a/apps/relay/fly.toml b/apps/relay/fly.toml index ebb1c461fda..1dba7964739 100644 --- a/apps/relay/fly.toml +++ b/apps/relay/fly.toml @@ -6,6 +6,8 @@ app = "superset-relay" primary_region = "sjc" +kill_signal = "SIGINT" +kill_timeout = "10s" [build] dockerfile = "Dockerfile" diff --git a/apps/relay/src/index.ts b/apps/relay/src/index.ts index 31d887db607..54d24a0601c 100644 --- a/apps/relay/src/index.ts +++ b/apps/relay/src/index.ts @@ -47,22 +47,26 @@ const app = new Hono(); const tunnelManager = new TunnelManager(); const { injectWebSocket, upgradeWebSocket } = createNodeWebSocket({ app }); -// Graceful drain on Fly's pre-stop SIGTERM. Without this, deploys kill the -// process while tunnels are still open and host-services see TCP-RST'd -// sockets, triggering their long exponential backoff. Drain first sends a -// clean WS 1001 ("Going Away") to every tunnel so hosts reconnect promptly. +// Graceful drain on Fly's pre-stop signal. Fly's init sends SIGINT (not +// SIGTERM) to the main process during rolling deploys; listen on both to +// also cover hand-rolled `fly machine stop` and local Ctrl-C. Without this, +// deploys kill the process while tunnels are open and host-services see +// TCP-RST'd sockets, triggering their long exponential backoff. Drain +// closes each WS with code 1001 ("Going Away") so hosts reconnect promptly. let draining = false; -process.on("SIGTERM", async () => { +const handleDrain = async (signal: string) => { if (draining) return; draining = true; - console.log("[relay] SIGTERM received, draining tunnels"); + console.log(`[relay] ${signal} received, draining tunnels`); try { await tunnelManager.drain(); } catch (err) { console.error("[relay] drain failed", err); } process.exit(0); -}); +}; +process.on("SIGINT", () => void handleDrain("SIGINT")); +process.on("SIGTERM", () => void handleDrain("SIGTERM")); app.use("*", redactingLogger); app.use("*", cors()); From 96bae9bc2d61503b16ef437fa271e2c5b1bf1797 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 21:30:30 -0700 Subject: [PATCH 3/9] chore(relay): add deploy-staging.sh for parametrized staging deploys --- apps/relay/scripts/deploy-staging.sh | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100755 apps/relay/scripts/deploy-staging.sh diff --git a/apps/relay/scripts/deploy-staging.sh b/apps/relay/scripts/deploy-staging.sh new file mode 100755 index 00000000000..8847d5b61a4 --- /dev/null +++ b/apps/relay/scripts/deploy-staging.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Deploy the relay to the staging Fly app (superset-relay-staging). Mirrors +# deploy.sh but targets fly.staging.toml + the staging app, so we can iterate +# on multi-region without risking prod. Edit REGIONS to grow the fleet. +set -euo pipefail + +APP=superset-relay-staging +REGIONS=(sjc iad fra) +COUNT=${#REGIONS[@]} +REGION_LIST=$(IFS=, ; echo "${REGIONS[*]}") + +cd "$(git rev-parse --show-toplevel)" + +echo "==> fly deploy ($APP)" +fly deploy \ + --config apps/relay/fly.staging.toml \ + --dockerfile apps/relay/Dockerfile \ + --app "$APP" \ + . + +echo "==> fly scale count: $COUNT machines, 1 per region across $REGION_LIST" +fly scale count "app=$COUNT" \ + --region "$REGION_LIST" \ + --max-per-region 1 \ + --app "$APP" \ + --yes + +echo "==> Status" +fly status --app "$APP" From 366f61233870acfdd35c736ce105bc0758d77553 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 21:43:19 -0700 Subject: [PATCH 4/9] fix(relay): clear self-owned directory entries on startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Game-day #14 exposed a ~2min user-visible glitch when a relay machine gets killed and auto-restarted by Fly. The killed process never gets to unregister its tunnels (SIGKILL leaves no opportunity), so the directory keeps entries pointing at "{region}:{machineId}". Fly auto-restarts the same machineId, but the new process has no in-memory tunnels — meanwhile, cross-region requests still fly-replay traffic at that stale machineId and the reconciler reports `is_online=yes` based on the stale directory. Fix: on startup, scan the OWNER hash for entries matching our own encoded owner string and delete them via the existing compare-and- delete `unregister` helper. Runs synchronously before serve() via top-level await — the relay won't accept connections until cleanup completes, so we can't race a fresh registration. Verified on staging: kill SJC machine → 6s VM cycle → "cleared 2 stale directory entries on startup" log → sandbox re-registered 13s after kill (down from ~2min recovery without this fix). --- apps/relay/src/directory.ts | 22 ++++++++++++++++++++++ apps/relay/src/index.ts | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/apps/relay/src/directory.ts b/apps/relay/src/directory.ts index 4e780eb250c..8e8e79525e4 100644 --- a/apps/relay/src/directory.ts +++ b/apps/relay/src/directory.ts @@ -131,6 +131,28 @@ end return removed `; +// Called on relay startup. Removes any directory entries the prior process +// generation left behind (SIGKILL / crash / drain race) before we begin +// accepting connections. The owner value includes the Fly machineId, which +// stays the same across restarts of a given VM — so any pre-existing entry +// with our owner string is necessarily stale. +export async function clearStaleEntriesForMachine( + region: string, + machineId: string, +): Promise { + const myOwner = encodeOwner(region, machineId); + const allOwners = + (await redis.hgetall>(OWNER_KEY)) ?? {}; + let cleared = 0; + for (const [hostId, owner] of Object.entries(allOwners)) { + if (owner === myOwner) { + await unregister(hostId, region, machineId); + cleared++; + } + } + return cleared; +} + export async function sweepStale(): Promise { const now = Date.now(); const result = await redis.eval( diff --git a/apps/relay/src/index.ts b/apps/relay/src/index.ts index 54d24a0601c..8f57afdd18e 100644 --- a/apps/relay/src/index.ts +++ b/apps/relay/src/index.ts @@ -340,6 +340,24 @@ if (env.RELAY_SYNTHETIC_JWT) { // ── Start ─────────────────────────────────────────────────────────── +// Clear any directory entries our previous process generation left behind +// (SIGKILL, drain race, etc.) before we begin accepting connections, so +// fly-replay doesn't route cross-region requests at us for tunnels we no +// longer have. Best-effort: relay still boots if Upstash is unreachable. +try { + const cleared = await directory.clearStaleEntriesForMachine( + env.FLY_REGION, + env.FLY_MACHINE_ID, + ); + if (cleared > 0) { + console.log( + `[relay] cleared ${cleared} stale directory entries on startup`, + ); + } +} catch (err) { + console.error("[relay] startup cleanup failed", err); +} + const server = serve({ fetch: app.fetch, port: env.RELAY_PORT }, (info) => { console.log( `[relay] listening on http://localhost:${info.port} (region=${env.FLY_REGION} machine=${env.FLY_MACHINE_ID})`, From 9771b371eef41d1837bc27d53154a2a6a0d397fe Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 21:48:36 -0700 Subject: [PATCH 5/9] fix(host-service): lower reconnect MAX_MS from 30s to 5s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Game-day #15 (WS storm) showed daemons stranded for minutes after consecutive disconnects. With the previous 30s cap, retries land 15-30s apart in the saturated tail of the exponential curve, and the post-recovery retry frequently misses the relay-back-online window. 5s cap means slightly more retry traffic under prolonged outages but ensures hosts reconnect promptly when the relay is back. Pairs with the 1001-reset (clean close → reset attempts) to keep deploy-time disconnects to a few seconds. --- packages/host-service/src/tunnel/tunnel-client.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/host-service/src/tunnel/tunnel-client.ts b/packages/host-service/src/tunnel/tunnel-client.ts index 278f2091773..c920ad7bfc7 100644 --- a/packages/host-service/src/tunnel/tunnel-client.ts +++ b/packages/host-service/src/tunnel/tunnel-client.ts @@ -8,7 +8,10 @@ import type { } from "./types"; const RECONNECT_BASE_MS = 1_000; -const RECONNECT_MAX_MS = 30_000; +// 5s ceiling rather than 30s. Under a sustained outage this means slightly +// more retry traffic, but under transient relay restarts (the common case) +// it ensures we don't sit idle for 30s while the relay is back online. +const RECONNECT_MAX_MS = 5_000; const INBOUND_SILENCE_TIMEOUT_MS = 75_000; const WATCHDOG_INTERVAL_MS = 10_000; const CONNECT_TIMEOUT_MS = 20_000; From 214adb0092dd95fb30949cd0f324483f20d7417e Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 22:36:13 -0700 Subject: [PATCH 6/9] =?UTF-8?q?fix(relay):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20close-code,=20accept-during-drain,=20batch=20cleanu?= =?UTF-8?q?p?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four fixes from PR review on #4594: 1. Drain uses app-defined close code 4001 instead of 1001. The ping-timeout path in tunnel.ts already uses 1001, and the host-side reset-on-clean-close would have fired for those too — risking a thundering-herd reconnect storm under a mass ping-timeout event. Now the host only resets on 4001 (deliberate deploy drain), and 1001 keeps its RFC-conformant "going away" semantics elsewhere. (Greptile P1) 2. Stop accepting new TCP connections during drain. Call server.close() before tunnelManager.drain() so a host that reconnects during the 250ms flush window doesn't open a tunnel that immediately dies on process.exit. (Greptile P2) 3. Startup directory cleanup batched into a single Lua EVAL. Previous loop did one Redis round-trip per stale entry — at prod scale (~500 tunnels per machine) that would have eaten 25s of startup time on typical Upstash latency. Single EVAL keeps cleanup at one RTT regardless of directory size. (CodeRabbit Major) 4. Update host-side comment from "30s retry mode" to "5s ceiling" (matches the MAX_MS=5s change earlier in this PR). (CodeRabbit Minor) --- apps/relay/src/directory.ts | 38 ++++++++++++++----- apps/relay/src/index.ts | 12 ++++-- apps/relay/src/tunnel.ts | 19 +++++++--- .../host-service/src/tunnel/tunnel-client.ts | 11 ++++-- 4 files changed, 57 insertions(+), 23 deletions(-) diff --git a/apps/relay/src/directory.ts b/apps/relay/src/directory.ts index 8e8e79525e4..25081cbcd2f 100644 --- a/apps/relay/src/directory.ts +++ b/apps/relay/src/directory.ts @@ -136,21 +136,39 @@ return removed // accepting connections. The owner value includes the Fly machineId, which // stays the same across restarts of a given VM — so any pre-existing entry // with our owner string is necessarily stale. +// +// Batched as a single Lua eval so startup time stays bounded at one Redis +// round-trip regardless of how many tunnels the previous generation owned; +// per-host serial unregister calls would scale with directory size and eat +// directly into deploy recovery. +const CLEAR_STALE_SCRIPT = ` +local owner = ARGV[1] +local entries = redis.call('HGETALL', KEYS[1]) +local cleared = 0 +for i = 1, #entries, 2 do + local hostId = entries[i] + local current = entries[i + 1] + if current == owner then + redis.call('HDEL', KEYS[1], hostId) + redis.call('HDEL', KEYS[2], hostId) + redis.call('ZREM', KEYS[3], hostId) + cleared = cleared + 1 + end +end +return cleared +`; + export async function clearStaleEntriesForMachine( region: string, machineId: string, ): Promise { const myOwner = encodeOwner(region, machineId); - const allOwners = - (await redis.hgetall>(OWNER_KEY)) ?? {}; - let cleared = 0; - for (const [hostId, owner] of Object.entries(allOwners)) { - if (owner === myOwner) { - await unregister(hostId, region, machineId); - cleared++; - } - } - return cleared; + const result = await redis.eval( + CLEAR_STALE_SCRIPT, + [OWNER_KEY, META_KEY, TTL_KEY], + [myOwner], + ); + return typeof result === "number" ? result : 0; } export async function sweepStale(): Promise { diff --git a/apps/relay/src/index.ts b/apps/relay/src/index.ts index 8f57afdd18e..897c5226a35 100644 --- a/apps/relay/src/index.ts +++ b/apps/relay/src/index.ts @@ -51,14 +51,20 @@ const { injectWebSocket, upgradeWebSocket } = createNodeWebSocket({ app }); // SIGTERM) to the main process during rolling deploys; listen on both to // also cover hand-rolled `fly machine stop` and local Ctrl-C. Without this, // deploys kill the process while tunnels are open and host-services see -// TCP-RST'd sockets, triggering their long exponential backoff. Drain -// closes each WS with code 1001 ("Going Away") so hosts reconnect promptly. +// TCP-RST'd sockets, triggering their long exponential backoff. +// +// Sequence: stop accepting new TCP connections (server.close), then close +// every open tunnel with the app-defined drain code so hosts reconnect +// promptly. server is assigned at the bottom of this file — by signal +// time, the closure has it. +let server: ReturnType | null = null; let draining = false; const handleDrain = async (signal: string) => { if (draining) return; draining = true; console.log(`[relay] ${signal} received, draining tunnels`); try { + server?.close(); await tunnelManager.drain(); } catch (err) { console.error("[relay] drain failed", err); @@ -358,7 +364,7 @@ try { console.error("[relay] startup cleanup failed", err); } -const server = serve({ fetch: app.fetch, port: env.RELAY_PORT }, (info) => { +server = serve({ fetch: app.fetch, port: env.RELAY_PORT }, (info) => { console.log( `[relay] listening on http://localhost:${info.port} (region=${env.FLY_REGION} machine=${env.FLY_MACHINE_ID})`, ); diff --git a/apps/relay/src/tunnel.ts b/apps/relay/src/tunnel.ts index c16db92585f..fee9396eaab 100644 --- a/apps/relay/src/tunnel.ts +++ b/apps/relay/src/tunnel.ts @@ -159,16 +159,23 @@ export class TunnelManager { console.log(`[relay] tunnel unregistered: ${hostId}`); } - // SIGTERM-driven graceful drain. Closes every open tunnel with WS code - // 1001 ("Going Away") so the host-service can recognize this as a deploy - // drain (not a hard disconnect) and reconnect immediately, instead of - // entering exponential backoff against TCP-RST'd sockets. Used by the - // SIGTERM handler in index.ts. + // Application-defined WS close code (4xxx range) signaling "relay is + // draining for a deploy — reconnect immediately." Distinct from 1001 + // ("Going Away") which the ping-timeout / dispose paths use; the host + // resets its backoff only on this code, so a mass ping-timeout doesn't + // trigger a thundering-herd of fast reconnects. + static readonly WS_CLOSE_DRAIN = 4001; + + // SIGTERM-driven graceful drain. Closes every open tunnel with the + // app-defined "drain" close code so the host-service can recognize this + // as a deploy drain (not a hard disconnect or ping timeout) and + // reconnect immediately. Called from the SIGINT/SIGTERM handler in + // index.ts. async drain(reason = "Server draining for deploy"): Promise { console.log(`[relay] draining ${this.tunnels.size} tunnels`); for (const tunnel of this.tunnels.values()) { try { - tunnel.ws.close(1001, reason); + tunnel.ws.close(TunnelManager.WS_CLOSE_DRAIN, reason); } catch { // already closed } diff --git a/packages/host-service/src/tunnel/tunnel-client.ts b/packages/host-service/src/tunnel/tunnel-client.ts index c920ad7bfc7..4e4bb10bd16 100644 --- a/packages/host-service/src/tunnel/tunnel-client.ts +++ b/packages/host-service/src/tunnel/tunnel-client.ts @@ -133,10 +133,13 @@ export class TunnelClient { `[host-service:tunnel] relay rejected connection (code=${event.code}, reason=${event.reason ?? ""}); retrying`, ); } - // Clean close from the relay (1001 = "Going Away", typically a - // deploy drain). Reset backoff so we don't sit in 30s retry - // mode while the relay is restarting in <2s. - if (event.code === 1001) { + // App-defined "relay draining for deploy" close code + // (4001). Distinct from 1001 ("Going Away") which the + // ping-timeout / dispose paths use — only reset on 4001 so + // a mass ping-timeout doesn't trigger a thundering-herd of + // instant reconnects. After reset, next attempt fires at + // the base delay instead of the 5s ceiling. + if (event.code === 4001) { this.reconnectAttempts = 0; console.log( "[host-service:tunnel] relay draining; reconnecting immediately", From ae630c4aeb534544e3c8576d4781789a3bf9c6a7 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 22:43:08 -0700 Subject: [PATCH 7/9] chore(relay): post-deploy smoke test across all regions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds smoke-test.sh that pings /health on every configured region via `fly-prefer-region` and verifies the response body's `region` field matches the requested region. Wired into both deploy.sh and deploy-staging.sh so deploys halt if any region failed to come up. Catches partial-deploy failures, missing regions, and machines that booted but aren't actually serving — exactly the multi-region failure modes we couldn't detect with the existing single-region setup. Verified manually against superset-relay-staging.fly.dev (sjc, iad, fra) — all 3 returned correct region in /health response. --- apps/relay/scripts/deploy-staging.sh | 3 ++ apps/relay/scripts/deploy.sh | 3 ++ apps/relay/scripts/smoke-test.sh | 45 ++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100755 apps/relay/scripts/smoke-test.sh diff --git a/apps/relay/scripts/deploy-staging.sh b/apps/relay/scripts/deploy-staging.sh index 8847d5b61a4..76a39e282cc 100755 --- a/apps/relay/scripts/deploy-staging.sh +++ b/apps/relay/scripts/deploy-staging.sh @@ -27,3 +27,6 @@ fly scale count "app=$COUNT" \ echo "==> Status" fly status --app "$APP" + +echo "==> Smoke test" +"$(dirname "$0")/smoke-test.sh" "${APP}.fly.dev" "${REGIONS[@]}" diff --git a/apps/relay/scripts/deploy.sh b/apps/relay/scripts/deploy.sh index c7e72da9c5b..483f39921ad 100755 --- a/apps/relay/scripts/deploy.sh +++ b/apps/relay/scripts/deploy.sh @@ -24,3 +24,6 @@ fly scale count "app=$COUNT" \ echo "==> Status" fly status --app "$APP" + +echo "==> Smoke test" +"$(dirname "$0")/smoke-test.sh" "${APP}.fly.dev" "${REGIONS[@]}" diff --git a/apps/relay/scripts/smoke-test.sh b/apps/relay/scripts/smoke-test.sh new file mode 100755 index 00000000000..6b1c6abdc07 --- /dev/null +++ b/apps/relay/scripts/smoke-test.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Post-deploy smoke test for the relay. Hits /health on the public hostname +# with `fly-prefer-region` for every region the fleet is supposed to span, +# then verifies the response 200s with `region` matching the requested +# region — catches partial-deploy failures, missing regions, and machines +# that booted but aren't actually serving. +# +# Usage: smoke-test.sh [ ...] +# smoke-test.sh superset-relay-staging.fly.dev sjc iad fra +# +# Exits non-zero on any failure so callers (deploy.sh, deploy-staging.sh) +# can halt the pipeline. +set -euo pipefail + +if [ $# -lt 2 ]; then + echo "usage: $0 [ ...]" >&2 + exit 64 +fi + +HOSTNAME="$1" +shift +REGIONS=("$@") + +fail=0 +for region in "${REGIONS[@]}"; do + printf " %-4s " "$region" + body=$(curl -sS --max-time 8 -H "fly-prefer-region: $region" "https://${HOSTNAME}/health" 2>&1) || { + printf " ✗ curl failed: %s\n" "$body" + fail=$((fail + 1)) + continue + } + got=$(printf "%s" "$body" | sed -nE 's/.*"region":"([^"]+)".*/\1/p') + if [ "$got" = "$region" ]; then + printf " ✓ %s\n" "$body" + else + printf " ✗ wanted region=%s, got=%s: %s\n" "$region" "$got" "$body" + fail=$((fail + 1)) + fi +done + +if [ "$fail" -ne 0 ]; then + echo "==> smoke test FAILED: $fail region(s) did not respond as expected" >&2 + exit 1 +fi +echo "==> smoke test OK across ${#REGIONS[@]} region(s)" From 3aee590af34c72449afc452b087e80a74513d079 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 14 May 2026 23:38:41 -0700 Subject: [PATCH 8/9] fix(relay): in-band drain signal before WS close on shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Game-day testing on multi-region staging revealed the WS close frame doesn't reliably reach the host within Fly's kill_timeout window. The host's TCP socket stayed ESTABLISHED with no onclose event firing until the 75s inbound-silence watchdog tripped — i.e. ~110s of user-visible outage on every deploy. Send a JSON {type:"drain"} message on the WS data channel first, wait 500ms for it to flush, then close. The message channel already carries ping/pong every 30s, so we know it traverses the path cleanly. Host's onmessage handler resets backoff and triggers an immediate reconnect. The subsequent close() is belt-and-suspenders. Verified end-to-end on staging: host log shows "[host-service:tunnel] relay drain notice received (Server draining for deploy); reconnecting immediately" followed by reconnect within ~900ms. --- apps/relay/src/tunnel.ts | 44 ++++++++++++++++--- .../host-service/src/tunnel/tunnel-client.ts | 18 ++++++++ packages/shared/src/tunnel-protocol.ts | 13 +++++- 3 files changed, 69 insertions(+), 6 deletions(-) diff --git a/apps/relay/src/tunnel.ts b/apps/relay/src/tunnel.ts index fee9396eaab..b2ee294774f 100644 --- a/apps/relay/src/tunnel.ts +++ b/apps/relay/src/tunnel.ts @@ -171,18 +171,52 @@ export class TunnelManager { // as a deploy drain (not a hard disconnect or ping timeout) and // reconnect immediately. Called from the SIGINT/SIGTERM handler in // index.ts. + // + // Waits for each WS's actual close event before resolving (with a + // per-WS ceiling). A naive `ws.close(...) + setTimeout(250)` was not + // enough — under real network conditions the close frame hadn't + // finished its handshake by the time the process called exit(0), so + // the client's TCP socket stayed in ESTABLISHED with no `onclose` + // event. The client then took 110s (its watchdog) to notice the + // dead-but-not-closed connection — which translated into a ~110s + // user-visible outage on every deploy. async drain(reason = "Server draining for deploy"): Promise { - console.log(`[relay] draining ${this.tunnels.size} tunnels`); - for (const tunnel of this.tunnels.values()) { + const tunnels = Array.from(this.tunnels.values()); + console.log(`[relay] draining ${tunnels.length} tunnels`); + // In-band drain signal first: send a JSON {type:"drain"} message on + // the WS message channel before closing. Game-day testing showed + // the WS close frame doesn't reliably reach the host within Fly's + // kill_timeout window (host's TCP socket sees ESTABLISHED with no + // onclose for 75+ seconds, until its watchdog fires). The message + // channel is already exercised by ping/pong every 30s, so we know + // it flushes cleanly. Host's onmessage handler triggers a clean + // reconnect on receipt; the WS close after is just belt-and- + // suspenders. + for (const tunnel of tunnels) { + try { + this.send(tunnel.ws, { type: "drain", reason }); + } catch { + // best-effort + } + } + // Give the message frames a moment to reach the host before we + // start closing the underlying sockets. + await new Promise((resolve) => setTimeout(resolve, 500)); + for (const tunnel of tunnels) { try { tunnel.ws.close(TunnelManager.WS_CLOSE_DRAIN, reason); } catch { // already closed } } - // Give the OS a moment to flush WS close frames before the process - // dies and the underlying TCP connections get RST'd. - await new Promise((resolve) => setTimeout(resolve, 250)); + // Brief tail wait so the close-handshake gets a chance to complete + // before the process exits and RSTs the underlying TCP. + const WS_CLOSED = 3; + const deadline = Date.now() + 1_500; + while (Date.now() < deadline) { + if (tunnels.every((t) => t.ws.readyState === WS_CLOSED)) return; + await new Promise((resolve) => setTimeout(resolve, 50)); + } } private disposeTunnel(tunnel: TunnelState, reason: string): void { diff --git a/packages/host-service/src/tunnel/tunnel-client.ts b/packages/host-service/src/tunnel/tunnel-client.ts index 4e4bb10bd16..a8d16634f91 100644 --- a/packages/host-service/src/tunnel/tunnel-client.ts +++ b/packages/host-service/src/tunnel/tunnel-client.ts @@ -204,6 +204,24 @@ export class TunnelClient { case "ping": this.send({ type: "pong" }); break; + case "drain": + // In-band drain signal from the relay before it + // SIGINT-shuts-down. Reset backoff and tear the socket + // down ourselves so the next reconnect attempt fires at + // the base delay — far more reliable than waiting for + // the WS close frame to arrive (which game-day testing + // showed sometimes doesn't, leaving the host idle until + // its 75s inactivity watchdog). + console.log( + `[host-service:tunnel] relay drain notice received${message.reason ? ` (${message.reason})` : ""}; reconnecting immediately`, + ); + this.reconnectAttempts = 0; + try { + this.socket?.close(); + } catch { + // onclose handler will schedule the reconnect + } + break; case "http": await this.handleHttpRequest(message); break; diff --git a/packages/shared/src/tunnel-protocol.ts b/packages/shared/src/tunnel-protocol.ts index 7bafd07dba7..17e2b50560f 100644 --- a/packages/shared/src/tunnel-protocol.ts +++ b/packages/shared/src/tunnel-protocol.ts @@ -33,12 +33,23 @@ export interface TunnelPing { type: "ping"; } +// In-band drain signal — relay sends this to every tunnel right before +// SIGINT-triggered shutdown so the host knows to reconnect immediately +// rather than waiting for the WS close frame (which doesn't reliably +// reach the host within the kill_timeout window) or the host-side +// inactivity watchdog. +export interface TunnelDrain { + type: "drain"; + reason?: string; +} + export type TunnelRequest = | TunnelHttpRequest | TunnelWsOpen | TunnelWsFrame | TunnelWsClose - | TunnelPing; + | TunnelPing + | TunnelDrain; // ── Host → Relay ──────────────────────────────────────────────────── From 75994e059ad20ae9399c459e6dece1380b53ed9c Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Sat, 16 May 2026 12:25:46 -0700 Subject: [PATCH 9/9] fix(relay): await directory cleanup during drain; reject registers while draining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drain() previously relied on WS onClose → unregister → fire-and-forget directory.unregister for directory cleanup. process.exit(0) runs right after drain() returns, so those async Redis writes routinely didn't land before the process died — leaving the dead machine's directory entries live for ~90s (TTL), during which other relay machines fly-replay traffic to the corpse. Invisible single-region; a real per-deploy misrouting window multi-region. - drain() now awaits an explicit clearDirectory() callback (clearStaleEntriesForMachine — the batch Lua script already used on startup) before the process exits. - A draining flag rejects new tunnel registrations during the drain window, in register() and the /tunnel onOpen handler, re-checked after the async auth calls so a host can't slip in while draining flips. - drain() routes each tunnel through disposeTunnel and removes it from the map, so the trailing onClose callbacks are no-ops rather than racing the explicit directory clear. --- apps/relay/src/index.ts | 26 +++++++++++++++-- apps/relay/src/tunnel.ts | 60 ++++++++++++++++++++++++++++------------ 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/apps/relay/src/index.ts b/apps/relay/src/index.ts index 897c5226a35..9444f59c702 100644 --- a/apps/relay/src/index.ts +++ b/apps/relay/src/index.ts @@ -55,8 +55,9 @@ const { injectWebSocket, upgradeWebSocket } = createNodeWebSocket({ app }); // // Sequence: stop accepting new TCP connections (server.close), then close // every open tunnel with the app-defined drain code so hosts reconnect -// promptly. server is assigned at the bottom of this file — by signal -// time, the closure has it. +// promptly, and clear this machine's directory ownership before process exit. +// server is assigned at the bottom of this file — by signal time, the closure +// has it. let server: ReturnType | null = null; let draining = false; const handleDrain = async (signal: string) => { @@ -65,7 +66,16 @@ const handleDrain = async (signal: string) => { console.log(`[relay] ${signal} received, draining tunnels`); try { server?.close(); - await tunnelManager.drain(); + const cleared = await tunnelManager.drain({ + clearDirectory: () => + directory.clearStaleEntriesForMachine( + env.FLY_REGION, + env.FLY_MACHINE_ID, + ), + }); + if (cleared > 0) { + console.log(`[relay] cleared ${cleared} directory entries during drain`); + } } catch (err) { console.error("[relay] drain failed", err); } @@ -171,6 +181,11 @@ app.get( return { onOpen: async (_event, ws) => { + if (draining) { + ws.close(TunnelManager.WS_CLOSE_DRAIN, "Server draining for deploy"); + return; + } + if (!hostId || !token) { ws.close(1008, "Missing hostId or token"); return; @@ -188,6 +203,11 @@ app.get( return; } + if (draining) { + ws.close(TunnelManager.WS_CLOSE_DRAIN, "Server draining for deploy"); + return; + } + await tunnelManager.register(hostId, token, ws); // register closes ws itself on directory failure; only mark // authorized if the socket is still usable. diff --git a/apps/relay/src/tunnel.ts b/apps/relay/src/tunnel.ts index b2ee294774f..115a33879f3 100644 --- a/apps/relay/src/tunnel.ts +++ b/apps/relay/src/tunnel.ts @@ -33,6 +33,11 @@ interface TunnelState { missedPings: number; } +interface DrainOptions { + reason?: string; + clearDirectory: () => Promise; +} + export class TunnelManager { private readonly tunnels = new Map(); private readonly requestTimeoutMs: number; @@ -42,12 +47,18 @@ export class TunnelManager { ReturnType >(); private readonly onlineWriteVersions = new Map(); + private draining = false; constructor(requestTimeoutMs = 30_000) { this.requestTimeoutMs = requestTimeoutMs; } async register(hostId: string, token: string, ws: WsSocket): Promise { + if (this.draining) { + ws.close(TunnelManager.WS_CLOSE_DRAIN, "Server draining for deploy"); + return; + } + // Last-write-wins: close the old socket so flaky clients don't get // stuck behind a dead-but-not-yet-detected WS. const existing = this.tunnels.get(hostId); @@ -73,12 +84,15 @@ export class TunnelManager { // hadn't returned yet), so it skipped unregister. Roll the directory // entry back ourselves; otherwise other machines fly-replay traffic // to a dead local tunnel for ~90s until the TTL ages out. - if (ws.readyState !== 1) { + if (ws.readyState !== 1 || this.draining) { await directory .unregister(hostId, env.FLY_REGION, env.FLY_MACHINE_ID) .catch((err) => { console.error("[relay] directory.unregister rollback failed", err); }); + if (this.draining) { + ws.close(TunnelManager.WS_CLOSE_DRAIN, "Server draining for deploy"); + } return; } @@ -172,15 +186,12 @@ export class TunnelManager { // reconnect immediately. Called from the SIGINT/SIGTERM handler in // index.ts. // - // Waits for each WS's actual close event before resolving (with a - // per-WS ceiling). A naive `ws.close(...) + setTimeout(250)` was not - // enough — under real network conditions the close frame hadn't - // finished its handshake by the time the process called exit(0), so - // the client's TCP socket stayed in ESTABLISHED with no `onclose` - // event. The client then took 110s (its watchdog) to notice the - // dead-but-not-closed connection — which translated into a ~110s - // user-visible outage on every deploy. - async drain(reason = "Server draining for deploy"): Promise { + // Owns directory cleanup directly instead of relying on websocket close + // callbacks. The process exits immediately after drain, so fire-and-forget + // unregister work from onClose is not a reliable shutdown primitive. + async drain(options: DrainOptions): Promise { + this.draining = true; + const reason = options.reason ?? "Server draining for deploy"; const tunnels = Array.from(this.tunnels.values()); console.log(`[relay] draining ${tunnels.length} tunnels`); // In-band drain signal first: send a JSON {type:"drain"} message on @@ -202,24 +213,37 @@ export class TunnelManager { // Give the message frames a moment to reach the host before we // start closing the underlying sockets. await new Promise((resolve) => setTimeout(resolve, 500)); + + let cleared = 0; + let clearError: unknown; + try { + cleared = await options.clearDirectory(); + } catch (err) { + clearError = err; + } + for (const tunnel of tunnels) { - try { - tunnel.ws.close(TunnelManager.WS_CLOSE_DRAIN, reason); - } catch { - // already closed - } + this.disposeTunnel(tunnel, reason, TunnelManager.WS_CLOSE_DRAIN); + this.tunnels.delete(tunnel.hostId); } + // Brief tail wait so the close-handshake gets a chance to complete // before the process exits and RSTs the underlying TCP. const WS_CLOSED = 3; const deadline = Date.now() + 1_500; while (Date.now() < deadline) { - if (tunnels.every((t) => t.ws.readyState === WS_CLOSED)) return; + if (tunnels.every((t) => t.ws.readyState === WS_CLOSED)) break; await new Promise((resolve) => setTimeout(resolve, 50)); } + if (clearError) throw clearError; + return cleared; } - private disposeTunnel(tunnel: TunnelState, reason: string): void { + private disposeTunnel( + tunnel: TunnelState, + reason: string, + tunnelCloseCode = 1000, + ): void { if (tunnel.pingTimer) clearInterval(tunnel.pingTimer); for (const [, pending] of tunnel.pendingRequests) { @@ -232,7 +256,7 @@ export class TunnelManager { } try { - tunnel.ws.close(1000, reason); + tunnel.ws.close(tunnelCloseCode, reason); } catch { // already closed }