-
Notifications
You must be signed in to change notification settings - Fork 20
fix(reliability): gate boot on schema, surface err, split readiness #767
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,7 @@ import path from 'node:path'; | |
| import { fileURLToPath } from 'node:url'; | ||
| import { getRequestListener } from '@hono/node-server'; | ||
| import { Hono } from 'hono'; | ||
| import { closeDbSingleton, probeListenNotify } from './db/client'; | ||
| import { closeDbSingleton, getDb, probeListenNotify } from './db/client'; | ||
| import { mountViteDev } from './dev-vite'; | ||
| import type { Env } from './index'; | ||
| import { app as mainApp } from './index'; | ||
|
|
@@ -43,6 +43,7 @@ import { assertExternalDepsResolvable } from '../../connector-worker/src/runtime | |
| import { isSentryReported, markSentryReported } from './sentry'; | ||
| import { getEnvFromProcess } from './utils/env'; | ||
| import logger from './utils/logger'; | ||
| import { assertSchemaUpToDate } from './utils/schema-version-check'; | ||
| import { initWorkspaceProvider } from './workspace'; | ||
|
|
||
| // Create a wrapper app that injects environment into each request | ||
|
|
@@ -145,6 +146,19 @@ async function main() { | |
| } | ||
| process.env.DATABASE_URL = databaseUrl; | ||
|
|
||
| // Refuse to boot if the image expects a migration the database hasn't | ||
| // applied. Skippable via SKIP_SCHEMA_VERSION_CHECK=1 for emergency | ||
| // forward-flight (e.g. rolling back to an older image whose migrations | ||
| // dir is a strict prefix of what's already applied). See | ||
| // utils/schema-version-check.ts for the 2026-05-16 incident this guards. | ||
| if (process.env.SKIP_SCHEMA_VERSION_CHECK !== '1') { | ||
| const migrationsDir = | ||
| process.env.LOBU_MIGRATIONS_DIR?.trim() || path.join(PACKAGE_REPO_ROOT, 'db', 'migrations'); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When Useful? React with 👍 / 👎. |
||
| await assertSchemaUpToDate(getDb(), { migrationsDir }); | ||
| } else { | ||
| logger.warn('[schema-check] SKIP_SCHEMA_VERSION_CHECK=1 — skipping boot-time assertion'); | ||
| } | ||
|
|
||
| // Verify LISTEN/NOTIFY actually delivers. This is a *detector*, not a gate: | ||
| // the runs-queue has a 200ms SKIP-LOCKED poll fallback that keeps the queue | ||
| // correct even when LISTEN is silently dropped (transaction-mode pgbouncer, | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,137 @@ | ||||||||||||||||||||||||||
| import { mkdtempSync, rmSync, writeFileSync } from 'node:fs'; | ||||||||||||||||||||||||||
| import { tmpdir } from 'node:os'; | ||||||||||||||||||||||||||
| import path from 'node:path'; | ||||||||||||||||||||||||||
| import type { Sql } from 'postgres'; | ||||||||||||||||||||||||||
| import { afterEach, beforeEach, describe, expect, it } from 'vitest'; | ||||||||||||||||||||||||||
| import type { DbClient } from '../../db/client'; | ||||||||||||||||||||||||||
| import { | ||||||||||||||||||||||||||
| assertSchemaUpToDate, | ||||||||||||||||||||||||||
| compareSchemaVersions, | ||||||||||||||||||||||||||
| readExpectedSchemaVersion, | ||||||||||||||||||||||||||
| } from '../schema-version-check'; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| /** | ||||||||||||||||||||||||||
| * Build a stub DbClient that always returns the given `applied` version when | ||||||||||||||||||||||||||
| * a tagged-template query runs. Just enough surface to satisfy the call site | ||||||||||||||||||||||||||
| * in `assertSchemaUpToDate`. | ||||||||||||||||||||||||||
| */ | ||||||||||||||||||||||||||
| function makeStubDb(applied: string | null): DbClient { | ||||||||||||||||||||||||||
| const fn = ((_strings: TemplateStringsArray, ..._values: unknown[]) => { | ||||||||||||||||||||||||||
| return Object.assign(Promise.resolve([{ version: applied }]), { count: 1 }); | ||||||||||||||||||||||||||
| }) as unknown as Sql; | ||||||||||||||||||||||||||
| return fn as unknown as DbClient; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| describe('readExpectedSchemaVersion', () => { | ||||||||||||||||||||||||||
| let dir: string; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| beforeEach(() => { | ||||||||||||||||||||||||||
| dir = mkdtempSync(path.join(tmpdir(), 'schema-check-')); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| afterEach(() => { | ||||||||||||||||||||||||||
| rmSync(dir, { recursive: true, force: true }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('returns the highest version prefix from migration filenames', () => { | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, '20260512000000_first.sql'), ''); | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, '20260515150000_geo_enrichment.sql'), ''); | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, '20260516200000_events_search_tsv.sql'), ''); | ||||||||||||||||||||||||||
| expect(readExpectedSchemaVersion(dir)).toBe('20260516200000'); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('ignores non-migration files (no dbmate-style prefix)', () => { | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, '20260512000000_real.sql'), ''); | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, 'README.md'), ''); | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, 'rollback.sql'), ''); | ||||||||||||||||||||||||||
| expect(readExpectedSchemaVersion(dir)).toBe('20260512000000'); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('returns null for an unreadable directory (treat as "no expectation")', () => { | ||||||||||||||||||||||||||
| expect(readExpectedSchemaVersion(path.join(dir, 'does-not-exist'))).toBeNull(); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('returns null for an empty directory', () => { | ||||||||||||||||||||||||||
| expect(readExpectedSchemaVersion(dir)).toBeNull(); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| describe('compareSchemaVersions', () => { | ||||||||||||||||||||||||||
| it('returns ok when applied >= expected', () => { | ||||||||||||||||||||||||||
| expect(compareSchemaVersions('20260516200000', '20260516200000')).toEqual({ | ||||||||||||||||||||||||||
| kind: 'ok', | ||||||||||||||||||||||||||
| expected: '20260516200000', | ||||||||||||||||||||||||||
| applied: '20260516200000', | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| expect(compareSchemaVersions('20260516200000', '20260517000000')).toMatchObject({ | ||||||||||||||||||||||||||
| kind: 'ok', | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('returns mismatch when applied is behind expected', () => { | ||||||||||||||||||||||||||
| expect(compareSchemaVersions('20260516200000', '20260516120000')).toEqual({ | ||||||||||||||||||||||||||
| kind: 'mismatch', | ||||||||||||||||||||||||||
| expected: '20260516200000', | ||||||||||||||||||||||||||
| applied: '20260516120000', | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('returns mismatch when no version is applied yet', () => { | ||||||||||||||||||||||||||
| expect(compareSchemaVersions('20260516200000', null)).toEqual({ | ||||||||||||||||||||||||||
| kind: 'mismatch', | ||||||||||||||||||||||||||
| expected: '20260516200000', | ||||||||||||||||||||||||||
| applied: null, | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('returns ok when expected is null (dev fallback / no migrations on disk)', () => { | ||||||||||||||||||||||||||
| expect(compareSchemaVersions(null, null)).toMatchObject({ kind: 'ok' }); | ||||||||||||||||||||||||||
| expect(compareSchemaVersions(null, '20260516200000')).toMatchObject({ kind: 'ok' }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| describe('assertSchemaUpToDate', () => { | ||||||||||||||||||||||||||
| let dir: string; | ||||||||||||||||||||||||||
| const originalNodeEnv = process.env.NODE_ENV; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| beforeEach(() => { | ||||||||||||||||||||||||||
| dir = mkdtempSync(path.join(tmpdir(), 'schema-check-')); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| afterEach(() => { | ||||||||||||||||||||||||||
| rmSync(dir, { recursive: true, force: true }); | ||||||||||||||||||||||||||
| process.env.NODE_ENV = originalNodeEnv; | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
Comment on lines
+101
to
+104
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🌐 Web query:
💡 Result: In Node.js, assigning undefined to a property on process.env does not delete the environment variable; instead, it converts the value to the string "undefined" [1][2][3]. Because process.env implicitly converts assigned values to strings, the expression process.env.NODE_ENV = undefined results in the environment variable being set to the literal string "undefined" [1][4][5]. To correctly remove an environment variable from the process.env object, you must use the delete operator [1][4][5]: delete process.env.NODE_ENV; Using delete removes the property from the process.env object entirely, which is the intended way to unset an environment variable within a Node.js process [2][3][5]. Citations:
Restore Line 103 leaks test state: assigning Proposed fix afterEach(() => {
rmSync(dir, { recursive: true, force: true });
- process.env.NODE_ENV = originalNodeEnv;
+ if (originalNodeEnv === undefined) {
+ delete process.env.NODE_ENV;
+ } else {
+ process.env.NODE_ENV = originalNodeEnv;
+ }
});📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('throws in production when the migrations directory is missing (fail closed)', async () => { | ||||||||||||||||||||||||||
| process.env.NODE_ENV = 'production'; | ||||||||||||||||||||||||||
| const missingDir = path.join(dir, 'does-not-exist'); | ||||||||||||||||||||||||||
| await expect( | ||||||||||||||||||||||||||
| assertSchemaUpToDate(makeStubDb('20260516200000'), { migrationsDir: missingDir }) | ||||||||||||||||||||||||||
| ).rejects.toThrow(/missing db\/migrations/i); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('passes in development when the migrations directory is missing (fail open for dev)', async () => { | ||||||||||||||||||||||||||
| process.env.NODE_ENV = 'development'; | ||||||||||||||||||||||||||
| const missingDir = path.join(dir, 'does-not-exist'); | ||||||||||||||||||||||||||
| await expect( | ||||||||||||||||||||||||||
| assertSchemaUpToDate(makeStubDb(null), { migrationsDir: missingDir }) | ||||||||||||||||||||||||||
| ).resolves.toBeUndefined(); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('throws when the database is behind the migrations directory', async () => { | ||||||||||||||||||||||||||
| process.env.NODE_ENV = 'production'; | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, '20260516200000_events_search_tsv.sql'), ''); | ||||||||||||||||||||||||||
| await expect( | ||||||||||||||||||||||||||
| assertSchemaUpToDate(makeStubDb('20260515120000'), { migrationsDir: dir }) | ||||||||||||||||||||||||||
| ).rejects.toThrow(/database is behind/i); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| it('passes when the database is at the expected version', async () => { | ||||||||||||||||||||||||||
| process.env.NODE_ENV = 'production'; | ||||||||||||||||||||||||||
| writeFileSync(path.join(dir, '20260516200000_events_search_tsv.sql'), ''); | ||||||||||||||||||||||||||
| await expect( | ||||||||||||||||||||||||||
| assertSchemaUpToDate(makeStubDb('20260516200000'), { migrationsDir: dir }) | ||||||||||||||||||||||||||
| ).resolves.toBeUndefined(); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| /** | ||
| * Boot-time schema-version assertion. | ||
| * | ||
| * Compares the highest migration version present in `db/migrations/` (what | ||
| * this image expects to run against) with the highest version recorded in | ||
| * the database's `schema_migrations` table. If the database is behind, | ||
| * throws — the server boot path catches it, logs, and exits non-zero so the | ||
| * pod fails readiness and Kubernetes refuses to route traffic. | ||
| * | ||
| * Why: on 2026-05-16 the pre-upgrade migration Job for | ||
| * `20260516200000_events_search_tsv.sql` timed out at 60s (table rewrite on | ||
| * a 1.15M-row events table under ACCESS EXCLUSIVE > statement_timeout). The | ||
| * Job exited non-zero, but the app Deployment rolled forward anyway with an | ||
| * image that queried the new view's `search_tsv` column. Every request | ||
| * through the affected paths threw; the pod OOM'd and CrashLoopBackOff'd. | ||
| * A boot-time gate would have refused to start that image and kept the | ||
| * previous version serving traffic. | ||
| */ | ||
|
|
||
| import { readdirSync } from 'node:fs'; | ||
| import type { DbClient } from '../db/client'; | ||
| import logger from './logger'; | ||
|
|
||
| const MIGRATION_FILENAME_RE = /^(\d+)_[^/]+\.sql$/; | ||
|
|
||
| /** | ||
| * Find the largest version prefix (e.g. `20260516200000`) across files in | ||
| * `migrationsDir`. Returns null if the directory is empty / unreadable — | ||
| * that case is treated as "no expectation" rather than failing closed, so a | ||
| * dev environment without migrations checked out doesn't deadlock boot. | ||
| */ | ||
| export function readExpectedSchemaVersion(migrationsDir: string): string | null { | ||
| let entries: string[]; | ||
| try { | ||
| entries = readdirSync(migrationsDir); | ||
| } catch (err) { | ||
| logger.warn( | ||
| { err, migrationsDir }, | ||
| '[schema-check] migrations directory not readable — skipping schema-version assertion' | ||
| ); | ||
| return null; | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| let max: string | null = null; | ||
| for (const name of entries) { | ||
| const match = MIGRATION_FILENAME_RE.exec(name); | ||
| if (!match) continue; | ||
| const version = match[1]; | ||
| if (max === null || version > max) max = version; | ||
| } | ||
| return max; | ||
| } | ||
|
|
||
| /** | ||
| * Query the highest applied version from the database. Returns null if the | ||
| * `schema_migrations` table is empty (fresh install) — the caller decides | ||
| * whether that's expected. | ||
| */ | ||
| export async function readAppliedSchemaVersion(sql: DbClient): Promise<string | null> { | ||
| const rows = (await sql`SELECT MAX(version) AS version FROM public.schema_migrations`) as Array<{ | ||
| version: string | null; | ||
| }>; | ||
| return rows[0]?.version ?? null; | ||
| } | ||
|
|
||
| export interface SchemaVersionMismatch { | ||
| kind: 'mismatch'; | ||
| expected: string; | ||
| applied: string | null; | ||
| } | ||
|
|
||
| export interface SchemaVersionOk { | ||
| kind: 'ok'; | ||
| expected: string | null; | ||
| applied: string | null; | ||
| } | ||
|
|
||
| /** | ||
| * Compare expected (from disk) vs applied (from DB). Returns a discriminated | ||
| * union the caller can branch on, instead of throwing — keeps tests cheap. | ||
| */ | ||
| export function compareSchemaVersions( | ||
| expected: string | null, | ||
| applied: string | null | ||
| ): SchemaVersionOk | SchemaVersionMismatch { | ||
| if (expected === null) return { kind: 'ok', expected, applied }; | ||
| if (applied !== null && applied >= expected) { | ||
| return { kind: 'ok', expected, applied }; | ||
| } | ||
| return { kind: 'mismatch', expected, applied }; | ||
| } | ||
|
|
||
| /** | ||
| * Boot-time assertion: throws if the database is behind the image's | ||
| * migrations directory. Call once during server startup, before opening the | ||
| * HTTP listener. | ||
| * | ||
| * Fails closed in production: if `migrationsDir` can't be listed (bad path, | ||
| * missing copy in the image, wrong volume mount), `NODE_ENV=production` | ||
| * treats that as a deployment defect and throws. In dev a missing/empty | ||
| * directory degrades to a warning — so `bun run dev` from a worktree that | ||
| * doesn't have `db/` checked out still boots. | ||
| */ | ||
| export async function assertSchemaUpToDate( | ||
| sql: DbClient, | ||
| options: { migrationsDir: string } | ||
| ): Promise<void> { | ||
| const expected = readExpectedSchemaVersion(options.migrationsDir); | ||
|
|
||
| if (expected === null && process.env.NODE_ENV === 'production') { | ||
| const msg = | ||
| `[schema-check] migrations directory ${options.migrationsDir} is empty or unreadable in a ` + | ||
| `production build — the image is missing db/migrations. Refusing to start.`; | ||
| logger.error({ migrationsDir: options.migrationsDir }, msg); | ||
| throw new Error(msg); | ||
| } | ||
|
|
||
| const applied = await readAppliedSchemaVersion(sql); | ||
| const result = compareSchemaVersions(expected, applied); | ||
|
|
||
| if (result.kind === 'mismatch') { | ||
| const msg = | ||
| `[schema-check] database is behind the image. Expected migration ${result.expected} ` + | ||
| `to be applied, but the highest applied version is ${result.applied ?? '(none)'}. ` + | ||
| `Run \`dbmate up\` against this database before rolling out this image.`; | ||
| logger.error( | ||
| { expected: result.expected, applied: result.applied, migrationsDir: options.migrationsDir }, | ||
| msg | ||
| ); | ||
| throw new Error(msg); | ||
| } | ||
|
|
||
| logger.info( | ||
| { expected: result.expected, applied: result.applied }, | ||
| '[schema-check] schema version up to date' | ||
| ); | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don’t return raw DB errors from readiness responses.
Line 415 exposes internal database error details from a public unauthenticated route. Return a generic readiness payload and log the error server-side instead.
Proposed fix
app.get('/health/ready', async (c) => { try { const sql = getDb(); await sql`SELECT 1`; return c.json({ status: 'ok', service: 'lobu-api' }); } catch (error) { + logger.warn({ err: error }, '[health/ready] database check failed'); return c.json( - { status: 'unready', service: 'lobu-api', error: errorMessage(error) }, + { status: 'unready', service: 'lobu-api' }, 503 ); } });🤖 Prompt for AI Agents