diff --git a/charts/lobu/templates/deployment.yaml b/charts/lobu/templates/deployment.yaml index 314386cff..d18437dea 100644 --- a/charts/lobu/templates/deployment.yaml +++ b/charts/lobu/templates/deployment.yaml @@ -94,9 +94,14 @@ spec: - secretRef: name: {{ $secretName }} {{- end }} + # Readiness probes the DB too (/health/ready does SELECT 1). + # Failing readiness pulls the pod out of the Service endpoint set + # without restarting it — right semantic for a transient DB blip. + # Liveness stays on /health (process-up only) so a pooler hiccup + # doesn't trigger CrashLoopBackOff. readinessProbe: httpGet: - path: /health + path: /health/ready port: http initialDelaySeconds: {{ .Values.healthCheck.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.healthCheck.readinessProbe.periodSeconds }} diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 77e006ac1..3b6d37270 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -387,7 +387,9 @@ app.use('/*', async (c, next) => { }); /** - * Health check endpoint + * Liveness probe — process is up. Cheap, dependency-free; failing this + * signals "restart the pod." Don't add DB or downstream checks here, or a + * transient pooler hiccup will cause a CrashLoop. */ app.get('/health', (c) => { return c.json({ @@ -398,6 +400,24 @@ app.get('/health', (c) => { }); }); +/** + * Readiness probe — process is up AND can talk to the database. Failing + * this drops the pod from the Service's endpoint set without restarting + * it, which is the right semantic for transient DB unavailability. + */ +app.get('/health/ready', async (c) => { + try { + const sql = getDb(); + await sql`SELECT 1`; + return c.json({ status: 'ok', service: 'lobu-api' }); + } catch (error) { + return c.json( + { status: 'unready', service: 'lobu-api', error: errorMessage(error) }, + 503 + ); + } +}); + /** * Scheduler health check endpoint * Returns detailed metrics about the feed scheduling system diff --git a/packages/server/src/server.ts b/packages/server/src/server.ts index 8cbf7c68e..92291e314 100644 --- a/packages/server/src/server.ts +++ b/packages/server/src/server.ts @@ -28,7 +28,7 @@ import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { getRequestListener } from '@hono/node-server'; import { Hono } from 'hono'; -import { closeDbSingleton, probeListenNotify } from './db/client'; +import { closeDbSingleton, getDb, probeListenNotify } from './db/client'; import { mountViteDev } from './dev-vite'; import type { Env } from './index'; import { app as mainApp } from './index'; @@ -43,6 +43,7 @@ import { assertExternalDepsResolvable } from '../../connector-worker/src/runtime import { isSentryReported, markSentryReported } from './sentry'; import { getEnvFromProcess } from './utils/env'; import logger from './utils/logger'; +import { assertSchemaUpToDate } from './utils/schema-version-check'; import { initWorkspaceProvider } from './workspace'; // Create a wrapper app that injects environment into each request @@ -145,6 +146,19 @@ async function main() { } process.env.DATABASE_URL = databaseUrl; + // Refuse to boot if the image expects a migration the database hasn't + // applied. Skippable via SKIP_SCHEMA_VERSION_CHECK=1 for emergency + // forward-flight (e.g. rolling back to an older image whose migrations + // dir is a strict prefix of what's already applied). See + // utils/schema-version-check.ts for the 2026-05-16 incident this guards. + if (process.env.SKIP_SCHEMA_VERSION_CHECK !== '1') { + const migrationsDir = + process.env.LOBU_MIGRATIONS_DIR?.trim() || path.join(PACKAGE_REPO_ROOT, 'db', 'migrations'); + await assertSchemaUpToDate(getDb(), { migrationsDir }); + } else { + logger.warn('[schema-check] SKIP_SCHEMA_VERSION_CHECK=1 — skipping boot-time assertion'); + } + // Verify LISTEN/NOTIFY actually delivers. This is a *detector*, not a gate: // the runs-queue has a 200ms SKIP-LOCKED poll fallback that keeps the queue // correct even when LISTEN is silently dropped (transaction-mode pgbouncer, diff --git a/packages/server/src/utils/__tests__/schema-version-check.test.ts b/packages/server/src/utils/__tests__/schema-version-check.test.ts new file mode 100644 index 000000000..f33f58c07 --- /dev/null +++ b/packages/server/src/utils/__tests__/schema-version-check.test.ts @@ -0,0 +1,137 @@ +import { mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import type { Sql } from 'postgres'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import type { DbClient } from '../../db/client'; +import { + assertSchemaUpToDate, + compareSchemaVersions, + readExpectedSchemaVersion, +} from '../schema-version-check'; + +/** + * Build a stub DbClient that always returns the given `applied` version when + * a tagged-template query runs. Just enough surface to satisfy the call site + * in `assertSchemaUpToDate`. + */ +function makeStubDb(applied: string | null): DbClient { + const fn = ((_strings: TemplateStringsArray, ..._values: unknown[]) => { + return Object.assign(Promise.resolve([{ version: applied }]), { count: 1 }); + }) as unknown as Sql; + return fn as unknown as DbClient; +} + +describe('readExpectedSchemaVersion', () => { + let dir: string; + + beforeEach(() => { + dir = mkdtempSync(path.join(tmpdir(), 'schema-check-')); + }); + + afterEach(() => { + rmSync(dir, { recursive: true, force: true }); + }); + + it('returns the highest version prefix from migration filenames', () => { + writeFileSync(path.join(dir, '20260512000000_first.sql'), ''); + writeFileSync(path.join(dir, '20260515150000_geo_enrichment.sql'), ''); + writeFileSync(path.join(dir, '20260516200000_events_search_tsv.sql'), ''); + expect(readExpectedSchemaVersion(dir)).toBe('20260516200000'); + }); + + it('ignores non-migration files (no dbmate-style prefix)', () => { + writeFileSync(path.join(dir, '20260512000000_real.sql'), ''); + writeFileSync(path.join(dir, 'README.md'), ''); + writeFileSync(path.join(dir, 'rollback.sql'), ''); + expect(readExpectedSchemaVersion(dir)).toBe('20260512000000'); + }); + + it('returns null for an unreadable directory (treat as "no expectation")', () => { + expect(readExpectedSchemaVersion(path.join(dir, 'does-not-exist'))).toBeNull(); + }); + + it('returns null for an empty directory', () => { + expect(readExpectedSchemaVersion(dir)).toBeNull(); + }); +}); + +describe('compareSchemaVersions', () => { + it('returns ok when applied >= expected', () => { + expect(compareSchemaVersions('20260516200000', '20260516200000')).toEqual({ + kind: 'ok', + expected: '20260516200000', + applied: '20260516200000', + }); + expect(compareSchemaVersions('20260516200000', '20260517000000')).toMatchObject({ + kind: 'ok', + }); + }); + + it('returns mismatch when applied is behind expected', () => { + expect(compareSchemaVersions('20260516200000', '20260516120000')).toEqual({ + kind: 'mismatch', + expected: '20260516200000', + applied: '20260516120000', + }); + }); + + it('returns mismatch when no version is applied yet', () => { + expect(compareSchemaVersions('20260516200000', null)).toEqual({ + kind: 'mismatch', + expected: '20260516200000', + applied: null, + }); + }); + + it('returns ok when expected is null (dev fallback / no migrations on disk)', () => { + expect(compareSchemaVersions(null, null)).toMatchObject({ kind: 'ok' }); + expect(compareSchemaVersions(null, '20260516200000')).toMatchObject({ kind: 'ok' }); + }); +}); + +describe('assertSchemaUpToDate', () => { + let dir: string; + const originalNodeEnv = process.env.NODE_ENV; + + beforeEach(() => { + dir = mkdtempSync(path.join(tmpdir(), 'schema-check-')); + }); + + afterEach(() => { + rmSync(dir, { recursive: true, force: true }); + process.env.NODE_ENV = originalNodeEnv; + }); + + it('throws in production when the migrations directory is missing (fail closed)', async () => { + process.env.NODE_ENV = 'production'; + const missingDir = path.join(dir, 'does-not-exist'); + await expect( + assertSchemaUpToDate(makeStubDb('20260516200000'), { migrationsDir: missingDir }) + ).rejects.toThrow(/missing db\/migrations/i); + }); + + it('passes in development when the migrations directory is missing (fail open for dev)', async () => { + process.env.NODE_ENV = 'development'; + const missingDir = path.join(dir, 'does-not-exist'); + await expect( + assertSchemaUpToDate(makeStubDb(null), { migrationsDir: missingDir }) + ).resolves.toBeUndefined(); + }); + + it('throws when the database is behind the migrations directory', async () => { + process.env.NODE_ENV = 'production'; + writeFileSync(path.join(dir, '20260516200000_events_search_tsv.sql'), ''); + await expect( + assertSchemaUpToDate(makeStubDb('20260515120000'), { migrationsDir: dir }) + ).rejects.toThrow(/database is behind/i); + }); + + it('passes when the database is at the expected version', async () => { + process.env.NODE_ENV = 'production'; + writeFileSync(path.join(dir, '20260516200000_events_search_tsv.sql'), ''); + await expect( + assertSchemaUpToDate(makeStubDb('20260516200000'), { migrationsDir: dir }) + ).resolves.toBeUndefined(); + }); +}); diff --git a/packages/server/src/utils/logger.ts b/packages/server/src/utils/logger.ts index 9f8f1ba0d..a0d070c04 100644 --- a/packages/server/src/utils/logger.ts +++ b/packages/server/src/utils/logger.ts @@ -25,6 +25,14 @@ const getLogLevel = (): pino.Level => { /** * Create a Pino logger instance */ +// pino's default error serializer only fires for the `err` key, so +// `logger.error({ error }, '...')` silently logs `error: {}` (Error's own +// fields are non-enumerable). Register the same serializer on the `error` +// key too so either spelling produces a real stack/message. Found during +// the 2026-05-16 prod outage where every queue failure logged `error: {}` +// and hid `column "events.search_tsv" does not exist`. +const errSerializer = pino.stdSerializers.err; + const logger = pino({ level: getLogLevel(), browser: { @@ -35,6 +43,10 @@ const logger = pino({ return { level: label }; }, }, + serializers: { + err: errSerializer, + error: errSerializer, + }, }); export default logger; diff --git a/packages/server/src/utils/schema-version-check.ts b/packages/server/src/utils/schema-version-check.ts new file mode 100644 index 000000000..a810e7968 --- /dev/null +++ b/packages/server/src/utils/schema-version-check.ts @@ -0,0 +1,137 @@ +/** + * Boot-time schema-version assertion. + * + * Compares the highest migration version present in `db/migrations/` (what + * this image expects to run against) with the highest version recorded in + * the database's `schema_migrations` table. If the database is behind, + * throws — the server boot path catches it, logs, and exits non-zero so the + * pod fails readiness and Kubernetes refuses to route traffic. + * + * Why: on 2026-05-16 the pre-upgrade migration Job for + * `20260516200000_events_search_tsv.sql` timed out at 60s (table rewrite on + * a 1.15M-row events table under ACCESS EXCLUSIVE > statement_timeout). The + * Job exited non-zero, but the app Deployment rolled forward anyway with an + * image that queried the new view's `search_tsv` column. Every request + * through the affected paths threw; the pod OOM'd and CrashLoopBackOff'd. + * A boot-time gate would have refused to start that image and kept the + * previous version serving traffic. + */ + +import { readdirSync } from 'node:fs'; +import type { DbClient } from '../db/client'; +import logger from './logger'; + +const MIGRATION_FILENAME_RE = /^(\d+)_[^/]+\.sql$/; + +/** + * Find the largest version prefix (e.g. `20260516200000`) across files in + * `migrationsDir`. Returns null if the directory is empty / unreadable — + * that case is treated as "no expectation" rather than failing closed, so a + * dev environment without migrations checked out doesn't deadlock boot. + */ +export function readExpectedSchemaVersion(migrationsDir: string): string | null { + let entries: string[]; + try { + entries = readdirSync(migrationsDir); + } catch (err) { + logger.warn( + { err, migrationsDir }, + '[schema-check] migrations directory not readable — skipping schema-version assertion' + ); + return null; + } + + let max: string | null = null; + for (const name of entries) { + const match = MIGRATION_FILENAME_RE.exec(name); + if (!match) continue; + const version = match[1]; + if (max === null || version > max) max = version; + } + return max; +} + +/** + * Query the highest applied version from the database. Returns null if the + * `schema_migrations` table is empty (fresh install) — the caller decides + * whether that's expected. + */ +export async function readAppliedSchemaVersion(sql: DbClient): Promise { + const rows = (await sql`SELECT MAX(version) AS version FROM public.schema_migrations`) as Array<{ + version: string | null; + }>; + return rows[0]?.version ?? null; +} + +export interface SchemaVersionMismatch { + kind: 'mismatch'; + expected: string; + applied: string | null; +} + +export interface SchemaVersionOk { + kind: 'ok'; + expected: string | null; + applied: string | null; +} + +/** + * Compare expected (from disk) vs applied (from DB). Returns a discriminated + * union the caller can branch on, instead of throwing — keeps tests cheap. + */ +export function compareSchemaVersions( + expected: string | null, + applied: string | null +): SchemaVersionOk | SchemaVersionMismatch { + if (expected === null) return { kind: 'ok', expected, applied }; + if (applied !== null && applied >= expected) { + return { kind: 'ok', expected, applied }; + } + return { kind: 'mismatch', expected, applied }; +} + +/** + * Boot-time assertion: throws if the database is behind the image's + * migrations directory. Call once during server startup, before opening the + * HTTP listener. + * + * Fails closed in production: if `migrationsDir` can't be listed (bad path, + * missing copy in the image, wrong volume mount), `NODE_ENV=production` + * treats that as a deployment defect and throws. In dev a missing/empty + * directory degrades to a warning — so `bun run dev` from a worktree that + * doesn't have `db/` checked out still boots. + */ +export async function assertSchemaUpToDate( + sql: DbClient, + options: { migrationsDir: string } +): Promise { + const expected = readExpectedSchemaVersion(options.migrationsDir); + + if (expected === null && process.env.NODE_ENV === 'production') { + const msg = + `[schema-check] migrations directory ${options.migrationsDir} is empty or unreadable in a ` + + `production build — the image is missing db/migrations. Refusing to start.`; + logger.error({ migrationsDir: options.migrationsDir }, msg); + throw new Error(msg); + } + + const applied = await readAppliedSchemaVersion(sql); + const result = compareSchemaVersions(expected, applied); + + if (result.kind === 'mismatch') { + const msg = + `[schema-check] database is behind the image. Expected migration ${result.expected} ` + + `to be applied, but the highest applied version is ${result.applied ?? '(none)'}. ` + + `Run \`dbmate up\` against this database before rolling out this image.`; + logger.error( + { expected: result.expected, applied: result.applied, migrationsDir: options.migrationsDir }, + msg + ); + throw new Error(msg); + } + + logger.info( + { expected: result.expected, applied: result.applied }, + '[schema-check] schema version up to date' + ); +} diff --git a/packages/web b/packages/web index 4516312cf..c390103ed 160000 --- a/packages/web +++ b/packages/web @@ -1 +1 @@ -Subproject commit 4516312cf933313e693a2093da4ffb024740cbc5 +Subproject commit c390103ed009f00f91fb5547a811235c914dd3d8