diff --git a/charts/lobu/templates/deployment.yaml b/charts/lobu/templates/deployment.yaml index d18437dea..22ee75e1a 100644 --- a/charts/lobu/templates/deployment.yaml +++ b/charts/lobu/templates/deployment.yaml @@ -12,6 +12,12 @@ spec: {{- toYaml . | nindent 4 }} {{- else }} {{- if .Values.app.workspaces.enabled }} + # The workspaces PVC is RWO so we can't have two pods mount it at once; + # default to Recreate to avoid scheduling conflicts. Single-replica + # operators see a brief (~30s) "no available server" window on every + # deploy as a result. To eliminate it, set RWX storage on workspaces + # (or move workspaces off this deployment entirely) and override + # `app.strategy` to RollingUpdate via values. See PR #773 discussion. strategy: type: Recreate {{- end }} @@ -36,6 +42,12 @@ spec: securityContext: {{- toYaml . | nindent 8 }} {{- end }} + # Grace period must exceed preStopDelaySeconds + reasonable shutdown + # time for the gateway (graceful_shutdown in server.ts cleans up + # task scheduler, embedded gateway, DB pool, HTTP server). Default + # k8s grace period is 30s; bump so the preStop sleep doesn't eat + # the whole window. + terminationGracePeriodSeconds: {{ .Values.app.terminationGracePeriodSeconds | default 45 }} containers: - name: app image: {{ include "lobu.appImage" . }} @@ -94,6 +106,24 @@ spec: - secretRef: name: {{ $secretName }} {{- end }} + {{- /* + PreStop hook is only useful under RollingUpdate (the new pod is + already serving, so deregistering the old pod via Service + endpoint removal + giving downstream caches time to notice it + shrinks the "old pod kept getting traffic during drain" window). + Under `Recreate` (the workspaces-RWO default), the new pod + doesn't start until the old one fully terminates — adding a + preStop sleep would EXTEND the no-available-server window by + its duration. We only emit the hook when preStopDelaySeconds + is explicitly > 0; ops repos using RollingUpdate set it, + Recreate-mode deploys leave it at the default 0. + */ -}} + {{- if gt (int (.Values.app.preStopDelaySeconds | default 0)) 0 }} + lifecycle: + preStop: + exec: + command: ["sh", "-c", "sleep {{ .Values.app.preStopDelaySeconds }}"] + {{- end }} # Readiness probes the DB too (/health/ready does SELECT 1). # Failing readiness pulls the pod out of the Service endpoint set # without restarting it — right semantic for a transient DB blip. diff --git a/charts/lobu/values.yaml b/charts/lobu/values.yaml index bff9093be..3009cd977 100644 --- a/charts/lobu/values.yaml +++ b/charts/lobu/values.yaml @@ -35,6 +35,23 @@ app: podAnnotations: {} podSecurityContext: {} securityContext: {} + # Seconds the preStop hook sleeps before SIGTERM is sent. Gives Service + # endpoint deregistration + downstream LB cache time to notice the pod + # is going away, so in-flight requests during the deregistration lag + # still hit a live process. + # + # Default 0 (preStop hook NOT emitted). Set to ~15 only when running + # with `app.strategy.type: RollingUpdate` and replicaCount > 1. + # Under the default Recreate strategy (workspaces PVC is RWO), the + # new pod doesn't start until the old one terminates, so adding a + # preStop sleep would EXTEND the no-available-server window — the + # opposite of what we want. + preStopDelaySeconds: 0 + # Total time k8s waits for the pod to stop before SIGKILL. Must be + # > preStopDelaySeconds + actual shutdown time. The gateway's graceful + # shutdown closes the HTTP server, stops the task scheduler, drains the + # DB pool — empirically ~5s. + terminationGracePeriodSeconds: 45 resources: requests: diff --git a/db/migrations/20260517020000_softdelete_orphan_feeds.sql b/db/migrations/20260517020000_softdelete_orphan_feeds.sql new file mode 100644 index 000000000..c18f67061 --- /dev/null +++ b/db/migrations/20260517020000_softdelete_orphan_feeds.sql @@ -0,0 +1,56 @@ +-- migrate:up + +-- Soft-delete feeds whose (connector_key, organization_id) has no active +-- connector_definition row. +-- +-- Why: the 2026-05-16 audit found feeds 117-155 (and others) referencing +-- connector_key='website' in orgs that have no active definition for it +-- (only one org has `website` active; one definition is archived). Every +-- CheckDueFeeds tick (every minute) tried to materialize a sync run for +-- these feeds and threw "No active connector definition found for X." — +-- producing ~380 error logs / minute that masked real signal in stdout. +-- +-- The app-side code path now warns + skips (no throw) for the same case +-- so future orphans don't spam logs either. This migration is the one-time +-- cleanup of the existing data. +-- +-- Conservative criteria — match exactly the set CheckDueFeeds processes +-- (so we only soft-delete feeds that actually produce the error stream). +-- - feed has no pinned_version (= would have looked up connector_definitions) +-- - feed.deleted_at IS NULL (still considered active) +-- - feed.status = 'active' (CheckDueFeeds filters on this — see +-- packages/server/src/scheduled/check-due-feeds.ts:36-43) +-- - connection.deleted_at IS NULL AND connection.status = 'active' (same) +-- - NO active connector_definition exists for that (key, organization) pair +-- +-- Feeds in paused / pending_auth / error / revoked states are left alone +-- — operators may be mid-recovery on them and they don't contribute to +-- the error spam (CheckDueFeeds skips them anyway). +-- +-- The same feed remains recoverable: clearing `deleted_at` + reinstalling +-- the connector definition for the org restores it. + +UPDATE public.feeds f +SET deleted_at = now() +FROM public.connections c +WHERE f.connection_id = c.id + AND f.deleted_at IS NULL + AND f.pinned_version IS NULL + AND f.status = 'active' + AND c.deleted_at IS NULL + AND c.status = 'active' + AND NOT EXISTS ( + SELECT 1 + FROM public.connector_definitions cd + WHERE cd.key = c.connector_key + AND cd.organization_id = f.organization_id + AND cd.status = 'active' + ); + +-- migrate:down + +-- No-op: re-attaching the orphan feeds would require knowing which were +-- soft-deleted by this migration vs. by an operator action. The original +-- error condition is fixed in code; this migration is a one-shot data +-- cleanup. To recover specific feeds in prod, clear `deleted_at` on the +-- targeted rows manually and re-install the connector definition. diff --git a/db/schema.sql b/db/schema.sql index dae56a751..3aee817ce 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -4986,4 +4986,5 @@ INSERT INTO public.schema_migrations (version) VALUES ('20260516120000'), ('20260516200000'), ('20260516200100'), - ('20260517010000'); + ('20260517010000'), + ('20260517020000'); diff --git a/packages/server/src/server.ts b/packages/server/src/server.ts index 4f19e814f..6e28e460f 100644 --- a/packages/server/src/server.ts +++ b/packages/server/src/server.ts @@ -131,7 +131,11 @@ app.onError((err, c) => { }); markSentryReported(c); } - logger.error({ err, path: c.req.path }, 'Unhandled error in HTTP handler'); + // sentryReported:true tells the pino → Sentry forwarder in logger.ts + // to skip — Sentry already has this exception via the explicit + // captureException above. Without the marker, we'd send the same + // event twice. + logger.error({ err, path: c.req.path, sentryReported: true }, 'Unhandled error in HTTP handler'); return c.json({ error: 'Internal server error' }, 500); }); diff --git a/packages/server/src/utils/logger.ts b/packages/server/src/utils/logger.ts index a0d070c04..df7871167 100644 --- a/packages/server/src/utils/logger.ts +++ b/packages/server/src/utils/logger.ts @@ -1,3 +1,4 @@ +import * as Sentry from '@sentry/node'; import pino from 'pino'; /** @@ -22,9 +23,6 @@ const getLogLevel = (): pino.Level => { return 'debug'; }; -/** - * Create a Pino logger instance - */ // pino's default error serializer only fires for the `err` key, so // `logger.error({ error }, '...')` silently logs `error: {}` (Error's own // fields are non-enumerable). Register the same serializer on the `error` @@ -33,20 +31,121 @@ const getLogLevel = (): pino.Level => { // and hid `column "events.search_tsv" does not exist`. const errSerializer = pino.stdSerializers.err; -const logger = pino({ - level: getLogLevel(), - browser: { - asObject: false, +/** + * Sentry forwarding for logger.error() and logger.fatal(). + * + * Prior to this hook, `logger.error(...)` only wrote to stdout. The Sentry + * capture middleware in server.ts:85-113 only fires on HTTP 500 responses, + * so error-logged failures inside background jobs (CheckDueFeeds, runs + * queue, scheduled tasks) were invisible to monitoring. The 2026-05-16 + * audit found ~1914 errors / 5 min in stdout with zero Sentry issues. + * + * In-process dedupe: repeating errors are common (e.g. an orphan feed + * fails every 1-min CheckDueFeeds tick). We fingerprint by + * (msg, err.type, top stack frame) and only forward once per + * SENTRY_DEDUPE_WINDOW_MS per fingerprint. Sentry has its own grouping + * but every captureException still incurs an HTTP call + cost; this + * cuts the load without losing signal. + */ +const SENTRY_DEDUPE_WINDOW_MS = 60_000; +const SENTRY_DEDUPE_MAX_ENTRIES = 1000; +const sentryDedupe: Map = new Map(); + +function fingerprintAndCapture(parsed: Record): void { + const level = parsed.level; + if (level !== 'error' && level !== 'fatal') return; + + // Caller already captured this to Sentry (see server.ts onError + + // 500-response middleware). Skip to avoid duplicate events. + if (parsed.sentryReported === true) return; + + const msg = typeof parsed.msg === 'string' ? parsed.msg : 'logger.error'; + // pino.stdSerializers.err normalises both `err` and `error` (see serializers + // config below) to objects with `type` / `message` / `stack`. + const errObj = + (parsed.err as { type?: string; message?: string; stack?: string } | undefined) ?? + (parsed.error as { type?: string; message?: string; stack?: string } | undefined); + + // Include err.message in the fingerprint — pre-fix, "(msg, err.type, + // top stack frame)" grouped distinct errors raised from the same + // catch site (same Error type, same wrapping log line). One legit + // incident could be masked by a noisy unrelated one within the 60s + // window. err.message disambiguates them. + const errType = errObj?.type ?? ''; + const errMessage = errObj?.message ?? ''; + const stackTop = (errObj?.stack ?? '').split('\n')[1]?.trim() ?? ''; + const fingerprint = `${msg}|${errType}|${errMessage}|${stackTop}`; + + const now = Date.now(); + const last = sentryDedupe.get(fingerprint); + if (last !== undefined && now - last < SENTRY_DEDUPE_WINDOW_MS) return; + sentryDedupe.set(fingerprint, now); + + // Bound the dedupe map so a long-running pod doesn't grow it without limit. + if (sentryDedupe.size > SENTRY_DEDUPE_MAX_ENTRIES) { + const oldest = sentryDedupe.keys().next().value; + if (oldest !== undefined) sentryDedupe.delete(oldest); + } + + try { + if (errObj?.message) { + // Reconstruct an Error so Sentry's grouping works on the stack. + const reconstructed = new Error(errObj.message); + if (errObj.stack) reconstructed.stack = errObj.stack; + Sentry.captureException(reconstructed, { + extra: parsed, + tags: { source: 'pino', level: String(level) }, + }); + } else { + Sentry.captureMessage(msg, { + level: level === 'fatal' ? 'fatal' : 'error', + extra: parsed, + tags: { source: 'pino' }, + }); + } + } catch { + // Sentry not initialised (test envs) or transient SDK failure — never + // crash the logger over telemetry. + } +} + +/** + * pino destination that mirrors lines to stdout AND inspects each line for + * Sentry forwarding. Sync write is intentional: pino's default stdout path + * is sync too, and the JSON.parse + dedupe lookup is sub-microsecond. + */ +const sentryAwareStream: pino.DestinationStream = { + write(line: string): void { + process.stdout.write(line); + let parsed: unknown; + try { + parsed = JSON.parse(line); + } catch { + return; + } + if (parsed && typeof parsed === 'object') { + fingerprintAndCapture(parsed as Record); + } }, - formatters: { - level: (label) => { - return { level: label }; +}; + +const logger = pino( + { + level: getLogLevel(), + browser: { + asObject: false, + }, + formatters: { + level: (label) => { + return { level: label }; + }, + }, + serializers: { + err: errSerializer, + error: errSerializer, }, }, - serializers: { - err: errSerializer, - error: errSerializer, - }, -}); + sentryAwareStream +); export default logger; diff --git a/packages/server/src/utils/queue-helpers.ts b/packages/server/src/utils/queue-helpers.ts index 07ecf09c2..6fd930df0 100644 --- a/packages/server/src/utils/queue-helpers.ts +++ b/packages/server/src/utils/queue-helpers.ts @@ -93,7 +93,31 @@ async function createSyncRunWithClient(sql: DbClient, feedId: number): Promise