lobu-ai · buremba · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/charts/lobu/templates/deployment.yaml b/charts/lobu/templates/deployment.yaml
@@ -12,6 +12,12 @@ spec:
     {{- toYaml . | nindent 4 }}
   {{- else }}
   {{- if .Values.app.workspaces.enabled }}
+  # The workspaces PVC is RWO so we can't have two pods mount it at once;
+  # default to Recreate to avoid scheduling conflicts. Single-replica
+  # operators see a brief (~30s) "no available server" window on every
+  # deploy as a result. To eliminate it, set RWX storage on workspaces
+  # (or move workspaces off this deployment entirely) and override
+  # `app.strategy` to RollingUpdate via values. See PR #773 discussion.
   strategy:
     type: Recreate
   {{- end }}
@@ -36,6 +42,12 @@ spec:
       securityContext:
         {{- toYaml . | nindent 8 }}
       {{- end }}
+      # Grace period must exceed preStopDelaySeconds + reasonable shutdown
+      # time for the gateway (graceful_shutdown in server.ts cleans up
+      # task scheduler, embedded gateway, DB pool, HTTP server). Default
+      # k8s grace period is 30s; bump so the preStop sleep doesn't eat
+      # the whole window.
+      terminationGracePeriodSeconds: {{ .Values.app.terminationGracePeriodSeconds | default 45 }}
       containers:
         - name: app
           image: {{ include "lobu.appImage" . }}
@@ -94,6 +106,24 @@ spec:
             - secretRef:
                 name: {{ $secretName }}
           {{- end }}
+          {{- /*
+          PreStop hook is only useful under RollingUpdate (the new pod is
+          already serving, so deregistering the old pod via Service
+          endpoint removal + giving downstream caches time to notice it
+          shrinks the "old pod kept getting traffic during drain" window).
+          Under `Recreate` (the workspaces-RWO default), the new pod
+          doesn't start until the old one fully terminates — adding a
+          preStop sleep would EXTEND the no-available-server window by
+          its duration. We only emit the hook when preStopDelaySeconds
+          is explicitly > 0; ops repos using RollingUpdate set it,
+          Recreate-mode deploys leave it at the default 0.
+          */ -}}
+          {{- if gt (int (.Values.app.preStopDelaySeconds | default 0)) 0 }}
+          lifecycle:
+            preStop:
+              exec:
+                command: ["sh", "-c", "sleep {{ .Values.app.preStopDelaySeconds }}"]
+          {{- end }}
           # Readiness probes the DB too (/health/ready does SELECT 1).
           # Failing readiness pulls the pod out of the Service endpoint set
           # without restarting it — right semantic for a transient DB blip.

diff --git a/charts/lobu/values.yaml b/charts/lobu/values.yaml
@@ -35,6 +35,23 @@ app:
   podAnnotations: {}
   podSecurityContext: {}
   securityContext: {}
+  # Seconds the preStop hook sleeps before SIGTERM is sent. Gives Service
+  # endpoint deregistration + downstream LB cache time to notice the pod
+  # is going away, so in-flight requests during the deregistration lag
+  # still hit a live process.
+  #
+  # Default 0 (preStop hook NOT emitted). Set to ~15 only when running
+  # with `app.strategy.type: RollingUpdate` and replicaCount > 1.
+  # Under the default Recreate strategy (workspaces PVC is RWO), the
+  # new pod doesn't start until the old one terminates, so adding a
+  # preStop sleep would EXTEND the no-available-server window — the
+  # opposite of what we want.
+  preStopDelaySeconds: 0
+  # Total time k8s waits for the pod to stop before SIGKILL. Must be
+  # > preStopDelaySeconds + actual shutdown time. The gateway's graceful
+  # shutdown closes the HTTP server, stops the task scheduler, drains the
+  # DB pool — empirically ~5s.
+  terminationGracePeriodSeconds: 45
 
   resources:
     requests:

diff --git a/db/migrations/20260517020000_softdelete_orphan_feeds.sql b/db/migrations/20260517020000_softdelete_orphan_feeds.sql
@@ -0,0 +1,56 @@
+-- migrate:up
+
+-- Soft-delete feeds whose (connector_key, organization_id) has no active
+-- connector_definition row.
+--
+-- Why: the 2026-05-16 audit found feeds 117-155 (and others) referencing
+-- connector_key='website' in orgs that have no active definition for it
+-- (only one org has `website` active; one definition is archived). Every
+-- CheckDueFeeds tick (every minute) tried to materialize a sync run for
+-- these feeds and threw "No active connector definition found for X." —
+-- producing ~380 error logs / minute that masked real signal in stdout.
+--
+-- The app-side code path now warns + skips (no throw) for the same case
+-- so future orphans don't spam logs either. This migration is the one-time
+-- cleanup of the existing data.
+--
+-- Conservative criteria — match exactly the set CheckDueFeeds processes
+-- (so we only soft-delete feeds that actually produce the error stream).
+--   - feed has no pinned_version (= would have looked up connector_definitions)
+--   - feed.deleted_at IS NULL (still considered active)
+--   - feed.status = 'active' (CheckDueFeeds filters on this — see
+--     packages/server/src/scheduled/check-due-feeds.ts:36-43)
+--   - connection.deleted_at IS NULL AND connection.status = 'active' (same)
+--   - NO active connector_definition exists for that (key, organization) pair
+--
+-- Feeds in paused / pending_auth / error / revoked states are left alone
+-- — operators may be mid-recovery on them and they don't contribute to
+-- the error spam (CheckDueFeeds skips them anyway).
+--
+-- The same feed remains recoverable: clearing `deleted_at` + reinstalling
+-- the connector definition for the org restores it.
+
+UPDATE public.feeds f
+SET deleted_at = now()
+FROM public.connections c
+WHERE f.connection_id = c.id
+  AND f.deleted_at IS NULL
+  AND f.pinned_version IS NULL
+  AND f.status = 'active'
+  AND c.deleted_at IS NULL
+  AND c.status = 'active'
+  AND NOT EXISTS (
+    SELECT 1
+    FROM public.connector_definitions cd
+    WHERE cd.key = c.connector_key
+      AND cd.organization_id = f.organization_id
+      AND cd.status = 'active'
+  );
+
+-- migrate:down
+
+-- No-op: re-attaching the orphan feeds would require knowing which were
+-- soft-deleted by this migration vs. by an operator action. The original
+-- error condition is fixed in code; this migration is a one-shot data
+-- cleanup. To recover specific feeds in prod, clear `deleted_at` on the
+-- targeted rows manually and re-install the connector definition.
diff --git a/db/schema.sql b/db/schema.sql
@@ -4986,4 +4986,5 @@ INSERT INTO public.schema_migrations (version) VALUES
     ('20260516120000'),
     ('20260516200000'),
     ('20260516200100'),
-    ('20260517010000');
+    ('20260517010000'),
+    ('20260517020000');
diff --git a/packages/server/src/server.ts b/packages/server/src/server.ts
@@ -131,7 +131,11 @@ app.onError((err, c) => {
     });
     markSentryReported(c);
   }
-  logger.error({ err, path: c.req.path }, 'Unhandled error in HTTP handler');
+  // sentryReported:true tells the pino → Sentry forwarder in logger.ts
+  // to skip — Sentry already has this exception via the explicit
+  // captureException above. Without the marker, we'd send the same
+  // event twice.
+  logger.error({ err, path: c.req.path, sentryReported: true }, 'Unhandled error in HTTP handler');
   return c.json({ error: 'Internal server error' }, 500);
 });
 

diff --git a/packages/server/src/utils/logger.ts b/packages/server/src/utils/logger.ts
@@ -1,3 +1,4 @@
+import * as Sentry from '@sentry/node';
 import pino from 'pino';
 
 /**
@@ -22,9 +23,6 @@ const getLogLevel = (): pino.Level => {
   return 'debug';
 };
 
-/**
- * Create a Pino logger instance
- */
 // pino's default error serializer only fires for the `err` key, so
 // `logger.error({ error }, '...')` silently logs `error: {}` (Error's own
 // fields are non-enumerable). Register the same serializer on the `error`
@@ -33,20 +31,121 @@ const getLogLevel = (): pino.Level => {
 // and hid `column "events.search_tsv" does not exist`.
 const errSerializer = pino.stdSerializers.err;
 
-const logger = pino({
-  level: getLogLevel(),
-  browser: {
-    asObject: false,
+/**
+ * Sentry forwarding for logger.error() and logger.fatal().
+ *
+ * Prior to this hook, `logger.error(...)` only wrote to stdout. The Sentry
+ * capture middleware in server.ts:85-113 only fires on HTTP 500 responses,
+ * so error-logged failures inside background jobs (CheckDueFeeds, runs
+ * queue, scheduled tasks) were invisible to monitoring. The 2026-05-16
+ * audit found ~1914 errors / 5 min in stdout with zero Sentry issues.
+ *
+ * In-process dedupe: repeating errors are common (e.g. an orphan feed
+ * fails every 1-min CheckDueFeeds tick). We fingerprint by
+ * (msg, err.type, top stack frame) and only forward once per
+ * SENTRY_DEDUPE_WINDOW_MS per fingerprint. Sentry has its own grouping
+ * but every captureException still incurs an HTTP call + cost; this
+ * cuts the load without losing signal.
+ */
+const SENTRY_DEDUPE_WINDOW_MS = 60_000;
+const SENTRY_DEDUPE_MAX_ENTRIES = 1000;
+const sentryDedupe: Map<string, number> = new Map();
+
+function fingerprintAndCapture(parsed: Record<string, unknown>): void {
+  const level = parsed.level;
+  if (level !== 'error' && level !== 'fatal') return;
+
+  // Caller already captured this to Sentry (see server.ts onError +
+  // 500-response middleware). Skip to avoid duplicate events.
+  if (parsed.sentryReported === true) return;
+
+  const msg = typeof parsed.msg === 'string' ? parsed.msg : 'logger.error';
+  // pino.stdSerializers.err normalises both `err` and `error` (see serializers
+  // config below) to objects with `type` / `message` / `stack`.
+  const errObj =
+    (parsed.err as { type?: string; message?: string; stack?: string } | undefined) ??
+    (parsed.error as { type?: string; message?: string; stack?: string } | undefined);
+
+  // Include err.message in the fingerprint — pre-fix, "(msg, err.type,
+  // top stack frame)" grouped distinct errors raised from the same
+  // catch site (same Error type, same wrapping log line). One legit
+  // incident could be masked by a noisy unrelated one within the 60s
+  // window. err.message disambiguates them.
+  const errType = errObj?.type ?? '';
+  const errMessage = errObj?.message ?? '';
+  const stackTop = (errObj?.stack ?? '').split('\n')[1]?.trim() ?? '';
+  const fingerprint = `${msg}|${errType}|${errMessage}|${stackTop}`;
+
+  const now = Date.now();
+  const last = sentryDedupe.get(fingerprint);
+  if (last !== undefined && now - last < SENTRY_DEDUPE_WINDOW_MS) return;
+  sentryDedupe.set(fingerprint, now);
+
+  // Bound the dedupe map so a long-running pod doesn't grow it without limit.
+  if (sentryDedupe.size > SENTRY_DEDUPE_MAX_ENTRIES) {
+    const oldest = sentryDedupe.keys().next().value;
+    if (oldest !== undefined) sentryDedupe.delete(oldest);
+  }
+
+  try {
+    if (errObj?.message) {
+      // Reconstruct an Error so Sentry's grouping works on the stack.
+      const reconstructed = new Error(errObj.message);
+      if (errObj.stack) reconstructed.stack = errObj.stack;
+      Sentry.captureException(reconstructed, {
+        extra: parsed,
+        tags: { source: 'pino', level: String(level) },
+      });
+    } else {
+      Sentry.captureMessage(msg, {
+        level: level === 'fatal' ? 'fatal' : 'error',
+        extra: parsed,
+        tags: { source: 'pino' },
+      });
+    }
+  } catch {
+    // Sentry not initialised (test envs) or transient SDK failure — never
+    // crash the logger over telemetry.
+  }
+}
+
+/**
+ * pino destination that mirrors lines to stdout AND inspects each line for
+ * Sentry forwarding. Sync write is intentional: pino's default stdout path
+ * is sync too, and the JSON.parse + dedupe lookup is sub-microsecond.
+ */
+const sentryAwareStream: pino.DestinationStream = {
+  write(line: string): void {
+    process.stdout.write(line);
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      return;
+    }
+    if (parsed && typeof parsed === 'object') {
+      fingerprintAndCapture(parsed as Record<string, unknown>);
+    }
   },
-  formatters: {
-    level: (label) => {
-      return { level: label };
+};
+
+const logger = pino(
+  {
+    level: getLogLevel(),
+    browser: {
+      asObject: false,
+    },
+    formatters: {
+      level: (label) => {
+        return { level: label };
+      },
+    },
+    serializers: {
+      err: errSerializer,
+      error: errSerializer,
     },
   },
-  serializers: {
-    err: errSerializer,
-    error: errSerializer,
-  },
-});
+  sentryAwareStream
+);
 
 export default logger;
diff --git a/packages/server/src/utils/queue-helpers.ts b/packages/server/src/utils/queue-helpers.ts
@@ -93,7 +93,31 @@ async function createSyncRunWithClient(sql: DbClient, feedId: number): Promise<n
       LIMIT 1
     `;
     if (defRows.length === 0) {
-      throw new Error(`No active connector definition found for '${feed.connector_key}'.`);
+      // An orphan feed: the connector was archived/uninstalled in this org
+      // but the feed wasn't soft-deleted. Pre-fix this threw an Error on
+      // every CheckDueFeeds tick (1 / min), producing ~380 error logs / min
+      // for 33 feeds. Soft-delete the feed in-place so it stops appearing
+      // in CheckDueFeeds — otherwise warning + returning null would just
+      // produce a warn-level repeat at the same cadence forever, and a
+      // large enough orphan set could fill the CheckDueFeeds LIMIT 100
+      // and starve legitimate feeds.
+      //
+      // Operators can recover by clearing `deleted_at` after reinstalling
+      // the connector definition.
+      await sql`
+        UPDATE feeds
+        SET deleted_at = current_timestamp
+        WHERE id = ${feedId}
+      `;
+      logger.warn(
+        {
+          feedId,
+          connector_key: feed.connector_key,
+          organization_id: feed.organization_id,
+        },
+        '[queue] Soft-deleted orphan feed — no active connector_definition for (connector_key, org).'
+      );
+      return null;
     }
     connectorVersion = (defRows[0] as { version: string }).version;
   }

diff --git a/packages/web b/packages/web