lobu-ai · buremba · May 16, 2026 · May 16, 2026 · May 16, 2026 · chatgpt-codex-connector
diff --git a/charts/lobu/templates/deployment.yaml b/charts/lobu/templates/deployment.yaml
@@ -7,21 +7,49 @@ metadata:
     app.kubernetes.io/component: api
 spec:
   replicas: {{ .Values.app.replicaCount }}
-  {{- with .Values.app.strategy }}
+  {{- /*
+  Deploy strategy resolution:
+    1. Explicit `app.strategy` override always wins.
+    2. Else if `app.allowMultiReplica: true` AND workspaces is RWX
+       (or disabled) → RollingUpdate (maxSurge: 1, maxUnavailable: 0).
+       This is the operator opt-in path for zero-downtime rolling deploys.
+    3. Else → Recreate (the safe default).
+
+  Why `allowMultiReplica` is an explicit flag, not auto-detected from
+  RWX: even with RWX storage, several in-memory components break with
+  >1 gateway replicas OR during the brief RollingUpdate overlap:
+    * `SseManager` (gateway/services/sse-manager.ts) — SSE streams are
+      pod-local; a job claimed by pod B broadcasts to no-one if the
+      client is on pod A.
+    * AskUser question routing
+      (gateway/connections/interaction-bridge.ts:193-214) — pending
+      questions live in a per-pod Map, button clicks can land on the
+      wrong pod and be dropped.
+    * Telegram polling mode (gateway/connections/chat-instance-manager
+      .ts:610-613) — every replica long-polls the same bot, causing
+      conflicts.
+  RWX is necessary but not sufficient. The flag forces operators to
+  acknowledge "I have only webhook-mode Chat connections AND no
+  AskUser/SSE flows in flight" before opting in.
+  */}}
+  {{- $rwxConfigured := has "ReadWriteMany" (.Values.app.workspaces.accessModes | default (list)) }}
+  {{- $rollSafe := and .Values.app.allowMultiReplica (or (not .Values.app.workspaces.enabled) $rwxConfigured) }}
+  {{- if .Values.app.strategy }}
   strategy:
-    {{- toYaml . | nindent 4 }}
+    {{- toYaml .Values.app.strategy | nindent 4 }}
+  {{- else if $rollSafe }}
+  # Operator opted in via app.allowMultiReplica; workspaces is RWX or disabled.
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
   {{- else }}
-  {{- if .Values.app.workspaces.enabled }}
-  # The workspaces PVC is RWO so we can't have two pods mount it at once;
-  # default to Recreate to avoid scheduling conflicts. Single-replica
-  # operators see a brief (~30s) "no available server" window on every
-  # deploy as a result. To eliminate it, set RWX storage on workspaces
-  # (or move workspaces off this deployment entirely) and override
-  # `app.strategy` to RollingUpdate via values. See PR #773 discussion.
+  # Safe default. A possibly-RWO workspaces PVC and in-memory SSE/AskUser/
+  # Telegram-polling state make rolling overlap unsafe — see comment above.
   strategy:
     type: Recreate
   {{- end }}
-  {{- end }}
   selector:
     matchLabels:
       {{- include "lobu.appSelectorLabels" . | nindent 6 }}

diff --git a/charts/lobu/values.yaml b/charts/lobu/values.yaml
@@ -64,8 +64,38 @@ app:
   env:
     NODE_ENV: production
 
+  # Opt in to multi-replica / rolling deploys. DEFAULT FALSE — leaving this
+  # off keeps the safe `strategy: Recreate` behavior. Setting it true makes
+  # the chart pick `RollingUpdate` (maxSurge: 1, maxUnavailable: 0) when
+  # workspaces is RWX or disabled. An explicit `app.strategy` always wins.
+  #
+  # Prerequisites BEFORE setting true — chart cannot detect these:
+  #   1. Workspaces volume must be RWX-capable:
+  #      `app.workspaces.accessModes: [ReadWriteMany]` + a storage class
+  #      that backs it (NFS, EFS, CephFS, Longhorn-RWX, …).
+  #   2. NO active Chat connections in `mode: "polling"` (Telegram).
+  #      Multiple replicas long-polling the same bot conflict. Use
+  #      webhook mode only — see
+  #      gateway/connections/chat-instance-manager.ts:610.
+  #   3. Acknowledge that the gateway has in-memory state for SSE
+  #      streams (gateway/services/sse-manager.ts) and AskUser
+  #      questions (gateway/connections/interaction-bridge.ts:193).
+  #      A request whose SSE stream / AskUser click lands on a
+  #      different replica than the one holding the state will be
+  #      silently dropped. Migrating these to Postgres LISTEN/NOTIFY +
+  #      durable storage is tracked as separate hardening work — until
+  #      then, occasional dropped streams / button clicks are the cost
+  #      of zero-downtime deploys on this configuration.
+  #
+  # Single-replica + RollingUpdate (replicaCount: 1, allowMultiReplica: true)
+  # still creates a brief overlap window where both pods are running.
+  # The same in-memory caveats apply during that window, just for a
+  # shorter span (~5-15s).
+  allowMultiReplica: false
+
   # Persistent workspaces volume. Embedded agent workers store session and
-  # workspace state below /app/workspaces.
+  # workspace state below /app/workspaces (watcher run scratch, agent panel
+  # sessions, etc.).
   workspaces:
     enabled: true
     size: 20Gi

diff --git a/packages/web b/packages/web