diff --git a/charts/lobu/templates/deployment.yaml b/charts/lobu/templates/deployment.yaml index 22ee75e1a..876af4163 100644 --- a/charts/lobu/templates/deployment.yaml +++ b/charts/lobu/templates/deployment.yaml @@ -7,21 +7,49 @@ metadata: app.kubernetes.io/component: api spec: replicas: {{ .Values.app.replicaCount }} - {{- with .Values.app.strategy }} + {{- /* + Deploy strategy resolution: + 1. Explicit `app.strategy` override always wins. + 2. Else if `app.allowMultiReplica: true` AND workspaces is RWX + (or disabled) → RollingUpdate (maxSurge: 1, maxUnavailable: 0). + This is the operator opt-in path for zero-downtime rolling deploys. + 3. Else → Recreate (the safe default). + + Why `allowMultiReplica` is an explicit flag, not auto-detected from + RWX: even with RWX storage, several in-memory components break with + >1 gateway replicas OR during the brief RollingUpdate overlap: + * `SseManager` (gateway/services/sse-manager.ts) — SSE streams are + pod-local; a job claimed by pod B broadcasts to no-one if the + client is on pod A. + * AskUser question routing + (gateway/connections/interaction-bridge.ts:193-214) — pending + questions live in a per-pod Map; button clicks can land on the + wrong pod and be dropped. + * Telegram polling mode + (gateway/connections/chat-instance-manager.ts:610-613) — every + replica long-polls the same bot, causing conflicts. + RWX is necessary but not sufficient. The flag forces operators to + acknowledge "I have only webhook-mode Chat connections AND no + AskUser/SSE flows in flight" before opting in. + */}} + {{- $rwxConfigured := has "ReadWriteMany" (.Values.app.workspaces.accessModes | default (list)) }} + {{- $rollSafe := and .Values.app.allowMultiReplica (or (not .Values.app.workspaces.enabled) $rwxConfigured) }} + {{- if .Values.app.strategy }} strategy: - {{- toYaml . | nindent 4 }} + {{- toYaml .Values.app.strategy | nindent 4 }} + {{- else if $rollSafe }} + # Operator opted in via app.allowMultiReplica (workspaces RWX or disabled). 
+ strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 {{- else }} - {{- if .Values.app.workspaces.enabled }} - # The workspaces PVC is RWO so we can't have two pods mount it at once; - # default to Recreate to avoid scheduling conflicts. Single-replica - # operators see a brief (~30s) "no available server" window on every - # deploy as a result. To eliminate it, set RWX storage on workspaces - # (or move workspaces off this deployment entirely) and override - # `app.strategy` to RollingUpdate via values. See PR #773 discussion. + # Safe default. RWO PVC + in-memory SSE/AskUser/Telegram-polling state + # make rolling overlap unsafe — see app.allowMultiReplica in values.yaml. strategy: type: Recreate {{- end }} - {{- end }} selector: matchLabels: {{- include "lobu.appSelectorLabels" . | nindent 6 }} diff --git a/charts/lobu/values.yaml b/charts/lobu/values.yaml index 3009cd977..0f713f2e5 100644 --- a/charts/lobu/values.yaml +++ b/charts/lobu/values.yaml @@ -64,8 +64,38 @@ app: env: NODE_ENV: production + # Opt in to multi-replica / rolling deploys. DEFAULT FALSE — leaving + # this off keeps the safe `strategy: Recreate` behavior. Setting it + # true selects `RollingUpdate` (maxSurge: 1, maxUnavailable: 0) when + # workspaces is RWX or disabled; an explicit `app.strategy` still wins. + # + # Prerequisites BEFORE setting true — the chart cannot fully verify these: + # 1. Workspaces volume must be RWX-capable: + # `app.workspaces.accessModes: [ReadWriteMany]` + a storage class + # that backs it (NFS, EFS, CephFS, Longhorn-RWX, …). + # 2. NO active Chat connections in `mode: "polling"` (Telegram). + # Multiple replicas long-polling the same bot conflict. Use + # webhook mode only — see + # gateway/connections/chat-instance-manager.ts:610. + # 3. Acknowledge that the gateway has in-memory state for SSE + # streams (gateway/services/sse-manager.ts) and AskUser + # questions (gateway/connections/interaction-bridge.ts:193). 
+ # A request whose SSE stream / AskUser click lands on a + # different replica than the one holding the state will be + # silently dropped. Migrating these to Postgres LISTEN/NOTIFY + + # durable storage is tracked as separate hardening work — until + # then, occasional dropped streams / button clicks are the cost + # of zero-downtime deploys on this configuration. + # + # Single-replica + RollingUpdate (replicaCount: 1, allowMultiReplica: true) + # still creates a brief overlap window where both pods are running. + # The same in-memory caveats apply during that window, just for a + # shorter span (~5-15s). + allowMultiReplica: false + # Persistent workspaces volume. Embedded agent workers store session and - # workspace state below /app/workspaces. + # workspace state below /app/workspaces (watcher run scratch, agent panel + # sessions, etc.). workspaces: enabled: true size: 20Gi diff --git a/packages/web b/packages/web index ce4495369..e1129f5cf 160000 --- a/packages/web +++ b/packages/web @@ -1 +1 @@ -Subproject commit ce4495369983b9919a1259d2da65da1a6315d888 +Subproject commit e1129f5cf99dba8ce639d5e9e0020e5539dfc584