lobu-ai · buremba · May 18, 2026 · May 18, 2026
diff --git a/charts/lobu/templates/deployment.yaml b/charts/lobu/templates/deployment.yaml
@@ -103,6 +103,19 @@ spec:
             - name: PUBLIC_WEB_URL
               value: {{ printf "https://%s" (first .Values.ingress.hosts) | quote }}
             {{- end }}
+            {{- $workerSmoke := default dict .Values.releaseGates.smokeTest.workerSmoke }}
+            {{- if and $workerSmoke (hasKey $workerSmoke "enabled") $workerSmoke.enabled }}
+            # Worker smoke only: pin SMOKE_TEST_ALLOWED_HOST to the in-cluster
+            # app Service name ("<fullname>-app"), the hostname the smoke Job
+            # curls. By design /api/internal/smoke/dispatch refuses any other
+            # Host; public ingress traffic carries the operator's external
+            # host, so this is the second layer of ingress-bypass defense
+            # (the first is the x-forwarded-* refusal in the route handler).
+            # NOTE(review): the Job URL includes an explicit port — confirm
+            # the handler compares the hostname without the port.
+            - name: SMOKE_TEST_ALLOWED_HOST
+              value: {{ include "lobu.fullname" . | printf "%s-app" | quote }}
+            {{- end }}
             {{- if .Values.embeddings.service.port }}
             {{- if .Values.embeddings.serviceUrl }}
             - name: EMBEDDINGS_SERVICE_URL

diff --git a/charts/lobu/templates/smoke-test-job.yaml b/charts/lobu/templates/smoke-test-job.yaml
@@ -1,4 +1,8 @@
 {{- if .Values.releaseGates.smokeTest.enabled }}
+{{- $workerSmoke := .Values.releaseGates.smokeTest.workerSmoke | default dict -}}{{- /* absent stanza => empty dict */ -}}
+{{- $workerSmokeEnabled := not (empty (and $workerSmoke (hasKey $workerSmoke "enabled") $workerSmoke.enabled)) -}}{{- /* always a real bool: sprig ternary rejects non-bool tests */ -}}
+{{- $workerSmokeTimeout := default 90 $workerSmoke.timeoutSeconds | int -}}{{- /* phase-3 wait budget in seconds */ -}}
+{{- $totalTimeout := add (.Values.releaseGates.smokeTest.timeoutSeconds | int) (ternary $workerSmokeTimeout 0 $workerSmokeEnabled) -}}{{- /* base smoke timeout plus the phase-3 budget when enabled */ -}}
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -11,7 +15,7 @@ metadata:
     "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
 spec:
   backoffLimit: 0
-  activeDeadlineSeconds: {{ add (.Values.releaseGates.smokeTest.timeoutSeconds | int) 30 }}
+  activeDeadlineSeconds: {{ add 30 $totalTimeout }}  # total smoke budget plus a fixed 30s margin
   template:
     metadata:
       labels:
@@ -27,13 +31,47 @@ spec:
         - name: smoke-test
           image: {{ include "lobu.appImage" . }}
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          {{- if $workerSmokeEnabled }}
+          {{- with include "lobu.secretName" . }}
+          # Phase 3 reads SMOKE_TEST_TOKEN from the environment to call
+          # /api/internal/smoke/dispatch, so import the deployment Secret
+          # wholesale. The Secret is also described as the source of
+          # DATABASE_URL on production installs; chart-managed installs
+          # (secrets.create) additionally set it inline under `env` below.
+          # Skipped when the helper yields no Secret name.
+          envFrom:
+            - secretRef:
+                name: {{ . }}
+          {{- end }}
+          {{- end }}
           env:
             {{- if and .Values.secrets.create (hasKey .Values.secrets.stringData "DATABASE_URL") }}
             - name: DATABASE_URL
               value: {{ index .Values.secrets.stringData "DATABASE_URL" | quote }}
             {{- end }}
             - name: REQUIRED_SCHEMA
               value: {{ join "," (default (list) .Values.releaseGates.smokeTest.requiredSchema) | quote }}
+            {{- if $workerSmokeEnabled }}
+            # Opt the Job script into phase 3 (it checks for a non-empty value).
+            - name: WORKER_SMOKE_ENABLED
+              value: "1"
+            # NOTE: agentId / organizationId are deliberately absent here.
+            # Per the dispatch design the gateway pins them server-side from
+            # SMOKE_TEST_AGENT_ID / SMOKE_TEST_ORG_ID in the deployment
+            # Secret, so a leaked SMOKE_TEST_TOKEN cannot target a real
+            # tenant. The WORKER_SMOKE_* values below only shape the Job's
+            # own request and polling: conversation id, timeout, interval.
+            - name: WORKER_SMOKE_CONV_PREFIX
+              value: {{ $workerSmoke.conversationIdPrefix | default "smoke-" | quote }}
+            - name: WORKER_SMOKE_TIMEOUT
+              value: {{ quote $workerSmokeTimeout }}
+            - name: WORKER_SMOKE_INTERVAL
+              value: {{ $workerSmoke.intervalSeconds | default 3 | int | quote }}
+            - name: WORKER_SMOKE_RELEASE
+              value: {{ quote .Release.Name }}
+            - name: WORKER_SMOKE_REVISION
+              value: {{ quote .Release.Revision }}
+            {{- end }}
           command:
             - /bin/bash
             - -ec
@@ -121,4 +159,96 @@ spec:
                 await sql.end({ timeout: 1 }).catch(() => {});
               }
               NODE
+
+              if [ -z "${WORKER_SMOKE_ENABLED:-}" ]; then
+                echo "workerSmoke disabled — skipping phase 3"
+                exit 0
+              fi
+              if [ -z "${SMOKE_TEST_TOKEN:-}" ]; then
+                echo "SMOKE_TEST_TOKEN env not set — cannot run worker smoke" >&2
+                exit 1
+              fi
+
+              conv_id="${WORKER_SMOKE_CONV_PREFIX}${WORKER_SMOKE_RELEASE}-${WORKER_SMOKE_REVISION}"
+              dispatch_url="http://{{ include "lobu.fullname" . }}-app:{{ .Values.service.port }}/api/internal/smoke/dispatch"
+              echo "phase 3: dispatching smoke run to $dispatch_url (conversation_id=$conv_id)"
+
+              # curl prints %{http_code} even when the transfer fails, so a fallback echoed
+              # inside $(...) would yield "000000"; force "000" from outside instead.
+              http_status=$(curl -sS --max-time 10 -o /tmp/lobu-smoke-dispatch -w '%{http_code}' \
+                -H "Authorization: Bearer ${SMOKE_TEST_TOKEN}" \
+                -H 'Content-Type: application/json' \
+                -X POST "$dispatch_url" \
+                --data "$(printf '{"conversationId":"%s","messageText":"smoke ping"}' "${conv_id}")" \
+              ) || http_status="000"
+              if [ "${http_status}" != "200" ]; then
+                echo "smoke dispatch returned HTTP ${http_status}" >&2
+                cat /tmp/lobu-smoke-dispatch >&2 || true
+                exit 1
+              fi
+
+              # Under `bash -e` a failing substitution would end the script before the
+              # diagnostic below runs, so map any parser failure to an empty run_id.
+              run_id=$(node --input-type=module <<'NODE'
+              import fs from "node:fs";
+              let runId;
+              try {
+                runId = JSON.parse(fs.readFileSync("/tmp/lobu-smoke-dispatch", "utf8")).runId;
+              } catch {
+                process.exit(3);
+              }
+              if (typeof runId !== "number" || !(runId > 0)) process.exit(2);
+              process.stdout.write(String(runId));
+              NODE
+              ) || run_id=""
+              if [ -z "${run_id}" ]; then
+                echo "smoke dispatch did not return a runId" >&2
+                cat /tmp/lobu-smoke-dispatch >&2 || true
+                exit 1
+              fi
+              echo "smoke dispatch enqueued run_id=${run_id}, polling for completion"
+
+              # Poll the run's snapshot row until it reports a terminal status or the
+              # deadline passes; a failed query is logged and retried, not fatal.
+              deadline=$((SECONDS + WORKER_SMOKE_TIMEOUT))
+              while true; do
+                terminal=$(WORKER_SMOKE_RUN_ID="${run_id}" node --input-type=module <<'NODE'
+              import postgres from "postgres";
+              const sql = postgres(process.env.DATABASE_URL, {
+                max: 1,
+                connect_timeout: 10,
+                idle_timeout: 1,
+                onnotice: () => {},
+              });
+              try {
+                const rows = await sql`
+                  SELECT terminal_status
+                  FROM public.agent_transcript_snapshot
+                  WHERE run_id = ${Number(process.env.WORKER_SMOKE_RUN_ID)}
+                  LIMIT 1
+                `;
+                if (rows.length > 0) {
+                  process.stdout.write(String(rows[0].terminal_status || ""));
+                }
+              } finally {
+                await sql.end({ timeout: 1 }).catch(() => {});
+              }
+              NODE
+                ) || echo "worker smoke poll query failed — retrying until the deadline" >&2
+                if [ "${terminal}" = "completed" ]; then
+                  echo "worker smoke run completed (run_id=${run_id})"
+                  exit 0
+                fi
+                # Any other non-empty status is a terminal failure; empty means no row yet.
+                if [ -n "${terminal}" ]; then
+                  echo "worker smoke run terminated with status='${terminal}' (run_id=${run_id})" >&2
+                  exit 1
+                fi
+
+                if [ "$SECONDS" -ge "$deadline" ]; then
+                  echo "worker smoke timed out after ${WORKER_SMOKE_TIMEOUT}s waiting for run_id=${run_id}" >&2
+                  exit 1
+                fi
+                sleep "${WORKER_SMOKE_INTERVAL}"
+              done
 {{- end }}
diff --git a/charts/lobu/values.yaml b/charts/lobu/values.yaml
@@ -260,3 +260,45 @@ releaseGates:
       - device_workers.organization_id
       - connections.device_worker_id
       - connections.organization_id
+
+    # Phase 3 of the smoke job — drive an actual worker run end-to-end.
+    #
+    # When enabled, the Job POSTs to the internal /api/internal/smoke/dispatch
+    # endpoint, which inserts a synthetic chat_message run. The runs-queue
+    # MessageConsumer in the app pod claims it, spawns a worker subprocess,
+    # the worker runs, and on terminal cleanup writes a row into
+    # `agent_transcript_snapshot`. The Job polls that row and fails the
+    # deploy if `terminal_status='completed'` doesn't materialise inside
+    # `workerSmoke.timeoutSeconds`. This makes the recurring class of
+    # "gateway boots fine but workers can't process a single message"
+    # regressions un-shippable.
+    #
+    # Default OFF: the chart cannot preprovision the synthetic agent for
+    # you. Operators MUST add three keys to the deployment Secret before
+    # enabling, AND preprovision a matching agent row:
+    #
+    #   1. Generate a random token (≥32 chars) and add to the Secret:
+    #        SMOKE_TEST_TOKEN=<random>
+    #        SMOKE_TEST_AGENT_ID=<your-smoke-agent-id>
+    #        SMOKE_TEST_ORG_ID=<your-smoke-org-id>
+    #      The gateway PINS the smoke agentId + organizationId from the
+    #      env at dispatch time — caller-supplied values are ignored.
+    #      This makes it structurally impossible for a leaked
+    #      SMOKE_TEST_TOKEN to dispatch a synthetic run against a real
+    #      tenant's agent.
+    #   2. Preprovision the synthetic agent. The simplest path is
+    #      `lobu apply` against a dedicated "smoke" org/agent project
+    #      whose only agent.id matches `SMOKE_TEST_AGENT_ID`.
+    #   3. Bump this stanza to `enabled: true` and roll the chart.
+    workerSmoke:
+      enabled: false
+      # Conversation id prefix. The smoke Job appends
+      # "<release name>-<revision>", so each release revision gets its own id.
+      conversationIdPrefix: "smoke-"
+      # Seconds to wait for the snapshot row to report
+      # `terminal_status='completed'`; ~10-30s is typical on the prod
+      # cluster, 90s leaves headroom for a cold image pull. Added on top
+      # of smokeTest.timeoutSeconds for the Job's activeDeadlineSeconds.
+      timeoutSeconds: 90
+      # Seconds between DB polls inside the Job.
+      intervalSeconds: 3