Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions charts/lobu/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,19 @@ spec:
- name: PUBLIC_WEB_URL
value: {{ printf "https://%s" (first .Values.ingress.hosts) | quote }}
{{- end }}
{{- $workerSmoke := .Values.releaseGates.smokeTest.workerSmoke | default dict }}
{{- if and $workerSmoke (hasKey $workerSmoke "enabled") $workerSmoke.enabled }}
# Rendered only when releaseGates.smokeTest.workerSmoke.enabled is
# truthy; this condition does not look at releaseGates.smokeTest.enabled.
#
# Pin SMOKE_TEST_ALLOWED_HOST to the in-cluster app Service DNS
# name so /api/internal/smoke/dispatch refuses any request
# whose Host header is not the cluster-internal service. The
# smoke Job hits this exact hostname via its curl URL; public
# ingress traffic always carries the operator's external host
# in Host, so this is the second layer of ingress-bypass
# defense (the first is the x-forwarded-* refusal in the
# route handler).
#
# NOTE(review): the smoke Job's curl URL appends `:<service.port>` to
# this hostname, so the Host header it sends may include the port.
# Confirm the route handler ignores the port when comparing.
- name: SMOKE_TEST_ALLOWED_HOST
value: {{ printf "%s-app" (include "lobu.fullname" .) | quote }}
{{- end }}
{{- if .Values.embeddings.service.port }}
{{- if .Values.embeddings.serviceUrl }}
- name: EMBEDDINGS_SERVICE_URL
Expand Down
132 changes: 131 additions & 1 deletion charts/lobu/templates/smoke-test-job.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
{{- if .Values.releaseGates.smokeTest.enabled }}
{{- /*
workerSmoke settings from values; falls back to an empty dict so the
lookups below work when the stanza is absent.
*/ -}}
{{- $workerSmoke := .Values.releaseGates.smokeTest.workerSmoke | default dict -}}
{{- /* True only when workerSmoke carries an explicit, truthy `enabled` key. */ -}}
{{- $workerSmokeEnabled := and $workerSmoke (hasKey $workerSmoke "enabled") $workerSmoke.enabled -}}
{{- /* Phase 3 polling budget in seconds; 90 when timeoutSeconds is unset or empty. */ -}}
{{- $workerSmokeTimeout := default 90 $workerSmoke.timeoutSeconds | int -}}
{{- /*
Base smoke-test timeout plus the phase 3 budget (0 when phase 3 is off).
Used below to derive the Job's activeDeadlineSeconds.
*/ -}}
{{- $totalTimeout := add (.Values.releaseGates.smokeTest.timeoutSeconds | int) (ternary $workerSmokeTimeout 0 $workerSmokeEnabled) -}}
apiVersion: batch/v1
kind: Job
metadata:
Expand All @@ -11,7 +15,7 @@ metadata:
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: 0
activeDeadlineSeconds: {{ add (.Values.releaseGates.smokeTest.timeoutSeconds | int) 30 }}
activeDeadlineSeconds: {{ add $totalTimeout 30 }}
template:
metadata:
labels:
Expand All @@ -27,13 +31,47 @@ spec:
- name: smoke-test
image: {{ include "lobu.appImage" . }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
{{- if $workerSmokeEnabled }}
{{- $secretName := include "lobu.secretName" . }}
{{- if $secretName }}
envFrom:
- secretRef:
# Rendered only when workerSmoke is enabled and the chart resolves
# a Secret name. Phase 3 needs SMOKE_TEST_TOKEN from the deployment
# Secret to authenticate against /api/internal/smoke/dispatch; the
# script exits 1 if that variable is missing.
# DATABASE_URL also comes from the same Secret on production
# installs (the chart-managed secrets.create path additionally
# exposes it via the inline `env` block below).
name: {{ $secretName }}
{{- end }}
{{- end }}
env:
{{- if and .Values.secrets.create (hasKey .Values.secrets.stringData "DATABASE_URL") }}
- name: DATABASE_URL
value: {{ index .Values.secrets.stringData "DATABASE_URL" | quote }}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
{{- end }}
- name: REQUIRED_SCHEMA
value: {{ join "," (default (list) .Values.releaseGates.smokeTest.requiredSchema) | quote }}
{{- if $workerSmokeEnabled }}
# Presence of WORKER_SMOKE_ENABLED is what makes the script run phase 3;
# when it is unset the script exits 0 after the earlier phases.
- name: WORKER_SMOKE_ENABLED
value: "1"
# NOTE: the smoke agentId and organizationId are NOT passed
# to the dispatch endpoint — the gateway pins them
# server-side from SMOKE_TEST_AGENT_ID / SMOKE_TEST_ORG_ID
# in the deployment Secret so a leaked SMOKE_TEST_TOKEN
# cannot target a real tenant. The variables below only
# shape the conversation id (prefix + release name +
# revision) and the polling loop (timeout + interval);
# none of them identifies an agent or organization.
- name: WORKER_SMOKE_CONV_PREFIX
value: {{ default "smoke-" $workerSmoke.conversationIdPrefix | quote }}
- name: WORKER_SMOKE_TIMEOUT
value: {{ $workerSmokeTimeout | quote }}
- name: WORKER_SMOKE_INTERVAL
value: {{ default 3 $workerSmoke.intervalSeconds | int | quote }}
- name: WORKER_SMOKE_RELEASE
value: {{ .Release.Name | quote }}
- name: WORKER_SMOKE_REVISION
value: {{ .Release.Revision | quote }}
{{- end }}
command:
- /bin/bash
- -ec
Expand Down Expand Up @@ -121,4 +159,96 @@ spec:
await sql.end({ timeout: 1 }).catch(() => {});
}
NODE

# Phase 3 (worker smoke) is opt-in: WORKER_SMOKE_ENABLED is only set on the
# container when the chart's workerSmoke stanza is enabled.
[ -n "${WORKER_SMOKE_ENABLED:-}" ] || {
  echo "workerSmoke disabled — skipping phase 3"
  exit 0
}

# The dispatch request is bearer-authenticated, so there is nothing useful
# to attempt without a token; fail the Job instead.
[ -n "${SMOKE_TEST_TOKEN:-}" ] || {
  echo "SMOKE_TEST_TOKEN env not set — cannot run worker smoke" >&2
  exit 1
}

# Phase 3a: ask the gateway to enqueue a synthetic run.
#
# The conversation id is scoped to this release name and revision, so every
# helm install/upgrade dispatches under its own id within the prefix.
conv_id="${WORKER_SMOKE_CONV_PREFIX}${WORKER_SMOKE_RELEASE}-${WORKER_SMOKE_REVISION}"
dispatch_url="http://{{ include "lobu.fullname" . }}-app:{{ .Values.service.port }}/api/internal/smoke/dispatch"
echo "phase 3: dispatching smoke run to $dispatch_url (conversation_id=$conv_id)"

# curl prints the -w status even when the transfer fails ("000" when no
# response was received), so echoing a fallback would be appended to curl's
# own output and garble the status. Capture the exit code separately
# instead; this also catches a transfer that died after the status line.
# The response body is written to /tmp/lobu-smoke-dispatch for later parsing.
curl_rc=0
http_status=$(curl -sS --max-time 10 -o /tmp/lobu-smoke-dispatch -w '%{http_code}' \
  -H "Authorization: Bearer ${SMOKE_TEST_TOKEN}" \
  -H 'Content-Type: application/json' \
  -X POST "$dispatch_url" \
  --data "$(printf '{"conversationId":"%s","messageText":"smoke ping"}' \
    "${conv_id}")") || curl_rc=$?

if [ "${curl_rc}" -ne 0 ]; then
  echo "smoke dispatch curl failed with exit code ${curl_rc}" >&2
fi

if [ "${curl_rc}" -ne 0 ] || [ "${http_status}" != "200" ]; then
  echo "smoke dispatch returned HTTP ${http_status}" >&2
  # The body file may not exist if curl never connected.
  cat /tmp/lobu-smoke-dispatch >&2 || true
  exit 1
fi

# Phase 3b: pull the numeric runId out of the dispatch response.
#
# The script runs under `bash -e`, where a failing command substitution in
# a plain assignment aborts the script on the spot. The `|| run_id=""`
# keeps control here so a malformed response reaches the diagnostic below
# (which also dumps the response body) instead of exiting silently.
run_id=$(node --input-type=module <<'NODE'
import fs from "node:fs";
try {
  const body = JSON.parse(fs.readFileSync("/tmp/lobu-smoke-dispatch", "utf8"));
  // Accept only a positive numeric runId; anything else is a failed dispatch.
  if (typeof body.runId === "number" && body.runId > 0) {
    process.stdout.write(String(body.runId));
  } else {
    process.exit(2);
  }
} catch {
  // Unreadable file or invalid JSON.
  process.exit(3);
}
NODE
) || run_id=""

if [ -z "${run_id}" ]; then
  echo "smoke dispatch did not return a runId" >&2
  cat /tmp/lobu-smoke-dispatch >&2 || true
  exit 1
fi
echo "smoke dispatch enqueued run_id=${run_id}, polling for completion"

# Phase 3c: poll the snapshot table until the run reports a terminal status
# or the WORKER_SMOKE_TIMEOUT budget (seconds) is spent.
deadline=$((SECONDS + WORKER_SMOKE_TIMEOUT))
while true; do
  # Under `bash -e` a failing command substitution would abort the Job
  # without any message. Treat a failed poll as "no row yet" so a transient
  # database error is retried until the deadline rather than failing the
  # release outright; node's own error output still reaches stderr.
  terminal=$(WORKER_SMOKE_RUN_ID="${run_id}" node --input-type=module <<'NODE'
import postgres from "postgres";
const sql = postgres(process.env.DATABASE_URL, {
  max: 1,
  connect_timeout: 10,
  idle_timeout: 1,
  onnotice: () => {},
});
try {
  const rows = await sql`
    SELECT terminal_status
    FROM public.agent_transcript_snapshot
    WHERE run_id = ${Number(process.env.WORKER_SMOKE_RUN_ID)}
    LIMIT 1
  `;
  // Print nothing when the row is missing or its status is empty.
  if (rows.length > 0) {
    process.stdout.write(String(rows[0].terminal_status || ""));
  }
} finally {
  await sql.end({ timeout: 1 }).catch(() => {});
}
NODE
  ) || {
    echo "worker smoke poll query failed; retrying until the deadline" >&2
    terminal=""
  }

  if [ "${terminal}" = "completed" ]; then
    echo "worker smoke run completed (run_id=${run_id})"
    exit 0
  fi

  # Any other non-empty status means the run ended without completing.
  if [ -n "${terminal}" ]; then
    echo "worker smoke run terminated with status='${terminal}' (run_id=${run_id})" >&2
    exit 1
  fi

  if [ "$SECONDS" -ge "$deadline" ]; then
    echo "worker smoke timed out after ${WORKER_SMOKE_TIMEOUT}s waiting for run_id=${run_id}" >&2
    exit 1
  fi

  sleep "${WORKER_SMOKE_INTERVAL}"
done
{{- end }}
42 changes: 42 additions & 0 deletions charts/lobu/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,45 @@ releaseGates:
- device_workers.organization_id
- connections.device_worker_id
- connections.organization_id

# Phase 3 of the smoke job — drive an actual worker run end-to-end.
#
# When enabled, the Job POSTs to the internal /api/internal/smoke/dispatch
# endpoint, which inserts a synthetic chat_message run. The runs-queue
# MessageConsumer in the app pod claims it, spawns a worker subprocess,
# the worker runs, and on terminal cleanup writes a row into
# `agent_transcript_snapshot`. The Job polls that row and fails the
# deploy if `terminal_status='completed'` doesn't materialise inside
# `workerSmoke.timeoutSeconds`. This makes the recurring class of
# "gateway boots fine but workers can't process a single message"
# regressions un-shippable.
#
# Default OFF: the chart cannot preprovision the synthetic agent for
# you. Operators MUST add three keys to the deployment Secret before
# enabling, AND preprovision a matching agent row:
#
# 1. Generate a random token (≥32 chars) and add to the Secret:
# SMOKE_TEST_TOKEN=<random>
# SMOKE_TEST_AGENT_ID=<your-smoke-agent-id>
# SMOKE_TEST_ORG_ID=<your-smoke-org-id>
# The gateway PINS the smoke agentId + organizationId from the
# env at dispatch time — caller-supplied values are ignored.
# This makes it structurally impossible for a leaked
# SMOKE_TEST_TOKEN to dispatch a synthetic run against a real
# tenant's agent.
# 2. Preprovision the synthetic agent. The simplest path is
# `lobu apply` against a dedicated "smoke" org/agent project
# whose only agent.id matches `SMOKE_TEST_AGENT_ID`.
# 3. Bump this stanza to `enabled: true` and roll the chart.
workerSmoke:
# Master switch for phase 3. When true the smoke Job runs the dispatch +
# poll steps and the app deployment is given SMOKE_TEST_ALLOWED_HOST.
enabled: false
# Prefix of the conversation id sent to the dispatch endpoint. The
# smoke Job appends `<release name>-<revision>` so each helm
# upgrade gets its own conversation id within this prefix.
conversationIdPrefix: "smoke-"
# How long to wait for the snapshot row to appear with
# `terminal_status='completed'`. Worker spawn + LLM round-trip +
# snapshot POST is ~10-30s on the prod cluster; 90s leaves
# headroom for a cold image pull on the worker subprocess.
# This budget is added on top of the smoke test's own timeoutSeconds
# when the Job's activeDeadlineSeconds is computed.
timeoutSeconds: 90
# DB poll interval inside the Job, in seconds (the script sleeps this
# long between queries).
intervalSeconds: 3
Loading
Loading