diff --git a/charts/lobu/templates/prometheusrule.yaml b/charts/lobu/templates/prometheusrule.yaml index e6696dbd2..239fdee62 100644 --- a/charts/lobu/templates/prometheusrule.yaml +++ b/charts/lobu/templates/prometheusrule.yaml @@ -23,7 +23,7 @@ spec: # replicas down / never scraped). - alert: WatcherAutomationSilent expr: | - (sum(increase(lobu_scheduled_job_runs_total{job="watcher-automation",outcome="success"}[15m])) or on() vector(0)) == 0 + (sum(increase(lobu_scheduled_job_runs_total{task="watcher-automation",outcome="success"}[15m])) or on() vector(0)) == 0 for: 15m labels: severity: critical @@ -46,12 +46,12 @@ spec: # A critical scheduled job erroring repeatedly (e.g. the stale-run reaper). - alert: LobuScheduledJobFailing expr: | - sum(increase(lobu_scheduled_job_runs_total{job=~"watcher-automation|check-stalled-executions",outcome="error"}[15m])) by (job) > 0 + sum(increase(lobu_scheduled_job_runs_total{task=~"watcher-automation|check-stalled-executions",outcome="error"}[15m])) by (task) > 0 for: 15m labels: severity: warning service: lobu annotations: - summary: "Lobu scheduled job {{`{{ $labels.job }}`}} is failing" - description: "The {{`{{ $labels.job }}`}} cron task threw on every run over 15m." + summary: "Lobu scheduled job {{`{{ $labels.task }}`}} is failing" + description: "The {{`{{ $labels.task }}`}} cron task threw on every run over 15m." {{- end }} diff --git a/charts/lobu/templates/servicemonitor.yaml b/charts/lobu/templates/servicemonitor.yaml index 8c093bb76..cd5e72add 100644 --- a/charts/lobu/templates/servicemonitor.yaml +++ b/charts/lobu/templates/servicemonitor.yaml @@ -23,8 +23,11 @@ spec: matchLabels: {{- include "lobu.appSelectorLabels" . | nindent 6 }} endpoints: + # The Prometheus exporter lives on the gateway, which the unified server + # mounts under /lobu — so metrics are at /lobu/metrics, not /metrics + # (root /metrics 302-redirects to the SPA → scrape fails on Content-Type). 
- port: http - path: /metrics + path: {{ .Values.metrics.serviceMonitor.path }} interval: {{ .Values.metrics.serviceMonitor.interval }} scrapeTimeout: {{ .Values.metrics.serviceMonitor.scrapeTimeout }} {{- end }} diff --git a/charts/lobu/values.yaml b/charts/lobu/values.yaml index a4c9df6de..51d10d1ef 100644 --- a/charts/lobu/values.yaml +++ b/charts/lobu/values.yaml @@ -229,6 +229,9 @@ metrics: serviceMonitor: enabled: false # namespace: "" # defaults to the release namespace + # Metrics are exposed by the gateway, mounted under /lobu by the unified + # server — so the scrape path is /lobu/metrics (root /metrics redirects). + path: /lobu/metrics interval: 30s scrapeTimeout: 10s additionalLabels: {} # e.g. { release: kube-prometheus-stack } diff --git a/packages/server/src/gateway/metrics/prometheus.ts b/packages/server/src/gateway/metrics/prometheus.ts index 92a7598ee..d6bd3c562 100644 --- a/packages/server/src/gateway/metrics/prometheus.ts +++ b/packages/server/src/gateway/metrics/prometheus.ts @@ -72,7 +72,7 @@ function initializeMetrics() { // handled by rate()/increase(). registerMetric( "lobu_scheduled_job_runs_total", - "Scheduled (cron) task ticks by job name and outcome (success|error)", + "Scheduled (cron) task ticks by task name and outcome (success|error)", "counter" ); registerMetric( diff --git a/packages/server/src/scheduled/task-scheduler.ts b/packages/server/src/scheduled/task-scheduler.ts index 15063fd09..fbed69f6d 100644 --- a/packages/server/src/scheduled/task-scheduler.ts +++ b/packages/server/src/scheduled/task-scheduler.ts @@ -256,13 +256,16 @@ export class TaskScheduler { payload: data.payload, taskRunId: Number(job.id), }); + // Label is `task`, NOT `job`: Prometheus reserves `job` for the scrape + // target and (with the default honor_labels=false) renames a same-named metric label to `exported_job`, + // so selectors on job="<task name>" silently match nothing (this broke the watcher-automation alert).
incrementCounter('lobu_scheduled_job_runs_total', { - job: data.name, + task: data.name, outcome: 'success', }); } catch (err) { incrementCounter('lobu_scheduled_job_runs_total', { - job: data.name, + task: data.name, outcome: 'error', }); throw err;