Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions charts/lobu/templates/prometheusrule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
# replicas down / never scraped).
- alert: WatcherAutomationSilent
expr: |
(sum(increase(lobu_scheduled_job_runs_total{job="watcher-automation",outcome="success"}[15m])) or on() vector(0)) == 0
(sum(increase(lobu_scheduled_job_runs_total{task="watcher-automation",outcome="success"}[15m])) or on() vector(0)) == 0
for: 15m
labels:
severity: critical
Expand All @@ -46,12 +46,12 @@ spec:
# A critical scheduled job erroring repeatedly (e.g. the stale-run reaper).
- alert: LobuScheduledJobFailing
expr: |
sum(increase(lobu_scheduled_job_runs_total{job=~"watcher-automation|check-stalled-executions",outcome="error"}[15m])) by (job) > 0
sum(increase(lobu_scheduled_job_runs_total{task=~"watcher-automation|check-stalled-executions",outcome="error"}[15m])) by (task) > 0
for: 15m
labels:
severity: warning
service: lobu
annotations:
summary: "Lobu scheduled job {{`{{ $labels.job }}`}} is failing"
description: "The {{`{{ $labels.job }}`}} cron task threw on every run over 15m."
summary: "Lobu scheduled job {{`{{ $labels.task }}`}} is failing"
description: "The {{`{{ $labels.task }}`}} cron task threw on every run over 15m."
{{- end }}
5 changes: 4 additions & 1 deletion charts/lobu/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ spec:
matchLabels:
{{- include "lobu.appSelectorLabels" . | nindent 6 }}
endpoints:
# The Prometheus exporter lives on the gateway, which the unified server
# mounts under /lobu — so metrics are at /lobu/metrics, not /metrics
# (root /metrics 302-redirects to the SPA → scrape fails on Content-Type).
- port: http
path: /metrics
path: {{ .Values.metrics.serviceMonitor.path }}
interval: {{ .Values.metrics.serviceMonitor.interval }}
scrapeTimeout: {{ .Values.metrics.serviceMonitor.scrapeTimeout }}
{{- end }}
3 changes: 3 additions & 0 deletions charts/lobu/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ metrics:
serviceMonitor:
enabled: false
# namespace: "" # defaults to the release namespace
# Metrics are exposed by the gateway, mounted under /lobu by the unified
# server — so the scrape path is /lobu/metrics (root /metrics redirects).
path: /lobu/metrics
interval: 30s
scrapeTimeout: 10s
additionalLabels: {} # e.g. { release: kube-prometheus-stack }
Expand Down
2 changes: 1 addition & 1 deletion packages/server/src/gateway/metrics/prometheus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ function initializeMetrics() {
// handled by rate()/increase().
registerMetric(
"lobu_scheduled_job_runs_total",
"Scheduled (cron) task ticks by job name and outcome (success|error)",
"Scheduled (cron) task ticks by task name and outcome (success|error)",
"counter"
);
registerMetric(
Expand Down
7 changes: 5 additions & 2 deletions packages/server/src/scheduled/task-scheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -256,13 +256,16 @@ export class TaskScheduler {
payload: data.payload,
taskRunId: Number(job.id),
});
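// Label is `task`, NOT `job`: Prometheus reserves `job` for the scrape
// target. With the default `honor_labels: false`, a metric's own `job` label
// is renamed to `exported_job` at scrape time, so selectors such as
// `job="watcher-automation"` silently match nothing (which is what broke the
// watcher-automation alert).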
incrementCounter('lobu_scheduled_job_runs_total', {
job: data.name,
task: data.name,
outcome: 'success',
});
} catch (err) {
incrementCounter('lobu_scheduled_job_runs_total', {
job: data.name,
task: data.name,
outcome: 'error',
});
throw err;
Expand Down
Loading