From ffceadf7dd61dad210472e02b2299bb52e4aba05 Mon Sep 17 00:00:00 2001 From: Ayoub Mrini Date: Wed, 3 Sep 2025 11:07:05 +0200 Subject: [PATCH] chore(jsonnet): use prometheus_remote_storage_queue_highest_timestamp_seconds in PrometheusRemoteWriteBehind This metric was introduced in https://github.com/openshift/prometheus/pull/262 and related PRs. Dashboard expressions are not changed, since updating them may be more complex. Fixing the alert is more important and we can always revisit that if it causes confusion. On main, dashboards will be adjusted later once the jsonnet dependencies are updated. --- assets/prometheus-k8s/prometheus-rule.yaml | 5 +++-- jsonnet/utils/sanitize-rules.libsonnet | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/assets/prometheus-k8s/prometheus-rule.yaml b/assets/prometheus-k8s/prometheus-rule.yaml index 85d43626f7..d4fdc1111d 100644 --- a/assets/prometheus-k8s/prometheus-rule.yaml +++ b/assets/prometheus-k8s/prometheus-rule.yaml @@ -166,11 +166,12 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. summary: Prometheus remote write is behind. expr: | + # Use the metric added in https://github.com/openshift/prometheus/pull/262 and related PRs. # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) - - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) ) > 120 diff --git a/jsonnet/utils/sanitize-rules.libsonnet b/jsonnet/utils/sanitize-rules.libsonnet index 8560190745..0b0b1b42ab 100644 --- a/jsonnet/utils/sanitize-rules.libsonnet +++ b/jsonnet/utils/sanitize-rules.libsonnet @@ -409,6 +409,17 @@ local patchedRules = [ labels: { severity: 'info', }, + expr: ||| + # Use the metric added in https://github.com/openshift/prometheus/pull/262 and related PRs. + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_queue_highest_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + - + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + > 120 + |||, }, ], },