diff --git a/assets/prometheus-k8s/prometheus-rule.yaml b/assets/prometheus-k8s/prometheus-rule.yaml index 85d43626f7..d4fdc1111d 100644 --- a/assets/prometheus-k8s/prometheus-rule.yaml +++ b/assets/prometheus-k8s/prometheus-rule.yaml @@ -166,11 +166,12 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. summary: Prometheus remote write is behind. expr: | + # Use the metric added in https://github.com/openshift/prometheus/pull/262 and related PRs. # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) - - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) ) > 120 diff --git a/jsonnet/utils/sanitize-rules.libsonnet b/jsonnet/utils/sanitize-rules.libsonnet index 8560190745..0b0b1b42ab 100644 --- a/jsonnet/utils/sanitize-rules.libsonnet +++ b/jsonnet/utils/sanitize-rules.libsonnet @@ -409,6 +409,17 @@ local patchedRules = [ labels: { severity: 'info', }, + expr: ||| + # Use the metric added in https://github.com/openshift/prometheus/pull/262 and related PRs. + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_queue_highest_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + - + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + > 120 + |||, }, ], },