From 65c22e68a98c2538dd9e24bbf4da59a2d8ab5f2c Mon Sep 17 00:00:00 2001 From: R-Lawton Date: Tue, 30 Apr 2024 21:39:58 +0100 Subject: [PATCH 1/6] availability and latency slos --- examples/alerts/availability_test.yaml | 44 ++++++ examples/alerts/latency_test.yaml | 41 +++++ examples/alerts/slo-availability.yaml | 197 ++++++++++++++++++++++++ examples/alerts/slo-latency.yaml | 182 ++++++++++++++++++++++ examples/alerts/sloth/availability.yaml | 26 ++++ examples/alerts/sloth/latency.yaml | 25 +++ 6 files changed, 515 insertions(+) create mode 100644 examples/alerts/availability_test.yaml create mode 100644 examples/alerts/latency_test.yaml create mode 100644 examples/alerts/slo-availability.yaml create mode 100644 examples/alerts/slo-latency.yaml create mode 100644 examples/alerts/sloth/availability.yaml create mode 100644 examples/alerts/sloth/latency.yaml diff --git a/examples/alerts/availability_test.yaml b/examples/alerts/availability_test.yaml new file mode 100644 index 000000000..a88dcb658 --- /dev/null +++ b/examples/alerts/availability_test.yaml @@ -0,0 +1,44 @@ +rule_files: + - slo-availability.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: istio_requests_total{job="ingress-metrics-proxy",response_code="500"} + values: "0+0x30 0+10x30" + - series: istio_requests_total{job="ingress-metrics-proxy"} + values: "0+1x30 31+90x30" + alert_rule_test: + - eval_time: 30m + alertname: KuadrantAvailabilityHighErrorRate + exp_alerts: [] + - eval_time: 60m + alertname: KuadrantAvailabilityHighErrorRate + exp_alerts: + - exp_labels: + alertname: KuadrantAvailabilityHighErrorRate + category: availability + owner: kuadrant-org + severity: critical + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_severity: page + sloth_slo: requests-availability + exp_annotations: + summary: High error rate on HTTPRoute requests responses + title: (page) kuadrant requests-availability SLO error budget burn rate is too fast. + - exp_labels: + alertname: KuadrantAvailabilityHighErrorRate + category: availability + owner: kuadrant-org + severity: warning + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_severity: ticket + sloth_slo: requests-availability + exp_annotations: + summary: High error rate on HTTPRoute requests responses + title: (ticket) kuadrant requests-availability SLO error budget burn rate is too fast. 
+ \ No newline at end of file diff --git a/examples/alerts/latency_test.yaml b/examples/alerts/latency_test.yaml new file mode 100644 index 000000000..904387ec7 --- /dev/null +++ b/examples/alerts/latency_test.yaml @@ -0,0 +1,41 @@ +rule_files: + - slo-latency.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: istio_request_duration_milliseconds_bucket{le="250",job="ingress-metrics-proxy",response_code="200"} + values: "0+0x30 31+10x30" + - series: istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"} + values: "0+1x30 31+90x30" + alert_rule_test: + - eval_time: 30m + alertname: KuadrantlatencyHighErrorRate + exp_alerts: [] + - eval_time: 60m + alertname: KuadrantlatencyHighErrorRate + exp_alerts: + - exp_labels: + alertname: KuadrantlatencyHighErrorRate + category: latency + severity: critical + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_severity: page + sloth_slo: kuadrant-requests-latency + exp_annotations: + summary: High latency on HTTPRoute requests responses + title: (page) kuadrant kuadrant-requests-latency SLO error budget burn rate is too fast. + - exp_labels: + alertname: KuadrantlatencyHighErrorRate + category: latency + severity: warning + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_severity: ticket + sloth_slo: kuadrant-requests-latency + exp_annotations: + summary: High latency on HTTPRoute requests responses + title: (ticket) kuadrant kuadrant-requests-latency SLO error budget burn rate is too fast. \ No newline at end of file diff --git a/examples/alerts/slo-availability.yaml b/examples/alerts/slo-availability.yaml new file mode 100644 index 000000000..f00c0245a --- /dev/null +++ b/examples/alerts/slo-availability.yaml @@ -0,0 +1,197 @@ +groups: +- name: sloth-slo-sli-recordings-kuadrant-requests-availability + rules: + - record: slo:sli_error:ratio_rate5m + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[5m])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[5m]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 5m + - record: slo:sli_error:ratio_rate30m + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[30m])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[30m]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 30m + - record: slo:sli_error:ratio_rate1h + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1h])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1h]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 1h + - record: slo:sli_error:ratio_rate2h + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[2h])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[2h]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 2h + - record: 
slo:sli_error:ratio_rate6h + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[6h])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[6h]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 6h + - record: slo:sli_error:ratio_rate1d + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1d])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1d]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 1d + - record: slo:sli_error:ratio_rate3d + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[3d])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[3d]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 3d + - record: slo:sli_error:ratio_rate30d + expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[30d]) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 30d +- name: sloth-slo-meta-recordings-kuadrant-requests-availability + rules: + - record: slo:objective:ratio + expr: vector(0.9995) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:error_budget:ratio + expr: vector(1-0.9995) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:time_period:days + expr: vector(30) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:current_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:period_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate30d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:period_error_budget_remaining:ratio + expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-availability", + sloth_service="kuadrant", 
sloth_slo="requests-availability"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: sloth_slo_info + expr: vector(1) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_mode: cli-gen-prom + sloth_objective: "99.95" + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 +- name: sloth-slo-alerts-kuadrant-requests-availability + rules: + - alert: KuadrantAvailabilityHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (14.4 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (14.4 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (6 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (6 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: availability + severity: critical + sloth_severity: page + annotations: + summary: High error rate on HTTPRoute requests responses + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. + - alert: KuadrantAvailabilityHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (3 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (3 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (1 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (1 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: availability + severity: warning + sloth_severity: ticket + annotations: + summary: High error rate on HTTPRoute requests responses + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. 
\ No newline at end of file diff --git a/examples/alerts/slo-latency.yaml b/examples/alerts/slo-latency.yaml new file mode 100644 index 000000000..bed4005c8 --- /dev/null +++ b/examples/alerts/slo-latency.yaml @@ -0,0 +1,182 @@ +groups: +- name: sloth-slo-sli-recordings-kuadrant-kuadrant-requests-latency + rules: + - record: slo:sli_error:ratio_rate5m + expr: | + (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[5m]) )by (request_host)) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[5m]))by (request_host)) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 5m + - record: slo:sli_error:ratio_rate30m + expr: | + (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[30m]) )by (request_host)) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[30m]))by (request_host)) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 30m + - record: slo:sli_error:ratio_rate1h + expr: | + (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1h]) )by (request_host)) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1h]))by (request_host)) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 1h + - record: slo:sli_error:ratio_rate2h + expr: | + (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[2h]) )by (request_host)) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[2h]))by (request_host)) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 2h + - record: slo:sli_error:ratio_rate6h + expr: | + (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[6h]) )by (request_host)) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[6h]))by (request_host)) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 6h + - record: slo:sli_error:ratio_rate1d + expr: | + (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1d]) )by (request_host)) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1d]))by (request_host)) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 1d + - record: slo:sli_error:ratio_rate3d + expr: | + (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[3d]) )by (request_host)) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[3d]))by (request_host)) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 3d + 
- record: slo:sli_error:ratio_rate30d + expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"}[30d]) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_window: 30d +- name: sloth-slo-meta-recordings-kuadrant-kuadrant-requests-latency + rules: + - record: slo:objective:ratio + expr: vector(0.9995) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + - record: slo:error_budget:ratio + expr: vector(1-0.9995) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + - record: slo:time_period:days + expr: vector(30) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + - record: slo:current_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + - record: slo:period_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate30d{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + - record: slo:period_error_budget_remaining:ratio + expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-kuadrant-requests-latency", + sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + - record: sloth_slo_info + expr: vector(1) + labels: + sloth_id: kuadrant-kuadrant-requests-latency + sloth_mode: cli-gen-prom + sloth_objective: "99.95" + sloth_service: kuadrant + sloth_slo: kuadrant-requests-latency + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 +- name: sloth-slo-alerts-kuadrant-kuadrant-requests-latency + rules: + - alert: KuadrantlatencyHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (14.4 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (14.4 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (6 * 0.0004999999999999716)) without (sloth_window) + and + 
max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (6 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: latency + severity: critical + sloth_severity: page + annotations: + summary: High latency on HTTPRoute requests responses + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. + - alert: KuadrantlatencyHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (3 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (3 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (1 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (1 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: latency + severity: warning + sloth_severity: ticket + annotations: + summary: High latency on HTTPRoute requests responses + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. \ No newline at end of file diff --git a/examples/alerts/sloth/availability.yaml b/examples/alerts/sloth/availability.yaml new file mode 100644 index 000000000..f031020cf --- /dev/null +++ b/examples/alerts/sloth/availability.yaml @@ -0,0 +1,26 @@ +version: "prometheus/v1" +service: "kuadrant" +labels: + owner: "kuadrant-org" +slos: + - name: "requests-availability" + objective: 99.95 + description: "Multi window multi burn rate SLO based on availability for HTTP request responses." + sli: + events: + error_query: sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[{{.window}}])) by (request_host) + total_query: sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[{{.window}}]) )by (request_host) + alerting: + name: KuadrantAvailabilityHighErrorRate + labels: + category: "availability" + annotations: + # Overwrite default Sloth SLO alert summmary on ticket and page alerts. + summary: "High error rate on HTTPRoute requests responses" + page_alert: + labels: + severity: critical + ticket_alert: + labels: + severity: warning + diff --git a/examples/alerts/sloth/latency.yaml b/examples/alerts/sloth/latency.yaml new file mode 100644 index 000000000..e26d551da --- /dev/null +++ b/examples/alerts/sloth/latency.yaml @@ -0,0 +1,25 @@ +version: "prometheus/v1" +service: "kuadrant" +labels: + owner: "kuadrant-org" +slos: + - name: "requests-latency" + objective: 99.95 + description: "Multi window multi burn rate SLO based on latency for HTTP request responses." 
+ sli: + events: + error_query: sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[{{.window}}]) )by (request_host) + total_query: sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[{{.window}}]))by (request_host) + alerting: + name: KuadrantlatencyHighErrorRate + labels: + category: "latency" + annotations: + # Overwrite default Sloth SLO alert summmary on ticket and page alerts. + summary: "High latency on HTTPRoute requests responses {{$labels.request_host}}" + page_alert: + labels: + severity: critical + ticket_alert: + labels: + severity: warning \ No newline at end of file From 6b77aaadfde9d5ab69f0919df94ff255686748ae Mon Sep 17 00:00:00 2001 From: R-Lawton Date: Fri, 3 May 2024 12:17:35 +0100 Subject: [PATCH 2/6] changing latency slo format to match sloth and adding unit tests and make target --- examples/alerts/README.md | 35 ++ examples/alerts/slo-availability.yaml | 400 +++++++++--------- examples/alerts/slo-latency.yaml | 386 +++++++++-------- examples/alerts/sloth/availability.yaml | 2 +- examples/alerts/sloth/latency.yaml | 10 +- .../alerts/{ => tests}/availability_test.yaml | 7 +- examples/alerts/{ => tests}/latency_test.yaml | 21 +- make/alerts.mk | 25 ++ 8 files changed, 491 insertions(+), 395 deletions(-) create mode 100644 examples/alerts/README.md rename examples/alerts/{ => tests}/availability_test.yaml (93%) rename examples/alerts/{ => tests}/latency_test.yaml (66%) create mode 100644 make/alerts.mk diff --git a/examples/alerts/README.md b/examples/alerts/README.md new file mode 100644 index 000000000..da13b1754 --- /dev/null +++ b/examples/alerts/README.md @@ -0,0 +1,35 @@ +## SLO multi-window, multi-burn-rate alerts +Kuadrant has created two example SLO alerts to give an idea of the kinds of SLO alerts that could be used with the operator: one for latency and one for availability, both of which are multiwindow, multi-burn-rate alerts. The alerts show a scenario where a 28d rolling window is used and an uptime of 99.95% is desired, i.e. an error budget of only 0.05%. In real-world time this corresponds to roughly the following allowed downtime (a short calculation showing how these figures are derived is included below). These values can be changed to suit different scenarios. + +| Time Frame | Duration | +|------------|------------| +| Daily: | 43s | +| Weekly: | 5m 2.4s | +| Monthly: | 21m 44s | +| Quarterly: | 1h 5m 12s | +| Yearly: | 4h 20m 49s | + +### Sloth +Sloth is a tool that aids in the creation of multi-burn-rate, multi-window SLO alerts and was used to create both the availability and latency alerts. It follows the common standard set out in [Google's SRE book](https://sre.google/workbook/implementing-slos/). Sloth generates alerts based on the specs it is given. The specs for our example alerts can be found in the examples/alerts/sloth folder. + +#### Metrics used for the alerts + +#### Availability +For the availability SLO alerts the Istio metric `istio_requests_total` is used: it is a counter, meaning its value can only increase, and it gives information on all requests handled by the Istio proxy. + +#### Latency +For the latency SLO alerts the Istio metric `istio_request_duration_milliseconds` is used: it is a histogram (distribution) metric, meaning observed request durations are mapped into buckets, and it gives information on the duration of requests. + +### Sloth generation +You can modify the example Sloth specs and regenerate the Prometheus rules using the Sloth CLI and its generate command.
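The downtime figures in the table above can be sanity-checked with a short script. This is only an illustrative sketch and not part of the PR: it assumes a 0.05% error budget and simple day counts per time frame, so the monthly, quarterly, and yearly values differ slightly from the table depending on how many days those periods are taken to contain.

```python
# Illustrative only: allowed downtime for a 99.95% objective over common time frames.
objective = 99.95
error_budget = 1 - objective / 100  # ~0.0005, i.e. 0.05%

# Assumed day counts; the table above appears to use slightly different month/quarter/year lengths.
time_frames_days = {"Daily": 1, "Weekly": 7, "Monthly": 30, "Quarterly": 90, "Yearly": 365}

for name, days in time_frames_days.items():
    seconds = error_budget * days * 24 * 60 * 60
    minutes, secs = divmod(seconds, 60)
    # e.g. Daily: 0m 43.2s, Weekly: 5m 2.4s
    print(f"{name}: {int(minutes)}m {secs:.1f}s")
```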
For more information please see the [Sloth website](https://sloth.dev/usage/cli/). + +``` +sloth generate -i examples/alerts/sloth/latency.yaml --default-slo-period=28d +``` + +### Prometheus unit tests +There are also two matching unit tests that verify the alerts Sloth has generated. These can be run using the make target: + +``` +make alerts-tests +``` diff --git a/examples/alerts/slo-availability.yaml b/examples/alerts/slo-availability.yaml index f00c0245a..1ec8a8bb8 100644 --- a/examples/alerts/slo-availability.yaml +++ b/examples/alerts/slo-availability.yaml @@ -1,197 +1,203 @@ -groups: -- name: sloth-slo-sli-recordings-kuadrant-requests-availability - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[5m])) by (request_host)) - / - (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[5m]) )by (request_host)) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[30m])) by (request_host)) - / - (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[30m]) )by (request_host)) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1h])) by (request_host)) - / - (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1h]) )by (request_host)) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[2h])) by (request_host)) - / - (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[2h]) )by (request_host)) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[6h])) by (request_host)) - / - (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[6h]) )by (request_host)) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1d])) by (request_host)) - / - (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1d]) )by (request_host)) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[3d])) by (request_host)) - / - (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[3d]) )by (request_host)) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | -
sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[30d]) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_window: 30d -- name: sloth-slo-meta-recordings-kuadrant-requests-availability - rules: - - record: slo:objective:ratio - expr: vector(0.9995) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - - record: slo:error_budget:ratio - expr: vector(1-0.9995) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - - record: slo:time_period:days - expr: vector(30) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-availability", - sloth_service="kuadrant", sloth_slo="requests-availability"} - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_service: kuadrant - sloth_slo: requests-availability - - record: sloth_slo_info - expr: vector(1) - labels: - owner: kuadrant-org - sloth_id: kuadrant-requests-availability - sloth_mode: cli-gen-prom - sloth_objective: "99.95" - sloth_service: kuadrant - sloth_slo: requests-availability - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-kuadrant-requests-availability - rules: - - alert: KuadrantAvailabilityHighErrorRate - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (14.4 * 0.0004999999999999716)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (14.4 * 0.0004999999999999716)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (6 * 0.0004999999999999716)) without (sloth_window) - and - 
max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (6 * 0.0004999999999999716)) without (sloth_window) - ) - labels: - category: availability - severity: critical - sloth_severity: page - annotations: - summary: High error rate on HTTPRoute requests responses - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: KuadrantAvailabilityHighErrorRate - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (3 * 0.0004999999999999716)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (3 * 0.0004999999999999716)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (1 * 0.0004999999999999716)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (1 * 0.0004999999999999716)) without (sloth_window) - ) - labels: - category: availability - severity: warning - sloth_severity: ticket - annotations: - summary: High error rate on HTTPRoute requests responses - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. \ No newline at end of file +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: availability-slo + namespace: monitoring +spec: + groups: + - name: sloth-slo-sli-recordings-kuadrant-requests-availability + rules: + - record: slo:sli_error:ratio_rate5m + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[5m])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[5m]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 5m + - record: slo:sli_error:ratio_rate30m + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[30m])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[30m]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 30m + - record: slo:sli_error:ratio_rate1h + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1h])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1h]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 1h + - record: slo:sli_error:ratio_rate2h + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[2h])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[2h]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 2h + - record: slo:sli_error:ratio_rate6h + expr: | + 
(sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[6h])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[6h]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 6h + - record: slo:sli_error:ratio_rate1d + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1d])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1d]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 1d + - record: slo:sli_error:ratio_rate3d + expr: | + (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[3d])) by (request_host)) + / + (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[3d]) )by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 3d + - record: slo:sli_error:ratio_rate4w + expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[4w]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[4w]) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_window: 4w + - name: sloth-slo-meta-recordings-kuadrant-requests-availability + rules: + - record: slo:objective:ratio + expr: vector(0.9995) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:error_budget:ratio + expr: vector(1-0.9995) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:time_period:days + expr: vector(28) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:current_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:period_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate4w{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: slo:period_error_budget_remaining:ratio + expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-availability", + sloth_service="kuadrant", sloth_slo="requests-availability"} + 
labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_service: kuadrant + sloth_slo: requests-availability + - record: sloth_slo_info + expr: vector(1) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-availability + sloth_mode: cli-gen-prom + sloth_objective: "99.95" + sloth_service: kuadrant + sloth_slo: requests-availability + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + - name: sloth-slo-alerts-kuadrant-requests-availability + rules: + - alert: KuadrantAvailabilityHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (13.44 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (13.44 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (5.6000000000000005 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (5.6000000000000005 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: availability + severity: critical + sloth_severity: page + annotations: + summary: High error rate on HTTPRoute requests responses + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. + - alert: KuadrantAvailabilityHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (2.8000000000000003 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (2.8000000000000003 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (0.9333333333333333 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (0.9333333333333333 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: availability + severity: warning + sloth_severity: ticket + annotations: + summary: High error rate on HTTPRoute requests responses + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. 
diff --git a/examples/alerts/slo-latency.yaml b/examples/alerts/slo-latency.yaml index bed4005c8..e92379606 100644 --- a/examples/alerts/slo-latency.yaml +++ b/examples/alerts/slo-latency.yaml @@ -1,182 +1,204 @@ -groups: -- name: sloth-slo-sli-recordings-kuadrant-kuadrant-requests-latency - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[5m]) )by (request_host)) - / - (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[5m]))by (request_host)) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[30m]) )by (request_host)) - / - (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[30m]))by (request_host)) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1h]) )by (request_host)) - / - (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1h]))by (request_host)) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[2h]) )by (request_host)) - / - (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[2h]))by (request_host)) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[6h]) )by (request_host)) - / - (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[6h]))by (request_host)) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1d]) )by (request_host)) - / - (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1d]))by (request_host)) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[3d]) )by (request_host)) - / - (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[3d]))by (request_host)) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 3d - - record: 
slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"}[30d]) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_window: 30d -- name: sloth-slo-meta-recordings-kuadrant-kuadrant-requests-latency - rules: - - record: slo:objective:ratio - expr: vector(0.9995) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - - record: slo:error_budget:ratio - expr: vector(1-0.9995) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-kuadrant-requests-latency", - sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: kuadrant-kuadrant-requests-latency - sloth_mode: cli-gen-prom - sloth_objective: "99.95" - sloth_service: kuadrant - sloth_slo: kuadrant-requests-latency - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-kuadrant-kuadrant-requests-latency - rules: - - alert: KuadrantlatencyHighErrorRate - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (14.4 * 0.0004999999999999716)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (14.4 * 0.0004999999999999716)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (6 * 0.0004999999999999716)) without (sloth_window) - and - 
max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (6 * 0.0004999999999999716)) without (sloth_window) - ) - labels: - category: latency - severity: critical - sloth_severity: page - annotations: - summary: High latency on HTTPRoute requests responses - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: KuadrantlatencyHighErrorRate - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (3 * 0.0004999999999999716)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (3 * 0.0004999999999999716)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (1 * 0.0004999999999999716)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="kuadrant-kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="kuadrant-requests-latency"} > (1 * 0.0004999999999999716)) without (sloth_window) - ) - labels: - category: latency - severity: warning - sloth_severity: ticket - annotations: - summary: High latency on HTTPRoute requests responses - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. \ No newline at end of file +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: latency-slo + namespace: monitoring +spec: + groups: + - name: sloth-slo-sli-recordings-kuadrant-requests-latency + rules: + - record: slo:sli_error:ratio_rate5m + expr: | + (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[5m]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[5m]) )by (request_host) )) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[5m]))by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_window: 5m + - record: slo:sli_error:ratio_rate30m + expr: | + (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[30m]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[30m]) )by (request_host) )) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[30m]))by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_window: 30m + - record: slo:sli_error:ratio_rate1h + expr: | + (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1h]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1h]) )by (request_host) )) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1h]))by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: 
kuadrant + sloth_slo: requests-latency + sloth_window: 1h + - record: slo:sli_error:ratio_rate2h + expr: | + (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[2h]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[2h]) )by (request_host) )) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[2h]))by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_window: 2h + - record: slo:sli_error:ratio_rate6h + expr: | + (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[6h]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[6h]) )by (request_host) )) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[6h]))by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_window: 6h + - record: slo:sli_error:ratio_rate1d + expr: | + (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1d]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1d]) )by (request_host) )) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1d]))by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_window: 1d + - record: slo:sli_error:ratio_rate3d + expr: | + (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[3d]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[3d]) )by (request_host) )) + / + (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[3d]))by (request_host)) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_window: 3d + - record: slo:sli_error:ratio_rate4w + expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"}[4w]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"}[4w]) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_window: 4w + - name: sloth-slo-meta-recordings-kuadrant-requests-latency + rules: + - record: slo:objective:ratio + expr: vector(0.9995) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + - record: slo:error_budget:ratio + expr: vector(1-0.9995) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + - record: slo:time_period:days + expr: vector(28) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: 
requests-latency + - record: slo:current_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + - record: slo:period_burn_rate:ratio + expr: | + slo:sli_error:ratio_rate4w{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + - record: slo:period_error_budget_remaining:ratio + expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", + sloth_slo="requests-latency"} + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_service: kuadrant + sloth_slo: requests-latency + - record: sloth_slo_info + expr: vector(1) + labels: + owner: kuadrant-org + sloth_id: kuadrant-requests-latency + sloth_mode: cli-gen-prom + sloth_objective: "99.95" + sloth_service: kuadrant + sloth_slo: requests-latency + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + - name: sloth-slo-alerts-kuadrant-requests-latency + rules: + - alert: KuadrantlatencyHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (13.44 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (13.44 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (5.6000000000000005 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (5.6000000000000005 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: latency + severity: critical + sloth_severity: page + annotations: + summary: High latency on HTTPRoute requests responses + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. 
+ - alert: KuadrantlatencyHighErrorRate + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (2.8000000000000003 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (2.8000000000000003 * 0.0004999999999999716)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (0.9333333333333333 * 0.0004999999999999716)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (0.9333333333333333 * 0.0004999999999999716)) without (sloth_window) + ) + labels: + category: latency + severity: warning + sloth_severity: ticket + annotations: + summary: High latency on HTTPRoute requests responses + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. + \ No newline at end of file diff --git a/examples/alerts/sloth/availability.yaml b/examples/alerts/sloth/availability.yaml index f031020cf..fb165a249 100644 --- a/examples/alerts/sloth/availability.yaml +++ b/examples/alerts/sloth/availability.yaml @@ -23,4 +23,4 @@ slos: ticket_alert: labels: severity: warning - + diff --git a/examples/alerts/sloth/latency.yaml b/examples/alerts/sloth/latency.yaml index e26d551da..4480bbe67 100644 --- a/examples/alerts/sloth/latency.yaml +++ b/examples/alerts/sloth/latency.yaml @@ -8,7 +8,11 @@ slos: description: "Multi window multi burn rate SLO based on latency for HTTP request responses." sli: events: - error_query: sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[{{.window}}]) )by (request_host) + error_query: ( + sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[{{.window}}]))by (request_host) + - + sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[{{.window}}]) )by (request_host) + ) total_query: sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[{{.window}}]))by (request_host) alerting: name: KuadrantlatencyHighErrorRate @@ -16,10 +20,10 @@ slos: category: "latency" annotations: # Overwrite default Sloth SLO alert summmary on ticket and page alerts. 
- summary: "High latency on HTTPRoute requests responses {{$labels.request_host}}" + summary: "High latency on HTTPRoute requests responses" page_alert: labels: severity: critical ticket_alert: labels: - severity: warning \ No newline at end of file + severity: warning diff --git a/examples/alerts/availability_test.yaml b/examples/alerts/tests/availability_test.yaml similarity index 93% rename from examples/alerts/availability_test.yaml rename to examples/alerts/tests/availability_test.yaml index a88dcb658..34760fd9a 100644 --- a/examples/alerts/availability_test.yaml +++ b/examples/alerts/tests/availability_test.yaml @@ -1,5 +1,5 @@ rule_files: - - slo-availability.yaml + - /prometheus/availability-rules.yaml evaluation_interval: 1m @@ -9,7 +9,7 @@ tests: - series: istio_requests_total{job="ingress-metrics-proxy",response_code="500"} values: "0+0x30 0+10x30" - series: istio_requests_total{job="ingress-metrics-proxy"} - values: "0+1x30 31+90x30" + values: "0+1x30 31+100x30" alert_rule_test: - eval_time: 30m alertname: KuadrantAvailabilityHighErrorRate @@ -40,5 +40,6 @@ tests: sloth_slo: requests-availability exp_annotations: summary: High error rate on HTTPRoute requests responses - title: (ticket) kuadrant requests-availability SLO error budget burn rate is too fast. + title: (ticket) kuadrant requests-availability SLO error budget burn rate is too fast. + \ No newline at end of file diff --git a/examples/alerts/latency_test.yaml b/examples/alerts/tests/latency_test.yaml similarity index 66% rename from examples/alerts/latency_test.yaml rename to examples/alerts/tests/latency_test.yaml index 904387ec7..0f70d3e33 100644 --- a/examples/alerts/latency_test.yaml +++ b/examples/alerts/tests/latency_test.yaml @@ -1,5 +1,5 @@ rule_files: - - slo-latency.yaml + - /prometheus/latency-rules.yaml evaluation_interval: 1m @@ -7,9 +7,9 @@ tests: - interval: 1m input_series: - series: istio_request_duration_milliseconds_bucket{le="250",job="ingress-metrics-proxy",response_code="200"} - values: "0+0x30 31+10x30" + values: "0+1x30 31+10x30" - series: istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"} - values: "0+1x30 31+90x30" + values: "0+1x30 31+100x30" alert_rule_test: - eval_time: 30m alertname: KuadrantlatencyHighErrorRate @@ -20,22 +20,25 @@ tests: - exp_labels: alertname: KuadrantlatencyHighErrorRate category: latency + owner: kuadrant-org severity: critical - sloth_id: kuadrant-kuadrant-requests-latency + sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_severity: page - sloth_slo: kuadrant-requests-latency + sloth_slo: requests-latency exp_annotations: summary: High latency on HTTPRoute requests responses - title: (page) kuadrant kuadrant-requests-latency SLO error budget burn rate is too fast. + title: (page) kuadrant requests-latency SLO error budget burn rate is too fast. - exp_labels: alertname: KuadrantlatencyHighErrorRate category: latency + owner: kuadrant-org severity: warning - sloth_id: kuadrant-kuadrant-requests-latency + sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_severity: ticket - sloth_slo: kuadrant-requests-latency + sloth_slo: requests-latency exp_annotations: summary: High latency on HTTPRoute requests responses - title: (ticket) kuadrant kuadrant-requests-latency SLO error budget burn rate is too fast. \ No newline at end of file + title: (ticket) kuadrant requests-latency SLO error budget burn rate is too fast. 
+ \ No newline at end of file diff --git a/make/alerts.mk b/make/alerts.mk new file mode 100644 index 000000000..8047d1049 --- /dev/null +++ b/make/alerts.mk @@ -0,0 +1,25 @@ +export WORKDIR ?= $(shell pwd) +export IMAGE ?= quay.io/prometheus/prometheus +export AVAILABILITY_SLO_RULES ?= ${WORKDIR}/examples/alerts/slo-availability.yaml +export LATENCY_SLO_RULES ?= ${WORKDIR}/examples/alerts/slo-latency.yaml +export UNIT_TEST_DIR ?= ${WORKDIR}/examples/alerts/tests + + + +container-runtime-tool: + $(eval CONTAINER_RUNTIME_BIN := $(shell if command -v docker &>/dev/null; then \ + echo "docker"; \ + elif command -v podman &>/dev/null; then \ + echo "podman"; \ + else \ + echo "Neither Docker nor Podman is installed. Exiting..."; \ + exit 1; \ + fi)) + + +alerts-tests: container-runtime-tool + $(CONTAINER_RUNTIME_BIN) run --rm -t \ + -v $(AVAILABILITY_SLO_RULES):/prometheus/slo-availability.yaml \ + -v $(LATENCY_SLO_RULES):/prometheus/slo-latency.yaml \ + -v $(UNIT_TEST_DIR):/prometheus/tests --entrypoint=/bin/sh \ +$(IMAGE) -c 'tail -n +7 slo-latency.yaml > latency-rules.yaml && tail -n +7 slo-availability.yaml > availability-rules.yaml && cd tests && promtool test rules *' From b5b5d73ff1b4f4d38714a834fe7e877386839cc7 Mon Sep 17 00:00:00 2001 From: R-Lawton Date: Thu, 9 May 2024 12:11:55 +0100 Subject: [PATCH 3/6] new sloth make target and comments for promtool and feedback --- examples/alerts/README.md | 12 ++++++--- examples/alerts/slo-latency.yaml | 2 +- examples/alerts/tests/availability_test.yaml | 3 +++ examples/alerts/tests/latency_test.yaml | 3 +++ make/alerts.mk | 28 ++++++++++++++------ 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/examples/alerts/README.md b/examples/alerts/README.md index da13b1754..e6dd9d8bc 100644 --- a/examples/alerts/README.md +++ b/examples/alerts/README.md @@ -1,5 +1,5 @@ ## SLO Multi burn rate multi window alerts -Kuadrant have created two example SLO alerts to help give ideas on the types of SLO alerts that could be used with the operator. We have created one alert for latency and one for availability, both are Multiwindow, Multi-Burn-Rate Alerts. The alerts show a scenario where a 28d rolling window is used and a uptime of 99.95% i.e only .1% error margin is desired. This in real world time would be downtime of around. These values can be changed to suit different scenarios +Kuadrant have created two example SLO alerts to help give ideas on the types of SLO alerts that could be used with the operator. We have created one alert for latency and one for availability, both are Multiwindow, Multi-Burn-Rate Alerts. The alerts show a scenario where a 28d rolling window is used and a uptime of 99.95% i.e only 0.05% error budget margin is desired. This in real world time would be downtime of around: | Time Frame | Duration | |------------|------------| @@ -9,6 +9,8 @@ Kuadrant have created two example SLO alerts to help give ideas on the types of | Quarterly: | 1h 5m 12s | | Yearly: | 4h 20m 49s | +These values can be changed to suit different scenarios + ### Sloth Sloth is a tool to aid in the creation of multi burn rate and multi window SLO alerts and was used to create both the availability and latency alerts. It follows the common standard set out by [Google's SRE book](https://sre.google/workbook/implementing-slos/). Sloth generates alerts based on specific specs given. The specs for our example alerts can be found in the example/sloth folder. 
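For context, the multipliers that appear in the generated alert expressions (for example `13.44 * 0.0004999999999999716`) follow from the 99.95% objective over the 28d period combined with the usual multiwindow burn-rate budget shares from the Google SRE workbook. The sketch below is a rough illustration of that arithmetic, not something generated by Sloth, and the 2%/5%/10% budget shares are assumed from that standard approach:

```python
# Rough sketch (assumed values, illustration only): how the multiwindow
# multi-burn-rate thresholds in the generated rules can be derived.
SLO_OBJECTIVE = 0.9995
SLO_PERIOD_HOURS = 28 * 24            # 28d rolling window = 672h
ERROR_BUDGET = 1 - SLO_OBJECTIVE      # 0.0005 (printed as 0.0004999999999999716 after float rounding)

def burn_rate(budget_share: float, window_hours: float) -> float:
    """Burn rate that spends `budget_share` of the 28d error budget within `window_hours`."""
    return budget_share * SLO_PERIOD_HOURS / window_hours

print(burn_rate(0.02, 1))    # 13.44 -> page alert, 5m and 1h windows
print(burn_rate(0.05, 6))    # 5.6   -> page alert, 30m and 6h windows
print(burn_rate(0.10, 24))   # 2.8   -> ticket alert, 2h and 1d windows
print(burn_rate(0.10, 72))   # 0.933 -> ticket alert, 6h and 3d windows

# The alert expressions compare the recorded SLI error ratio against
# multiplier * ERROR_BUDGET, e.g. 13.44 * 0.0005 ~= 0.0067 (about 0.67% of requests failing).
print(13.44 * ERROR_BUDGET)
```

Running this reproduces the 13.44, 5.6, 2.8 and 0.933 multipliers used in the page and ticket alert expressions; an alert fires when the recorded SLI error ratio exceeds `multiplier * error budget` over both windows of a pair.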
@@ -18,15 +20,19 @@ Sloth is a tool to aid in the creation of multi burn rate and multi window SLO a
 For the availability SLO alerts the Istio metric `istio_requests_total` was used as its a counter type metric meaning the values can only increase as well as it gives information on all requests handled by the Istio proxy.
 
 #### Latency
-For the availability SLO alerts the Istio metric 'istio_requests_total' was used as its a Distribution type metric meaning values are mapped to different ranges of frequency's as well as it gives information duration of requests.
+For the latency SLO alerts the Istio metric `istio_request_duration_milliseconds` was used as it is a [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram) type metric, which gives information on the duration of requests.
 
 ### Sloth generation
 You can modify the examples Sloth specs we have and regenerate the prometheus rules using the Sloth CLI and the generate command. For more information please the [Sloth website](https://sloth.dev/usage/cli/)
 
 ```
-generate -i examples/alerts/sloth/latency.yaml --default-slo-period=28d
+sloth generate -i examples/alerts/sloth/latency.yaml --default-slo-period=28d
 ```
 
+You can also use the make target to generate the rules too.
+```
+make sloth-generate
+```
 
 ### Prometheus unit tests
 There are also two matching unit tests to verify and test the alerts that Sloth has generated. These can be run using the make target:
diff --git a/examples/alerts/slo-latency.yaml b/examples/alerts/slo-latency.yaml
index e92379606..aa118fc34 100644
--- a/examples/alerts/slo-latency.yaml
+++ b/examples/alerts/slo-latency.yaml
@@ -201,4 +201,4 @@ spec:
         summary: High latency on HTTPRoute requests responses
         title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
           burn rate is too fast.
-    
\ No newline at end of file
+    
\ No newline at end of file
diff --git a/examples/alerts/tests/availability_test.yaml b/examples/alerts/tests/availability_test.yaml
index 34760fd9a..ceae1ab8b 100644
--- a/examples/alerts/tests/availability_test.yaml
+++ b/examples/alerts/tests/availability_test.yaml
@@ -6,8 +6,11 @@ evaluation_interval: 1m
 tests:
   - interval: 1m
     input_series:
+    # Promtool uses expanding notation as its way of creating time series (https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#series)
+    # 0+0x30 = 0, 0, 0, ...0 For a total of 31 times THEN 0+10x30 = 0, 10 ,20, ... 300 For a total of 31 times
     - series: istio_requests_total{job="ingress-metrics-proxy",response_code="500"}
       values: "0+0x30 0+10x30"
+    # 0+1x30 = 0, 1, 2, ...30 For a total of 31 times THEN 31+100x30 = 31, 131 ,231, 331 ... 3031 For a total of 31 times
     - series: istio_requests_total{job="ingress-metrics-proxy"}
       values: "0+1x30 31+100x30"
     alert_rule_test:
diff --git a/examples/alerts/tests/latency_test.yaml b/examples/alerts/tests/latency_test.yaml
index 0f70d3e33..bf8bed313 100644
--- a/examples/alerts/tests/latency_test.yaml
+++ b/examples/alerts/tests/latency_test.yaml
@@ -6,8 +6,11 @@ evaluation_interval: 1m
 tests:
   - interval: 1m
     input_series:
+    # Promtool uses expanding notation as its way of creating time series (https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#series)
+    # 0+1x30 = 0, 1, 2, ...30 For a total of 31 times THEN 31+10x30 = 31, 41 ,51, ... 331 For a total of 31 times
     - series: istio_request_duration_milliseconds_bucket{le="250",job="ingress-metrics-proxy",response_code="200"}
       values: "0+1x30 31+10x30"
+    # 0+1x30 = 0, 1, 2, ...30 For a total of 31 times THEN 31+100x30 = 31, 131 ,231, 331 ...
3031 For a total of 31 times - series: istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"} values: "0+1x30 31+100x30" alert_rule_test: diff --git a/make/alerts.mk b/make/alerts.mk index 8047d1049..a6e62eca4 100644 --- a/make/alerts.mk +++ b/make/alerts.mk @@ -1,9 +1,13 @@ +##@ Alerts export WORKDIR ?= $(shell pwd) export IMAGE ?= quay.io/prometheus/prometheus -export AVAILABILITY_SLO_RULES ?= ${WORKDIR}/examples/alerts/slo-availability.yaml -export LATENCY_SLO_RULES ?= ${WORKDIR}/examples/alerts/slo-latency.yaml -export UNIT_TEST_DIR ?= ${WORKDIR}/examples/alerts/tests - +export AVAILABILITY_SLO_RULES ?= $(WORKDIR)/examples/alerts/slo-availability.yaml +export LATENCY_SLO_RULES ?= $(WORKDIR)/examples/alerts/slo-latency.yaml +export UNIT_TEST_DIR ?= $(WORKDIR)/examples/alerts/tests +export OS = $(shell uname | tr '[:upper:]' '[:lower:]') +export ARCH = $(shell uname -m | tr '[:upper:]' '[:lower:]') +export SLOTH = $(WORKDIR)/bin/sloth +export ALERTS_SLOTH = /examples/alerts/sloth container-runtime-tool: @@ -16,10 +20,18 @@ container-runtime-tool: exit 1; \ fi)) - -alerts-tests: container-runtime-tool +alerts-tests: container-runtime-tool # Test alerts using promtool $(CONTAINER_RUNTIME_BIN) run --rm -t \ -v $(AVAILABILITY_SLO_RULES):/prometheus/slo-availability.yaml \ -v $(LATENCY_SLO_RULES):/prometheus/slo-latency.yaml \ - -v $(UNIT_TEST_DIR):/prometheus/tests --entrypoint=/bin/sh \ -$(IMAGE) -c 'tail -n +7 slo-latency.yaml > latency-rules.yaml && tail -n +7 slo-availability.yaml > availability-rules.yaml && cd tests && promtool test rules *' + -v $(UNIT_TEST_DIR):/prometheus/tests --entrypoint=/bin/sh \ + $(IMAGE) -c 'tail -n +7 slo-latency.yaml > latency-rules.yaml && tail -n +7 slo-availability.yaml > availability-rules.yaml && cd tests && promtool test rules *' + +sloth: $(SLOTH) # Install Sloth +$(SLOTH): + cd $(WORKDIR)/bin && curl -L https://github.com/slok/sloth/releases/download/v0.11.0/sloth-$(OS)-$(ARCH) > sloth && chmod +x sloth + +sloth-generate: sloth # Generate alerts using Sloth templates + for FILE in $(wildcard $(WORKDIR)$(ALERTS_SLOTH)/* ) ; do \ + $(SLOTH) generate -i $$FILE --default-slo-period=28d ; \ + done From 484f4533d591a4bccacd6a91785e6c48bdfedc0f Mon Sep 17 00:00:00 2001 From: David Martin Date: Thu, 9 May 2024 15:44:33 +0100 Subject: [PATCH 4/6] Update the sloth-generate target to output PrometheusRules files --- examples/alerts/slo-availability.yaml | 87 +++++++++++--------- examples/alerts/slo-latency.yaml | 88 +++++++++++---------- examples/alerts/sloth/availability.yaml | 26 ------ examples/alerts/sloth/latency.yaml | 29 ------- examples/alerts/sloth/slo-availability.yaml | 31 ++++++++ examples/alerts/sloth/slo-latency.yaml | 34 ++++++++ make/alerts.mk | 7 +- 7 files changed, 164 insertions(+), 138 deletions(-) delete mode 100644 examples/alerts/sloth/availability.yaml delete mode 100644 examples/alerts/sloth/latency.yaml create mode 100644 examples/alerts/sloth/slo-availability.yaml create mode 100644 examples/alerts/sloth/slo-latency.yaml diff --git a/examples/alerts/slo-availability.yaml b/examples/alerts/slo-availability.yaml index 1ec8a8bb8..3263bc611 100644 --- a/examples/alerts/slo-availability.yaml +++ b/examples/alerts/slo-availability.yaml @@ -1,14 +1,22 @@ + +--- +# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. +# DO NOT EDIT. 
+ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: SLO + app.kubernetes.io/managed-by: sloth name: availability-slo namespace: monitoring spec: groups: - name: sloth-slo-sli-recordings-kuadrant-requests-availability rules: - - record: slo:sli_error:ratio_rate5m - expr: | + - expr: | (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[5m])) by (request_host)) / (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[5m]) )by (request_host)) @@ -18,8 +26,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | + record: slo:sli_error:ratio_rate5m + - expr: | (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[30m])) by (request_host)) / (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[30m]) )by (request_host)) @@ -29,8 +37,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | + record: slo:sli_error:ratio_rate30m + - expr: | (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1h])) by (request_host)) / (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1h]) )by (request_host)) @@ -40,8 +48,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | + record: slo:sli_error:ratio_rate1h + - expr: | (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[2h])) by (request_host)) / (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[2h]) )by (request_host)) @@ -51,8 +59,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | + record: slo:sli_error:ratio_rate2h + - expr: | (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[6h])) by (request_host)) / (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[6h]) )by (request_host)) @@ -62,8 +70,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | + record: slo:sli_error:ratio_rate6h + - expr: | (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[1d])) by (request_host)) / (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[1d]) )by (request_host)) @@ -73,8 +81,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | + record: slo:sli_error:ratio_rate1d + - expr: | (sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[3d])) by (request_host)) / (sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[3d]) )by (request_host)) @@ -84,8 +92,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 3d - - record: slo:sli_error:ratio_rate4w - expr: | + record: slo:sli_error:ratio_rate3d + - expr: | sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[4w]) / ignoring (sloth_window) count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"}[4w]) @@ -95,31 +103,31 @@ spec: sloth_service: kuadrant sloth_slo: requests-availability sloth_window: 4w + record: 
slo:sli_error:ratio_rate4w - name: sloth-slo-meta-recordings-kuadrant-requests-availability rules: - - record: slo:objective:ratio - expr: vector(0.9995) + - expr: vector(0.9995) labels: owner: kuadrant-org sloth_id: kuadrant-requests-availability sloth_service: kuadrant sloth_slo: requests-availability - - record: slo:error_budget:ratio - expr: vector(1-0.9995) + record: slo:objective:ratio + - expr: vector(1-0.9995) labels: owner: kuadrant-org sloth_id: kuadrant-requests-availability sloth_service: kuadrant sloth_slo: requests-availability - - record: slo:time_period:days - expr: vector(28) + record: slo:error_budget:ratio + - expr: vector(28) labels: owner: kuadrant-org sloth_id: kuadrant-requests-availability sloth_service: kuadrant sloth_slo: requests-availability - - record: slo:current_burn_rate:ratio - expr: | + record: slo:time_period:days + - expr: | slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} / on(sloth_id, sloth_slo, sloth_service) group_left slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} @@ -128,8 +136,8 @@ spec: sloth_id: kuadrant-requests-availability sloth_service: kuadrant sloth_slo: requests-availability - - record: slo:period_burn_rate:ratio - expr: | + record: slo:current_burn_rate:ratio + - expr: | slo:sli_error:ratio_rate4w{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} / on(sloth_id, sloth_slo, sloth_service) group_left slo:error_budget:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} @@ -138,28 +146,33 @@ spec: sloth_id: kuadrant-requests-availability sloth_service: kuadrant sloth_slo: requests-availability - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-availability", + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} labels: owner: kuadrant-org sloth_id: kuadrant-requests-availability sloth_service: kuadrant sloth_slo: requests-availability - - record: sloth_slo_info - expr: vector(1) + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) labels: owner: kuadrant-org sloth_id: kuadrant-requests-availability - sloth_mode: cli-gen-prom + sloth_mode: cli-gen-k8s sloth_objective: "99.95" sloth_service: kuadrant sloth_slo: requests-availability - sloth_spec: prometheus/v1 + sloth_spec: sloth.slok.dev/v1 sloth_version: v0.11.0 + record: sloth_slo_info - name: sloth-slo-alerts-kuadrant-requests-availability rules: - alert: KuadrantAvailabilityHighErrorRate + annotations: + summary: High error rate on HTTPRoute requests responses + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. expr: | ( max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (13.44 * 0.0004999999999999716)) without (sloth_window) @@ -176,11 +189,11 @@ spec: category: availability severity: critical sloth_severity: page + - alert: KuadrantAvailabilityHighErrorRate annotations: summary: High error rate on HTTPRoute requests responses - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: KuadrantAvailabilityHighErrorRate + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. expr: | ( max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-requests-availability", sloth_service="kuadrant", sloth_slo="requests-availability"} > (2.8000000000000003 * 0.0004999999999999716)) without (sloth_window) @@ -197,7 +210,3 @@ spec: category: availability severity: warning sloth_severity: ticket - annotations: - summary: High error rate on HTTPRoute requests responses - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. diff --git a/examples/alerts/slo-latency.yaml b/examples/alerts/slo-latency.yaml index aa118fc34..63e61c2cd 100644 --- a/examples/alerts/slo-latency.yaml +++ b/examples/alerts/slo-latency.yaml @@ -1,14 +1,22 @@ + +--- +# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. +# DO NOT EDIT. + apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: SLO + app.kubernetes.io/managed-by: sloth name: latency-slo namespace: monitoring spec: groups: - name: sloth-slo-sli-recordings-kuadrant-requests-latency rules: - - record: slo:sli_error:ratio_rate5m - expr: | + - expr: | (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[5m]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[5m]) )by (request_host) )) / (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[5m]))by (request_host)) @@ -18,8 +26,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | + record: slo:sli_error:ratio_rate5m + - expr: | (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[30m]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[30m]) )by (request_host) )) / (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[30m]))by (request_host)) @@ -29,8 +37,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | + record: slo:sli_error:ratio_rate30m + - expr: | (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1h]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1h]) )by (request_host) )) / (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1h]))by (request_host)) @@ -40,8 +48,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | + record: slo:sli_error:ratio_rate1h + - expr: | (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[2h]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[2h]) )by (request_host) )) / (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[2h]))by (request_host)) @@ -51,8 +59,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 
2h - - record: slo:sli_error:ratio_rate6h - expr: | + record: slo:sli_error:ratio_rate2h + - expr: | (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[6h]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[6h]) )by (request_host) )) / (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[6h]))by (request_host)) @@ -62,8 +70,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | + record: slo:sli_error:ratio_rate6h + - expr: | (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1d]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[1d]) )by (request_host) )) / (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[1d]))by (request_host)) @@ -73,8 +81,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | + record: slo:sli_error:ratio_rate1d + - expr: | (( sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[3d]))by (request_host) - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[3d]) )by (request_host) )) / (sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[3d]))by (request_host)) @@ -84,8 +92,8 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 3d - - record: slo:sli_error:ratio_rate4w - expr: | + record: slo:sli_error:ratio_rate3d + - expr: | sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"}[4w]) / ignoring (sloth_window) count_over_time(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"}[4w]) @@ -95,31 +103,31 @@ spec: sloth_service: kuadrant sloth_slo: requests-latency sloth_window: 4w + record: slo:sli_error:ratio_rate4w - name: sloth-slo-meta-recordings-kuadrant-requests-latency rules: - - record: slo:objective:ratio - expr: vector(0.9995) + - expr: vector(0.9995) labels: owner: kuadrant-org sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_slo: requests-latency - - record: slo:error_budget:ratio - expr: vector(1-0.9995) + record: slo:objective:ratio + - expr: vector(1-0.9995) labels: owner: kuadrant-org sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_slo: requests-latency - - record: slo:time_period:days - expr: vector(28) + record: slo:error_budget:ratio + - expr: vector(28) labels: owner: kuadrant-org sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_slo: requests-latency - - record: slo:current_burn_rate:ratio - expr: | + record: slo:time_period:days + - expr: | slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} / on(sloth_id, sloth_slo, sloth_service) group_left slo:error_budget:ratio{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} @@ -128,8 +136,8 @@ spec: sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_slo: requests-latency - - record: slo:period_burn_rate:ratio 
- expr: | + record: slo:current_burn_rate:ratio + - expr: | slo:sli_error:ratio_rate4w{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} / on(sloth_id, sloth_slo, sloth_service) group_left slo:error_budget:ratio{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} @@ -138,28 +146,33 @@ spec: sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_slo: requests-latency - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} labels: owner: kuadrant-org sloth_id: kuadrant-requests-latency sloth_service: kuadrant sloth_slo: requests-latency - - record: sloth_slo_info - expr: vector(1) + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) labels: owner: kuadrant-org sloth_id: kuadrant-requests-latency - sloth_mode: cli-gen-prom + sloth_mode: cli-gen-k8s sloth_objective: "99.95" sloth_service: kuadrant sloth_slo: requests-latency - sloth_spec: prometheus/v1 + sloth_spec: sloth.slok.dev/v1 sloth_version: v0.11.0 + record: sloth_slo_info - name: sloth-slo-alerts-kuadrant-requests-latency rules: - alert: KuadrantlatencyHighErrorRate + annotations: + summary: High latency on HTTPRoute requests responses + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget + burn rate is too fast. expr: | ( max(slo:sli_error:ratio_rate5m{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (13.44 * 0.0004999999999999716)) without (sloth_window) @@ -176,11 +189,11 @@ spec: category: latency severity: critical sloth_severity: page + - alert: KuadrantlatencyHighErrorRate annotations: summary: High latency on HTTPRoute requests responses - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: KuadrantlatencyHighErrorRate + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. expr: | ( max(slo:sli_error:ratio_rate2h{sloth_id="kuadrant-requests-latency", sloth_service="kuadrant", sloth_slo="requests-latency"} > (2.8000000000000003 * 0.0004999999999999716)) without (sloth_window) @@ -197,8 +210,3 @@ spec: category: latency severity: warning sloth_severity: ticket - annotations: - summary: High latency on HTTPRoute requests responses - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - \ No newline at end of file diff --git a/examples/alerts/sloth/availability.yaml b/examples/alerts/sloth/availability.yaml deleted file mode 100644 index fb165a249..000000000 --- a/examples/alerts/sloth/availability.yaml +++ /dev/null @@ -1,26 +0,0 @@ -version: "prometheus/v1" -service: "kuadrant" -labels: - owner: "kuadrant-org" -slos: - - name: "requests-availability" - objective: 99.95 - description: "Multi window multi burn rate SLO based on availability for HTTP request responses." 
- sli: - events: - error_query: sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[{{.window}}])) by (request_host) - total_query: sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[{{.window}}]) )by (request_host) - alerting: - name: KuadrantAvailabilityHighErrorRate - labels: - category: "availability" - annotations: - # Overwrite default Sloth SLO alert summmary on ticket and page alerts. - summary: "High error rate on HTTPRoute requests responses" - page_alert: - labels: - severity: critical - ticket_alert: - labels: - severity: warning - diff --git a/examples/alerts/sloth/latency.yaml b/examples/alerts/sloth/latency.yaml deleted file mode 100644 index 4480bbe67..000000000 --- a/examples/alerts/sloth/latency.yaml +++ /dev/null @@ -1,29 +0,0 @@ -version: "prometheus/v1" -service: "kuadrant" -labels: - owner: "kuadrant-org" -slos: - - name: "requests-latency" - objective: 99.95 - description: "Multi window multi burn rate SLO based on latency for HTTP request responses." - sli: - events: - error_query: ( - sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[{{.window}}]))by (request_host) - - - sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[{{.window}}]) )by (request_host) - ) - total_query: sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[{{.window}}]))by (request_host) - alerting: - name: KuadrantlatencyHighErrorRate - labels: - category: "latency" - annotations: - # Overwrite default Sloth SLO alert summmary on ticket and page alerts. - summary: "High latency on HTTPRoute requests responses" - page_alert: - labels: - severity: critical - ticket_alert: - labels: - severity: warning diff --git a/examples/alerts/sloth/slo-availability.yaml b/examples/alerts/sloth/slo-availability.yaml new file mode 100644 index 000000000..65768e265 --- /dev/null +++ b/examples/alerts/sloth/slo-availability.yaml @@ -0,0 +1,31 @@ +apiVersion: sloth.slok.dev/v1 +kind: PrometheusServiceLevel +metadata: + name: availability-slo + namespace: monitoring +spec: + service: "kuadrant" + labels: + owner: "kuadrant-org" + slos: + - name: "requests-availability" + objective: 99.95 + description: "Multi window multi burn rate SLO based on availability for HTTP request responses." + sli: + events: + errorQuery: sum(rate(istio_requests_total{job="ingress-metrics-proxy",response_code=~"5.*"}[{{.window}}])) by (request_host) + totalQuery: sum(rate(istio_requests_total{job="ingress-metrics-proxy"}[{{.window}}]) )by (request_host) + alerting: + name: KuadrantAvailabilityHighErrorRate + labels: + category: "availability" + annotations: + # Overwrite default Sloth SLO alert summmary on ticket and page alerts. + summary: "High error rate on HTTPRoute requests responses" + pageAlert: + labels: + severity: critical + ticketAlert: + labels: + severity: warning + diff --git a/examples/alerts/sloth/slo-latency.yaml b/examples/alerts/sloth/slo-latency.yaml new file mode 100644 index 000000000..6f526dc82 --- /dev/null +++ b/examples/alerts/sloth/slo-latency.yaml @@ -0,0 +1,34 @@ +apiVersion: sloth.slok.dev/v1 +kind: PrometheusServiceLevel +metadata: + name: latency-slo + namespace: monitoring +spec: + service: "kuadrant" + labels: + owner: "kuadrant-org" + slos: + - name: "requests-latency" + objective: 99.95 + description: "Multi window multi burn rate SLO based on latency for HTTP request responses." 
+ sli: + events: + errorQuery: ( + sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[{{.window}}]))by (request_host) + - + sum(rate(istio_request_duration_milliseconds_bucket{le="250", job="ingress-metrics-proxy", response_code="200" }[{{.window}}]) )by (request_host) + ) + totalQuery: sum(rate(istio_request_duration_milliseconds_count{job="ingress-metrics-proxy",response_code="200"}[{{.window}}]))by (request_host) + alerting: + name: KuadrantlatencyHighErrorRate + labels: + category: "latency" + annotations: + # Overwrite default Sloth SLO alert summmary on ticket and page alerts. + summary: "High latency on HTTPRoute requests responses" + pageAlert: + labels: + severity: critical + ticketAlert: + labels: + severity: warning diff --git a/make/alerts.mk b/make/alerts.mk index a6e62eca4..f9cc785be 100644 --- a/make/alerts.mk +++ b/make/alerts.mk @@ -7,7 +7,8 @@ export UNIT_TEST_DIR ?= $(WORKDIR)/examples/alerts/tests export OS = $(shell uname | tr '[:upper:]' '[:lower:]') export ARCH = $(shell uname -m | tr '[:upper:]' '[:lower:]') export SLOTH = $(WORKDIR)/bin/sloth -export ALERTS_SLOTH = /examples/alerts/sloth +export ALERTS_SLOTH_INPUT_DIR = /examples/alerts/sloth +export ALERTS_SLOTH_OUTPUT_DIR = /examples/alerts container-runtime-tool: @@ -32,6 +33,4 @@ $(SLOTH): cd $(WORKDIR)/bin && curl -L https://github.com/slok/sloth/releases/download/v0.11.0/sloth-$(OS)-$(ARCH) > sloth && chmod +x sloth sloth-generate: sloth # Generate alerts using Sloth templates - for FILE in $(wildcard $(WORKDIR)$(ALERTS_SLOTH)/* ) ; do \ - $(SLOTH) generate -i $$FILE --default-slo-period=28d ; \ - done + $(SLOTH) generate -i $(WORKDIR)$(ALERTS_SLOTH_INPUT_DIR) -o $(WORKDIR)$(ALERTS_SLOTH_OUTPUT_DIR) --default-slo-period=28d From aae8a46a286cbff5db7658c96e44b8fcef601c2b Mon Sep 17 00:00:00 2001 From: R-Lawton Date: Thu, 9 May 2024 16:31:41 +0100 Subject: [PATCH 5/6] =?UTF-8?q?update=20tail=20in=20make=20target=20and=20?= =?UTF-8?q?linter=20(=E2=80=A2=CC=80o=E2=80=A2=CC=81)=E0=B8=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{availability_test.yaml => slo-availability-test.yaml} | 6 +++--- .../tests/{latency_test.yaml => slo-latency-test.yaml} | 4 ++-- make/alerts.mk | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) rename examples/alerts/tests/{availability_test.yaml => slo-availability-test.yaml} (97%) rename examples/alerts/tests/{latency_test.yaml => slo-latency-test.yaml} (97%) diff --git a/examples/alerts/tests/availability_test.yaml b/examples/alerts/tests/slo-availability-test.yaml similarity index 97% rename from examples/alerts/tests/availability_test.yaml rename to examples/alerts/tests/slo-availability-test.yaml index ceae1ab8b..9559e7229 100644 --- a/examples/alerts/tests/availability_test.yaml +++ b/examples/alerts/tests/slo-availability-test.yaml @@ -43,6 +43,6 @@ tests: sloth_slo: requests-availability exp_annotations: summary: High error rate on HTTPRoute requests responses - title: (ticket) kuadrant requests-availability SLO error budget burn rate is too fast. - - \ No newline at end of file + title: (ticket) kuadrant requests-availability SLO error budget burn rate is too fast. 
+ + \ No newline at end of file diff --git a/examples/alerts/tests/latency_test.yaml b/examples/alerts/tests/slo-latency-test.yaml similarity index 97% rename from examples/alerts/tests/latency_test.yaml rename to examples/alerts/tests/slo-latency-test.yaml index bf8bed313..de331c0de 100644 --- a/examples/alerts/tests/latency_test.yaml +++ b/examples/alerts/tests/slo-latency-test.yaml @@ -43,5 +43,5 @@ tests: sloth_slo: requests-latency exp_annotations: summary: High latency on HTTPRoute requests responses - title: (ticket) kuadrant requests-latency SLO error budget burn rate is too fast. - \ No newline at end of file + title: (ticket) kuadrant requests-latency SLO error budget burn rate is too fast. + diff --git a/make/alerts.mk b/make/alerts.mk index f9cc785be..31ec00db2 100644 --- a/make/alerts.mk +++ b/make/alerts.mk @@ -26,7 +26,7 @@ alerts-tests: container-runtime-tool # Test alerts using promtool -v $(AVAILABILITY_SLO_RULES):/prometheus/slo-availability.yaml \ -v $(LATENCY_SLO_RULES):/prometheus/slo-latency.yaml \ -v $(UNIT_TEST_DIR):/prometheus/tests --entrypoint=/bin/sh \ - $(IMAGE) -c 'tail -n +7 slo-latency.yaml > latency-rules.yaml && tail -n +7 slo-availability.yaml > availability-rules.yaml && cd tests && promtool test rules *' + $(IMAGE) -c 'tail -n +16 slo-latency.yaml > latency-rules.yaml && tail -n +16 slo-availability.yaml > availability-rules.yaml && cd tests && promtool test rules *' sloth: $(SLOTH) # Install Sloth $(SLOTH): From e4438bda53d53328c9e79048ac3792fe0a05e8e2 Mon Sep 17 00:00:00 2001 From: R-Lawton Date: Thu, 9 May 2024 16:36:43 +0100 Subject: [PATCH 6/6] =?UTF-8?q?linter=20(=E2=80=A2=CC=80o=E2=80=A2=CC=81)?= =?UTF-8?q?=E0=B8=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/alerts/tests/slo-availability-test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/alerts/tests/slo-availability-test.yaml b/examples/alerts/tests/slo-availability-test.yaml index 9559e7229..9b8a33526 100644 --- a/examples/alerts/tests/slo-availability-test.yaml +++ b/examples/alerts/tests/slo-availability-test.yaml @@ -45,4 +45,3 @@ tests: summary: High error rate on HTTPRoute requests responses title: (ticket) kuadrant requests-availability SLO error budget burn rate is too fast. - \ No newline at end of file
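The expectations in the promtool unit tests can also be sanity-checked against these thresholds. The snippet below is a rough, illustrative calculation only (it assumes the availability test input, where 5xx responses grow by 10 per minute against a total growing by 100 per minute after minute 30), but it shows why no alerts are expected at the 30m evaluation and why both the page and ticket alerts are expected at 60m:

```python
# Rough sanity check (illustration only) of why the availability test expects no
# alerts at the 30m evaluation and both the page and ticket alerts at 60m.
ERROR_BUDGET = 1 - 0.9995

# Input series: the 5xx counter is flat for the first 30 minutes, then grows by
# 10/min while the total request counter grows by 100/min.
error_ratio_at_30m = 0.0
error_ratio_at_60m = 10 / 100            # roughly 10% of requests failing

page_threshold = 13.44 * ERROR_BUDGET    # ~0.0067, checked on the 5m and 1h windows
ticket_threshold = 2.8 * ERROR_BUDGET    # ~0.0014, checked on the 2h and 1d windows

print(error_ratio_at_30m > page_threshold)    # False -> exp_alerts: [] at 30m
print(error_ratio_at_60m > page_threshold)    # True  -> page (critical) alert at 60m
print(error_ratio_at_60m > ticket_threshold)  # True  -> ticket (warning) alert at 60m
```

The latency test behaves the same way: after minute 30 only a small share of requests fall into the `le="250"` bucket, so the latency SLI error ratio climbs far above both thresholds by the 60 minute evaluation.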