Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions manifests/pipecd/prom-rules/alerting_rules.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
# TODO: Add Alerting rules
groups:
# NOTE(review): this hunk comes from a rendered diff whose +/- markers and
# indentation were lost, so two versions of the rule group appear interleaved
# below (two `name` entries, two `severity` values, two description/summary
# pairs). Presumably the Instances/InstanceDown lines are the removed version
# and the IncomingRequests/gRPCErrorRate lines are the added one - confirm
# against the actual file before editing.
- name: Instances
- name: IncomingRequests
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
# gRPCErrorRate: for each pipecd_grpc_service, divides the 5-minute rate of
# handled gRPC calls on the "server" component whose grpc_code is neither "OK"
# nor "NotFound" by the 5-minute rate of all handled calls on that component,
# and matches when the ratio is above 0.05 (5%).
# `for: 0s` sets a zero-length pending duration for the alert.
- alert: gRPCErrorRate
expr: |
(
sum by (pipecd_grpc_service) (rate(grpc_server_handled_total{pipecd_component="server", grpc_code!="OK", grpc_code!="NotFound"}[5m]))
/
sum by (pipecd_grpc_service) (rate(grpc_server_handled_total{pipecd_component="server"}[5m]))
) > 0.05
for: 0s
labels:
severity: page
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
summary: 'Instance {{ $labels.instance }} down'
description: 'The error rate of {{ $labels.pipecd_grpc_service }} API exceeded 5%.'
summary: 'The error rate of {{ $labels.pipecd_grpc_service }} API is getting higher.'
26 changes: 26 additions & 0 deletions manifests/pipecd/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,32 @@ data:
- /etc/config/recording_rules.yml
- /etc/config/alerting_rules.yml

{{- if .Values.prometheus.alertmanager.enabled }}
# Alertmanager discovery; this whole section is rendered only when the
# prometheus.alertmanager.enabled chart value is set.
# Candidate targets come from Kubernetes pod service discovery and are then
# narrowed by the chain of `keep` relabel rules below; a pod must match every
# rule to remain a target.
# NOTE(review): indentation is not visible in this view, so whether tls_config
# and bearer_token_file belong to the alertmanager entry or to the
# kubernetes_sd_configs entry cannot be told from here - confirm in the file.
alerting:
alertmanagers:
- kubernetes_sd_configs:
- role: pod
# CA certificate and bearer token are read from the service-account files
# mounted in the pod.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
# Keep only pods in the namespace of this Helm release.
- source_labels: [__meta_kubernetes_namespace]
regex: {{ .Release.Namespace }}
action: keep
# Keep only pods labelled app=prometheus.
- source_labels: [__meta_kubernetes_pod_label_app]
regex: prometheus
action: keep
# Keep only pods labelled component=alertmanager.
- source_labels: [__meta_kubernetes_pod_label_component]
regex: alertmanager
action: keep
# Match the pod's prometheus.io/probe annotation against the value configured
# in prometheus.alertmanager.podAnnotations; when that value is unset or empty
# the regex falls back to ".*".
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_probe]
regex: {{ index .Values.prometheus.alertmanager.podAnnotations "prometheus.io/probe" | default ".*" }}
action: keep
# Keep only the container port numbered 9093.
- source_labels: [__meta_kubernetes_pod_container_port_number]
regex: "9093"
action: keep
{{- end }}

scrape_configs:
- job_name: pipecd-gateway
scrape_interval: 1m
Expand Down