diff --git a/charts/base-cluster/templates/flux/rules/flux-status.yaml b/charts/base-cluster/templates/flux/rules/flux-status.yaml index ddfae6cdb3..5141f86b1f 100644 --- a/charts/base-cluster/templates/flux/rules/flux-status.yaml +++ b/charts/base-cluster/templates/flux/rules/flux-status.yaml @@ -21,17 +21,27 @@ spec: description: {{ "Resource {{ $labels.customresource_kind }}/{{ $labels.exported_namespace }}/{{ $labels.name }} has been in a failed state for longer than 10 minutes." | quote }} summary: Resource has failed. expr: |- - gotk_reconcile_condition_info{type="Ready",status="False"} == 1 + min(gotk_reconcile_condition_gauge{type="Ready"}) by (customresource_kind, exported_namespace, name) == 0 for: 10m labels: severity: critical period: WorkingHours + - alert: ResourcesSuspended + annotations: + description: {{ "Resource {{ $labels.customresource_kind }}/{{ $labels.exported_namespace }}/{{ $labels.name }} has been suspended for longer than 1 hour." | quote }} + summary: Resource is suspended. + expr: |- + max(gotk_reconcile_suspended_gauge) by (customresource_kind, exported_namespace, name) == 1 + for: 1h + labels: + severity: critical + period: WorkingHours - alert: MetricsMissing annotations: description: The flux metrics are missing summary: The flux metrics are missing expr: |- - absent(gotk_reconcile_condition_info) == 1 + (absent(gotk_reconcile_condition_gauge) or absent(gotk_reconcile_suspended_gauge)) == 1 for: 5m labels: severity: critical diff --git a/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_kube-state-metrics-config.yaml b/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_kube-state-metrics-config.yaml index 4fe6e09e81..851ad217e0 100644 --- a/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_kube-state-metrics-config.yaml +++ b/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_kube-state-metrics-config.yaml @@ -39,23 +39,21 @@ customResourceState: config: spec: {{- $types := dict - "Bucket" (dict "group" "source" "version" "v1beta2") - "GitRepository" (dict "group" "source" "version" "v1") - "HelmChart" (dict "group" "source" "version" "v1") - "HelmRelease" (dict "group" "helm" "version" "v2") - "HelmRepository" (dict "group" "source" "version" "v1") - "ImagePolicy" (dict "group" "image" "version" "v1beta2") - "ImageRepository" (dict "group" "image" "version" "v1beta2") - "ImageUpdateAutomation" (dict "group" "image" "version" "v1beta2") - "Kustomization" (dict "group" "kustomize" "version" "v1") - "OCIRepository" (dict "group" "source" "version" "v1beta2") - "Receiver" (dict "group" "notification" "version" "v1") + "Bucket" "source" + "GitRepository" "source" + "HelmChart" "source" + "HelmRelease" "helm" + "HelmRepository" "source" + "ImagePolicy" "image" + "ImageRepository" "image" + "ImageUpdateAutomation" "image" + "Kustomization" "kustomize" + "OCIRepository" "source" + "Receiver" "notification" -}} {{- $resources := list -}} - {{- range $kind, $spec := $types -}} - {{- $group := $spec.group -}} - {{- $version := $spec.version -}} + {{- range $kind, $group := $types -}} {{- $resources = append $resources (dict "groupVersionKind" (dict "group" (printf "%s.toolkit.fluxcd.io" $group) @@ -64,16 +62,31 @@ customResourceState: "metricNamePrefix" "gotk" "metrics" (list (dict - "name" "reconcile_condition_info" + "name" "reconcile_condition_gauge" "help" (printf "The current conditions of a Flux %s resource." $kind) "each" (dict - "type" "Info" - "info" (dict + "type" "Gauge" + "gauge" (dict "path" (list "status" "conditions") "labelsFromPath" (dict "type" (list "type") - "status" (list "status") ) + "valueFrom" (list "status") + ) + ) + "labelsFromPath" (dict + "exported_namespace" (list "metadata" "namespace") + "name" (list "metadata" "name") + ) + ) + (dict + "name" "reconcile_suspended_gauge" + "help" (printf "The reconciliation suspended status of a Flux %s resource." $kind) + "each" (dict + "type" "Gauge" + "gauge" (dict + "path" (list "spec") + "valueFrom" (list "suspend") ) ) "labelsFromPath" (dict