Skip to content
This repository has been archived by the owner on May 5, 2024. It is now read-only.

Refactor prom rules #947

Merged
merged 3 commits into from
Apr 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions .github/scripts/validate-kustomize.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env bash

# This script downloads the Flux OpenAPI schemas, then it validates the
# Flux custom resources and the kustomize overlays using kubeval.
# This script is meant to be run locally and in CI before the changes
# are merged on the main branch that's synced by Flux.

# Copyright 2020 The Flux authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is meant to be run locally and in CI to validate the Kubernetes
# manifests (including Flux custom resources) before changes are merged into
# the branch synced by Flux in-cluster.

# Prerequisites
# - yq v4.6
# - kustomize v4.1
# - kubeval v0.15.x

# Abort on the first failing command, and make a pipeline fail when ANY of
# its stages fails (not only the last one). Without pipefail, an error in the
# `find` that feeds the validation loops below would be silently ignored and
# the script would report success without validating anything.
set -o errexit
set -o pipefail

# NOTE: the schema download below is currently commented out, so this step
# only creates the directory tree that kubeval is later pointed at via
# --additional-schema-locations.
echo "INFO - Downloading Flux OpenAPI schemas"
mkdir -p /tmp/flux-crd-schemas/master-standalone-strict
#curl -sL https://github.com/fluxcd/flux2/releases/latest/download/crd-schemas.tar.gz | tar zxf - -C /tmp/flux-crd-schemas/master-standalone-strict

# mirror kustomize-controller build options
# (kept as a single string; it is word-split on purpose where it is used)
kustomize_flags="--load-restrictor=LoadRestrictionsNone --reorder=legacy"
# file name that marks a directory as a kustomize overlay
kustomize_config="kustomization.yaml"


# find . -type f -name '*.yaml' -print0 | while IFS= read -r -d $'\0' file;
# do
# echo "INFO - Validating $file"
# yq -e 'true' "$file" > /dev/null
# done

#######################################
# Validate the cluster manifests with kubeval.
# Only regular '*.yaml' files located directly inside ./k8s/clusters are
# checked (find is limited to depth 1).
# Arguments: none
# Outputs:   progress line and kubeval output on stdout/stderr
# Returns:   0 when every file validates, 1 as soon as kubeval rejects a file
#######################################
validate_clusters() {
  local file
  echo "INFO - Validating clusters"
  # -maxdepth is a global find option: it must precede the tests
  # (-type/-name), otherwise GNU find prints a warning on every run.
  find ./k8s/clusters -maxdepth 1 -type f -name '*.yaml' -print0 \
    | while IFS= read -r -d '' file; do
        # Test kubeval's status directly instead of relying on errexit or on
        # PIPESTATUS (which is meant for pipelines, not single commands).
        if ! kubeval "${file}" --strict --ignore-missing-schemas \
          --additional-schema-locations=file:///tmp/flux-crd-schemas; then
          # Leaves the pipeline subshell; the non-zero status becomes the
          # function's return value.
          exit 1
        fi
      done
}

validate_clusters || exit 1

#######################################
# Build every kustomize overlay below the current directory and validate the
# rendered manifests with kubeval.
# Globals:   kustomize_config (read) - file name identifying an overlay
#            kustomize_flags  (read) - extra flags for `kustomize build`
# Arguments: none
# Outputs:   progress lines and tool output on stdout/stderr
# Returns:   0 when every overlay builds and validates, 1 on the first
#            overlay for which kustomize or kubeval fails
#######################################
validate_kustomize_overlays() {
  local file overlay_dir rendered
  echo "INFO - Validating kustomize overlays"
  find . -type f -name "${kustomize_config}" -print0 \
    | while IFS= read -r -d '' file; do
        # Strip the trailing config file name to get the overlay directory.
        overlay_dir="${file%"${kustomize_config}"}"
        echo "INFO - Validating kustomization ${overlay_dir}"
        # Render first and check kustomize's own status explicitly.
        # kustomize_flags is deliberately unquoted so that it splits into
        # separate arguments.
        # shellcheck disable=SC2086
        if ! rendered="$(kustomize build "${overlay_dir}" ${kustomize_flags})"; then
          exit 1
        fi
        # Secrets are ignored with --skip-kinds due to using SOPS with FluxCD
        if ! printf '%s\n' "${rendered}" \
          | kubeval --ignore-missing-schemas --strict \
            --additional-schema-locations=file:///tmp/flux-crd-schemas \
            --skip-kinds Secret; then
          exit 1
        fi
      done
}

validate_kustomize_overlays || exit 1
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,8 @@ repos:
hooks:
- id: fix-smartquotes
- id: fix-ligatures

- repo: https://github.com/zricethezav/gitleaks
rev: v8.5.3
hooks:
- id: gitleaks
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ kind: Kustomization
resources:
- ./cert-manager.yaml
- ./rook-ceph.yaml
# # - ./dev.yaml
- ./downloads.yaml
- ./databases.yaml
- ./flux-system.yaml
Expand Down
17 changes: 0 additions & 17 deletions k8s/clusters/hegira/flux/orchestration/rook-ceph.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,20 +96,3 @@ spec:
sourceRef:
kind: GitRepository
name: home-cluster
---
apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
kind: Kustomization
metadata:
name: rook-ceph-monitoring
namespace: flux-system
spec:
dependsOn:
- name: rook-ceph-namespace
- name: rook-ceph-operator
interval: 5m
path: "./k8s/manifests/rook-ceph/monitoring"
prune: true
wait: true
sourceRef:
kind: GitRepository
name: home-cluster
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@ kind: Kustomization
resources:
- ./secret.sops.yaml
- ./clusterissuer.yaml
- ./prometheusrule.yaml
1 change: 0 additions & 1 deletion k8s/manifests/databases/postgresql/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@ kind: Kustomization
resources:
- ./config-pvc.yaml
- ./helmrelease.yaml
- ./monitoring

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- blueprints
- monitoring
- policies
- profiles

This file was deleted.

This file was deleted.

1 change: 0 additions & 1 deletion k8s/manifests/kube-system/cilium/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml
- servicemonitor.yaml
1 change: 0 additions & 1 deletion k8s/manifests/kube-system/descheduler/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml
- monitoring
2 changes: 1 addition & 1 deletion k8s/manifests/media/plex/helmrelease.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ spec:
external-dns.alpha.kubernetes.io/cloudflare-proxied: "true"
hajimari.io/enable: "true"
hajimari.io/icon: plex
hajimari.io/appName: plex
hajimari.io/appName: plex # TODO #942 fix hajimari ingress to /web/index (base is 401)
hosts:
- host: "plex.${CLUSTER_DOMAIN}"
paths:
Expand Down
2 changes: 1 addition & 1 deletion k8s/manifests/media/plex/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./pvc.yaml
- ./config-pvc.yaml
- ./helmrelease.yaml
# - ./probe.yaml
- ./endpoint-monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml
- prometheusrule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- prometheusrule.yaml
- cert-manager.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- postgresql.yaml
- redis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,23 @@ spec:

podMetricsEndpoints:
- port: http-prom
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: flux-alert-rules
namespace: flux-system
labels:
prometheus: flux-rules
role: alert-rules
spec:
groups:
- name: GitOpsToolkit
rules:
- alert: ReconciliationFailure
expr: max(gotk_reconcile_condition{status="False",type="Ready"}) by (namespace, name, kind) + on(namespace, name, kind) (max(gotk_reconcile_condition{status="Deleted"}) by (namespace, name, kind)) * 2 == 1
for: 10m
labels:
severity: page
annotations:
summary: "{{ $labels.kind }} {{ $labels.namespace }}/{{ $labels.name }} reconciliation has been failing for more than ten minutes."
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- flux-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,26 @@ spec:
for: 1m
labels:
severity: critical
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: k10
namespace: kasten-io
spec:
namespaceSelector:
matchNames:
- kasten-io
selector:
matchLabels:
app: prometheus
endpoints:
- port: http
scheme: http
path: /k10/prometheus/federate
honorLabels: true
interval: 15s
params:
"match[]":
- '{__name__=~"jobs.*"}'
- '{__name__=~"catalog.*"}'
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- alert.yaml
- k10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,11 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- node-exporter.yaml
- upsc.yaml
- cert-manager
- databases
- kasten-io
- network-system
- rook-ceph
- services
- flux-system
- system-monitoring
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- cilium.yaml
- descheduler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- metrics.yaml
- rook-ceph.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- alerts.yaml
- minio.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: minio-rules
namespace: cert-manager
spec:
groups:
- name: minio.rules
rules:
- alert: MinioS3Errors
expr: |
increase(minio_s3_requests_errors_total[5m]) > 5
for: 0m
labels:
severity: critical
annotations:
description: "Minio is experiencing increased S3 errors. Operations
may not occur as expected, and service may be impacted"
summary: "Minio is experiencing elevated S3 errors."
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- node-exporter.yaml
- blackbox-exporter.yaml
- loki.yaml
- prom-smartctl.yaml
- thanos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus-smartctl
# FIXME - needs alerts!
# FIXME #943 - needs alerts!
1 change: 0 additions & 1 deletion k8s/manifests/system-monitoring/loki/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml
- prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@ kind: Kustomization
resources:
- ./daemonset.yaml
- ./service.yaml
- ./servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- dashboard
- monitoring
- secret.sops.yaml
- helmrelease.yaml

This file was deleted.