diff --git a/kubernetes/staging/apps/rook-ceph/kustomization.yaml b/kubernetes/staging/apps/rook-ceph/kustomization.yaml
new file mode 100644
index 0000000000..af910b7de9
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/kustomization.yaml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - namespace.yaml
+  - rook-ceph/install.yaml
diff --git a/kubernetes/staging/apps/rook-ceph/namespace.yaml b/kubernetes/staging/apps/rook-ceph/namespace.yaml
new file mode 100644
index 0000000000..1696c56ee6
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/namespace.yaml
@@ -0,0 +1,5 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: rook-ceph
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/add-ons/kustomization.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/add-ons/kustomization.yaml
new file mode 100644
index 0000000000..c59b7c4324
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/add-ons/kustomization.yaml
@@ -0,0 +1,6 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - rules.yaml
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/add-ons/rules.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/add-ons/rules.yaml
new file mode 100644
index 0000000000..f55a4ae9ce
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/add-ons/rules.yaml
@@ -0,0 +1,904 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    prometheus: rook-prometheus
+    role: alert-rules
+  name: prometheus-ceph-rules
+  namespace: rook-ceph
+spec:
+  # Import the raw prometheus rules since they have descriptions that should not be processed with the helm templates
+  # copied from https://github.com/ceph/ceph/blob/master/monitoring/ceph-mixin/prometheus_alerts.yml
+  groups:
+    - name: cluster health
+      rules:
+        - alert: CephHealthError
+          expr: ceph_health_status == 2
+          for: 5m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.2.1
+          annotations:
+            summary: Cluster is in an ERROR state
+            description: >
+              Ceph has been in HEALTH_ERROR state for more than 5 minutes.
+              Please check "ceph health detail" for more information.
+
+        - alert: CephHealthWarning
+          expr: ceph_health_status == 1
+          for: 15m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            summary: Cluster is in a WARNING state
+            description: >
+              Ceph has been in HEALTH_WARN for more than 15 minutes.
+              Please check "ceph health detail" for more information.
+
+    - name: mon
+      rules:
+        - alert: CephMonDownQuorumAtRisk
+          expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
+          for: 30s
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.3.1
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+            summary: Monitor quorum is at risk
+            description: |
+              {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active
+              Without quorum the cluster will become inoperable, affecting all connected clients and services.
+
+              The following monitors are down:
+              {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
+                - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+              {{- end }}
+        - alert: CephMonDown
+          expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
+          for: 30s
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+            summary: One or more ceph monitors are down
+            description: |
+              {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down.
+              Quorum is still intact, but the loss of further monitors will make your cluster inoperable.
+
+              The following monitors are down:
+              {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
+                - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+              {{- end }}
+        - alert: CephMonDiskspaceCritical
+          expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.3.2
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
+            summary: Disk space on at least one monitor is critically low
+            description: |
+              The free space available to a monitor's store is critically low (<5% by default).
+              You should increase the space available to the monitor(s). The
+              default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+              {{- range query "ceph_mon_metadata"}}
+                - {{ .Labels.hostname }}
+              {{- end }}
+
+        - alert: CephMonDiskspaceLow
+          expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
+            summary: Disk space on at least one monitor is approaching full
+            description: |
+              The space available to a monitor's store is approaching full (>70% is the default).
+              You should increase the space available to the monitor store. The
+              default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+              {{- range query "ceph_mon_metadata"}}
+                - {{ .Labels.hostname }}
+              {{- end }}
+
+        - alert: CephMonClockSkew
+          expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
+            summary: Clock skew across the Monitor hosts detected
+            description: |
+              The ceph monitors rely on a consistent time reference to maintain
+              quorum and cluster consistency. This event indicates that at least
+              one of your mons is not synced correctly.
+
+              Review the cluster status with ceph -s. This will show which monitors
+              are affected. Check the time sync status on each monitor host.
+
+    - name: osd
+      rules:
+        - alert: CephOSDDownHigh
+          expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.1
+          annotations:
+            summary: More than 10% of OSDs are down
+            description: |
+              {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%).
+
+              The following OSDs are down:
+              {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
+                - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+              {{- end }}
+        - alert: CephOSDHostDown
+          expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.8
+          annotations:
+            summary: An OSD host is offline
+            description: |
+              The following OSDs are down:
+              {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
+                - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }}
+              {{- end }}
+        - alert: CephOSDDown
+          expr: ceph_health_detail{name="OSD_DOWN"} == 1
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.2
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
+            summary: An OSD has been marked down/unavailable
+            description: |
+              {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5 minutes.
+
+              The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
+              {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
+                - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+              {{- end }}
+
+        - alert: CephOSDNearFull
+          expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.3
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
+            summary: OSD(s) running low on free space (NEARFULL)
+            description: |
+              One or more OSDs have reached their NEARFULL threshold
+
+              Use 'ceph health detail' to identify which OSDs have reached this threshold.
+              To resolve, either add capacity to the cluster, or delete unwanted data
+        - alert: CephOSDFull
+          expr: ceph_health_detail{name="OSD_FULL"} > 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.6
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
+            summary: OSD(s) is full, writes blocked
+            description: |
+              An OSD has reached its full threshold. Writes from all pools that share the
+              affected OSD will be blocked.
+
+              To resolve, either add capacity to the cluster, or delete unwanted data
+        - alert: CephOSDBackfillFull
+          expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
+            summary: OSD(s) too full for backfill operations
+            description: |
+              An OSD has reached its BACKFILL FULL threshold. This will prevent rebalance operations
+              from completing for some pools. Check the current capacity utilisation with 'ceph df'
+
+              To resolve, either add capacity to the cluster, or delete unwanted data
+        - alert: CephOSDTooManyRepairs
+          expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
+          for: 30s
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
+            summary: OSD has hit a high number of read errors
+            description: |
+              Reads from an OSD have used a secondary PG to return data to the client, indicating
+              a potential failing disk.
+        - alert: CephOSDTimeoutsPublicNetwork
+          expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            summary: Network issues delaying OSD heartbeats (public network)
+            description: |
+              OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
+              for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+        - alert: CephOSDTimeoutsClusterNetwork
+          expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            summary: Network issues delaying OSD heartbeats (cluster network)
+            description: |
+              OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
+              for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+        - alert: CephOSDInternalDiskSizeMismatch
+          expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
+            summary: OSD size inconsistency error
+            description: |
+              One or more OSDs have an internal inconsistency between the size of the physical device and its metadata.
+              This could lead to the OSD(s) crashing in the future. You should redeploy the affected OSDs.
+        - alert: CephDeviceFailurePredicted
+          expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
+            summary: Device(s) have been predicted to fail soon
+            description: |
+              The device health module has determined that one or more devices will fail
+              soon. To review the device states use 'ceph device ls'. To show a specific
+              device use 'ceph device info '.
+
+              Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once
+              the OSD is empty, remove and replace the OSD.
+        - alert: CephDeviceFailurePredictionTooHigh
+          expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.7
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
+            summary: Too many devices have been predicted to fail, unable to resolve
+            description: |
+              The device health module has determined that the number of devices predicted to
+              fail cannot be remediated automatically, since it would take too many OSDs out of
+              the cluster, impacting performance and potentially availability. You should add new
+              OSDs to the cluster to allow data to be relocated and avoid data integrity issues.
+        - alert: CephDeviceFailureRelocationIncomplete
+          expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
+            summary: A device failure is predicted, but unable to relocate data
+            description: |
+              The device health module has determined that one or more devices will fail
+              soon, but the normal process of relocating the data on the device to other
+              OSDs in the cluster is blocked.
+
+              Check that the cluster has available free space. It may be necessary to add
+              more disks to the cluster to allow the data from the failing device to
+              successfully migrate.
+
+        - alert: CephOSDFlapping
+          expr: |
+            (
+              rate(ceph_osd_up[5m])
+              * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
+            ) * 60 > 1
+          labels:
+            severity: warning
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.4
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
+            summary: Network issues are causing OSDs to flap (mark each other out)
+            description: >
+              OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
+              marked down and back up {{ $value | humanize }} times once a
+              minute for 5 minutes. This could indicate a network issue (latency,
+              packet drop, disruption) on the cluster's "cluster network". Check the
+              network environment on the listed host(s).
+
+        - alert: CephOSDReadErrors
+          expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
+          for: 30s
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
+            summary: Device read errors detected
+            description: >
+              An OSD has encountered read errors, but the OSD has recovered by retrying
+              the reads. This may indicate an issue with the hardware or kernel.
+        # alert on high deviation from average PG count
+        - alert: CephPGImbalance
+          expr: |
+            abs(
+              (
+                (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+              ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+            ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.4.5
+          annotations:
+            summary: PG allocations are not balanced across devices
+            description: >
+              OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
+              by more than 30% from average PG count.
+        # alert on high commit latency...but how high is too high
+
+    - name: mds
+      rules:
+        - alert: CephFilesystemDamaged
+          expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.5.1
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+            summary: Ceph filesystem is damaged.
+            description: >
+              The filesystem's metadata has been corrupted. Data access
+              may be blocked.
+
+              Either analyse the output from the mds daemon admin socket, or
+              escalate to support
+        - alert: CephFilesystemOffline
+          expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.5.3
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
+            summary: Ceph filesystem is offline
+            description: >
+              All MDS ranks are unavailable. The ceph daemons providing the metadata
+              for the Ceph filesystem are all down, rendering the filesystem offline.
+        - alert: CephFilesystemDegraded
+          expr: ceph_health_detail{name="FS_DEGRADED"} > 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.5.4
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
+            summary: Ceph filesystem is degraded
+            description: >
+              One or more metadata daemons (MDS ranks) are failed or in a
+              damaged state. At best the filesystem is partially available; at
+              worst, the filesystem is completely unusable.
+        - alert: CephFilesystemMDSRanksLow
+          expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
+            summary: Ceph MDS daemon count is lower than configured
+            description: >
+              The filesystem's "max_mds" setting defines the number of MDS ranks in
+              the filesystem. The current number of active MDS daemons is less than
+              this setting.
+        - alert: CephFilesystemInsufficientStandby
+          expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
+            summary: Ceph filesystem standby daemons too low
+            description: >
+              The minimum number of standby daemons determined by standby_count_wanted
+              is less than the actual number of standby daemons. Adjust the standby count
+              or increase the number of mds daemons within the filesystem.
+        - alert: CephFilesystemFailureNoStandby
+          expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.5.5
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
+            summary: Ceph MDS daemon failed, no further standby available
+            description: >
+              An MDS daemon has failed, leaving only one active rank without
+              further standby. Investigate the cause of the failure or add a
+              standby daemon.
+        - alert: CephFilesystemReadOnly
+          expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.5.2
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+            summary: Ceph filesystem in read only mode, due to write error(s)
+            description: >
+              The filesystem has switched to READ ONLY due to an unexpected
+              write error when writing to the metadata pool
+
+              Either analyse the output from the mds daemon admin socket, or
+              escalate to support
+
+    - name: mgr
+      rules:
+        - alert: CephMgrModuleCrash
+          expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
+          for: 5m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.6.1
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
+            summary: A mgr module has recently crashed
+            description: >
+              One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
+              crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
+              investigate which module has failed, and archive it to acknowledge the failure.
+        - alert: CephMgrPrometheusModuleInactive
+          expr: up{job="ceph"} == 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.6.2
+          annotations:
+            summary: Ceph's mgr/prometheus module is not available
+            description: >
+              The mgr/prometheus module at {{ $labels.instance }} is unreachable. This
+              could mean that the module has been disabled or the mgr itself is down.
+
+              Without the mgr/prometheus module, metrics and alerts will no longer
+              function. Open a shell to ceph and use 'ceph -s' to determine whether the
+              mgr is active. If the mgr is not active, restart it, otherwise you can check
+              the mgr/prometheus module is loaded with 'ceph mgr module ls' and if it's
+              not listed as enabled, enable it with 'ceph mgr module enable prometheus'
+
+    - name: pgs
+      rules:
+        - alert: CephPGsInactive
+          expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
+          for: 5m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.7.1
+          annotations:
+            summary: One or more Placement Groups are inactive
+            description: >
+              {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
+              Inactive placement groups aren't able to serve read/write
+              requests.
+        - alert: CephPGsUnclean
+          expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
+          for: 15m
+          labels:
+            severity: warning
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.7.2
+          annotations:
+            summary: One or more placement groups are marked unclean
+            description: >
+              {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
+              Unclean PGs haven't been able to completely recover from a previous failure.
+        - alert: CephPGsDamaged
+          expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
+          for: 5m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.7.4
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
+            summary: Placement group damaged, manual intervention needed
+            description: >
+              During data consistency checks (scrub), at least one PG has been flagged as being
+              damaged or inconsistent.
+
+              Check to see which PG is affected, and attempt a manual repair if necessary. To list
+              problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use
+              the 'ceph pg repair ' command.
+        - alert: CephPGRecoveryAtRisk
+          expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.7.5
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
+            summary: OSDs are too full for automatic recovery
+            description: >
+              Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
+              'full' threshold. Add more capacity to the cluster, or delete unwanted data.
+        - alert: CephPGUnavailableBlockingIO
+          # PG_AVAILABILITY, but an OSD is not in a DOWN state
+          expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.7.3
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
+            summary: Placement group is unavailable, blocking some I/O
+            description: >
+              Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
+              more placement groups (PGs) are in a state that blocks IO.
+        - alert: CephPGBackfillAtRisk
+          expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.7.6
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
+            summary: Backfill operations are blocked, due to lack of free space
+            description: >
+              Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
+              have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
+        - alert: CephPGNotScrubbed
+          expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
+            summary: Placement group(s) have not been scrubbed
+            description: |
+              One or more PGs have not been scrubbed recently. The scrub process is a data integrity
+              feature, protecting against bit-rot. It checks that objects and their metadata (size and
+              attributes) match across object replicas. When PGs miss their scrub window, it may
+              indicate the scrub window is too small, or PGs were not in a 'clean' state during the
+              scrub window.
+
+              You can manually initiate a scrub with: ceph pg scrub 
+        - alert: CephPGsHighPerOSD
+          expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
+            summary: Placement groups per OSD is too high
+            description: |
+              The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).
+
+              Check that the pg_autoscaler hasn't been disabled for any of the pools, with 'ceph osd pool autoscale-status'
+              and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
+              the autoscaler based on the expected relative size of the pool
+              (e.g. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1')
+        - alert: CephPGNotDeepScrubbed
+          expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
+            summary: Placement group(s) have not been deep scrubbed
+            description: |
+              One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
+              feature, protecting against bit-rot. It compares the contents of objects and their
+              replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
+              that the window is too small or PGs were not in a 'clean' state during the deep-scrub
+              window.
+
+              You can manually initiate a deep scrub with: ceph pg deep-scrub 
+
+    - name: nodes
+      rules:
+        - alert: CephNodeRootFilesystemFull
+          expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
+          for: 5m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.8.1
+          annotations:
+            summary: Root filesystem is dangerously full
+            description: >
+              Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
+
+        # alert on nic packet errors and drops rates > 1% packets/s
+        # - alert: CephNodeNetworkPacketDrops
+        #   expr: |
+        #     (
+        #       increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+        #       increase(node_network_transmit_drop_total{device!="lo"}[1m])
+        #     ) / (
+        #       increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+        #       increase(node_network_transmit_packets_total{device!="lo"}[1m])
+        #     ) >= 0.001 or (
+        #       increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+        #       increase(node_network_transmit_drop_total{device!="lo"}[1m])
+        #     ) >= 50
+        #   for: 5m
+        #   labels:
+        #     severity: warning
+        #     type: ceph_default
+        #     oid: 1.3.6.1.4.1.50495.1.2.1.8.2
+        #   annotations:
+        #     summary: One or more NICs are seeing packet drops
+        #     description: >
+        #       Node {{ $labels.instance }} experiences packet drop > 0.01% or >
+        #       50 packets/s on interface {{ $labels.device }}.
+
+        - alert: CephNodeNetworkPacketErrors
+          expr: |
+            (
+              increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+              increase(node_network_transmit_errs_total{device!="lo"}[1m])
+            ) / (
+              increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+              increase(node_network_transmit_packets_total{device!="lo"}[1m])
+            ) >= 0.0001 or (
+              increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+              increase(node_network_transmit_errs_total{device!="lo"}[1m])
+            ) >= 10
+          labels:
+            severity: warning
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.8.3
+          annotations:
+            summary: One or more NICs are seeing packet errors
+            description: >
+              Node {{ $labels.instance }} experiences packet errors > 0.01% or
+              > 10 packets/s on interface {{ $labels.device }}.
+
+        # Restrict to device names beginning with '/' to skip false alarms from
+        # tmpfs, overlay type filesystems
+        # - alert: CephNodeDiskspaceWarning
+        #   expr: |
+        #     predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
+        #     on(instance) group_left(nodename) node_uname_info < 0
+        #   for: 30m
+        #   labels:
+        #     severity: warning
+        #     type: ceph_default
+        #     oid: 1.3.6.1.4.1.50495.1.2.1.8.4
+        #   annotations:
+        #     summary: Host filesystem free space is getting low
+        #     description: >
+        #       Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
+        #       will be full in less than 5 days assuming the average fill-up
+        #       rate of the past 48 hours.
+
+        # - alert: CephNodeInconsistentMTU
+        #   expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
+        #   labels:
+        #     severity: warning
+        #     type: ceph_default
+        #   annotations:
+        #     summary: MTU settings across Ceph hosts are inconsistent
+        #     description: >
+        #       Node {{ $labels.instance }} has a different MTU size ({{ $value }})
+        #       than the median value on device {{ $labels.device }}.
+
+    - name: pools
+      rules:
+        # - alert: CephPoolGrowthWarning
+        #   expr: |
+        #     (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
+        #     group_right ceph_pool_metadata) >= 95
+        #   labels:
+        #     severity: warning
+        #     type: ceph_default
+        #     oid: 1.3.6.1.4.1.50495.1.2.1.9.2
+        #   annotations:
+        #     summary: Pool growth rate may soon exceed its capacity
+        #     description: >
+        #       Pool '{{ $labels.name }}' will be full in less than 5 days
+        #       assuming the average fill-up rate of the past 48 hours.
+        - alert: CephPoolBackfillFull
+          expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            summary: Free space in a pool is too low for recovery/rebalance
+            description: >
+              A pool is approaching its near full threshold, which will
+              prevent rebalance operations from completing. You should
+              consider adding more capacity to the pool.
+
+        - alert: CephPoolFull
+          expr: ceph_health_detail{name="POOL_FULL"} > 0
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.9.1
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
+            summary: Pool is full - writes are blocked
+            description: |
+              A pool has reached its MAX quota, or the OSDs supporting the pool
+              have reached their FULL threshold. Until this is resolved, writes to
+              the pool will be blocked.
+              Pool Breakdown (top 5)
+              {{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }}
+                - {{ .Labels.name }} at {{ .Value }}%
+              {{- end }}
+              Either increase the pool's quota, or add capacity to the cluster first
+              then increase its quota (e.g. ceph osd pool set quota max_bytes )
+        - alert: CephPoolNearFull
+          expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
+          for: 5m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            summary: One or more Ceph pools are getting full
+            description: |
+              A pool has exceeded its warning (percent full) threshold, or the OSDs
+              supporting the pool have reached their NEARFULL thresholds. Writes may
+              continue, but you are at risk of the pool going read only if more capacity
+              isn't made available.
+
+              Determine the affected pool with 'ceph df detail', for example looking
+              at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+              capacity to the cluster first then increase its quota
+              (e.g. ceph osd pool set quota max_bytes )
+    - name: healthchecks
+      rules:
+        - alert: CephSlowOps
+          expr: ceph_healthcheck_slow_ops > 0
+          for: 30s
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+            summary: MON/OSD operations are slow to complete
+            description: >
+              {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
+    # cephadm alerts
+    - name: cephadm
+      rules:
+        - alert: CephadmUpgradeFailed
+          expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
+          for: 30s
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.11.2
+          annotations:
+            summary: Ceph version upgrade has failed
+            description: >
+              The cephadm cluster upgrade process has failed. The cluster remains in
+              an undetermined state.
+
+              Please review the cephadm logs to understand the nature of the issue
+        - alert: CephadmDaemonFailed
+          expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
+          for: 30s
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.11.1
+          annotations:
+            summary: A ceph daemon managed by cephadm is down
+            description: >
+              A daemon managed by cephadm is no longer active. Determine which
+              daemon is down with 'ceph health detail'. You may start daemons with
+              the 'ceph orch daemon start '
+        - alert: CephadmPaused
+          expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
+          for: 1m
+          labels:
+            severity: warning
+            type: ceph_default
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
+            summary: Orchestration tasks via cephadm are PAUSED
+            description: >
+              Cluster management has been paused manually. This will prevent the
+              orchestrator from performing service management and reconciliation. If this is
+              not intentional, resume cephadm operations with 'ceph orch resume'
+
+    # prometheus alerts
+    # - name: PrometheusServer
+    #   rules:
+    #     - alert: PrometheusJobMissing
+    #       expr: absent(up{job="ceph"})
+    #       for: 30s
+    #       labels:
+    #         severity: critical
+    #         type: ceph_default
+    #         oid: 1.3.6.1.4.1.50495.1.2.1.12.1
+    #       annotations:
+    #         summary: The scrape job for Ceph is missing from Prometheus
+    #         description: |
+    #           The prometheus job that scrapes from Ceph is no longer defined, this
+    #           will effectively mean you'll have no metrics or alerts for the cluster.
+    #
+    #           Please review the job definitions in the prometheus.yml file of the prometheus
+    #           instance.
+    # Object related events
+    - name: rados
+      rules:
+        - alert: CephObjectMissing
+          expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
+          for: 30s
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.10.1
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
+            summary: Object(s) has been marked UNFOUND
+            description: |
+              A version of a RADOS object cannot be found, even though all OSDs are up. I/O
+              requests for this object from clients will block (hang). Resolving this issue may
+              require the object to be rolled back to a prior version manually, and manually verified.
+    # Generic
+    - name: generic
+      rules:
+        - alert: CephDaemonCrash
+          expr: ceph_health_detail{name="RECENT_CRASH"} == 1
+          for: 1m
+          labels:
+            severity: critical
+            type: ceph_default
+            oid: 1.3.6.1.4.1.50495.1.2.1.1.2
+          annotations:
+            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
+            summary: One or more Ceph daemons have crashed, and are pending acknowledgement
+            description: |
+              One or more daemons have crashed recently, and need to be acknowledged. This notification
+              ensures that software crashes don't go unseen. To acknowledge a crash, use the
+              'ceph crash archive ' command.
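The PrometheusRule above is only evaluated if the in-cluster Prometheus actually selects it. Its `prometheus: rook-prometheus` and `role: alert-rules` labels only matter when the Prometheus instance filters rules through a `ruleSelector`; a default kube-prometheus-stack install typically selects every PrometheusRule in the cluster, in which case no extra wiring is needed. A minimal sketch of the matching selection, assuming a prometheus-operator managed instance (the instance name and namespace here are illustrative, not part of this change):

```yaml
# Illustrative sketch: a prometheus-operator Prometheus instance whose
# ruleSelector matches the labels on the PrometheusRule in rules.yaml.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: rook-prometheus # hypothetical instance name
  namespace: rook-ceph
spec:
  ruleSelector:
    matchLabels:
      prometheus: rook-prometheus
      role: alert-rules
```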
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/cluster/helm-release.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/cluster/helm-release.yaml
new file mode 100644
index 0000000000..01566854c6
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/cluster/helm-release.yaml
@@ -0,0 +1,189 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.zinn.ca/helm.toolkit.fluxcd.io/helmrelease_v2.json
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: rook-ceph-cluster
+spec:
+  interval: 15m
+  chart:
+    spec:
+      chart: rook-ceph-cluster
+      version: v1.16.1
+      sourceRef:
+        kind: HelmRepository
+        name: rook-ceph-charts
+        namespace: flux-system
+      interval: 15m
+  maxHistory: 3
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
+  dependsOn:
+    - name: rook-ceph-operator
+      namespace: rook-ceph
+    - name: snapshot-controller
+      namespace: volsync-system
+  values:
+    toolbox:
+      enabled: true
+    monitoring:
+      enabled: true
+      createPrometheusRules: true
+    configOverride: |
+      [global]
+      bdev_enable_discard = true
+      bdev_async_discard = true
+
+    cephClusterSpec:
+      cephVersion:
+        image: quay.io/ceph/ceph:v19.2.0@sha256:200087c35811bf28e8a8073b15fa86c07cce85c575f1ccd62d1d6ddbfdc6770a
+
+      resources:
+        mgr:
+          requests:
+            cpu: "125m"
+            memory: "549M"
+          limits:
+            memory: "1219M"
+        mon:
+          requests:
+            cpu: "49m"
+            memory: "477M"
+        osd:
+          requests:
+            cpu: "442m"
+            memory: "2678M"
+          limits:
+            memory: "5944M"
+        prepareosd:
+          requests:
+            cpu: "250m"
+            memory: "50Mi"
+          limits:
+            memory: "200Mi"
+        mgr-sidecar:
+          requests:
+            cpu: "49m"
+            memory: "94M"
+          limits:
+            memory: "200M"
+        crashcollector:
+          requests:
+            cpu: "15m"
+            memory: "64M"
+          limits:
+            memory: "64M"
+        logcollector:
+          requests:
+            cpu: "100m"
+            memory: "100M"
+          limits:
+            memory: "1G"
+        cleanup:
+          requests:
+            cpu: "250m"
+            memory: "100M"
+          limits:
+            memory: "1G"
+
+      crashCollector:
+        disable: false
+
+      dashboard:
+        enabled: true
+        urlPrefix: /
+        ssl: false
+
+      storage:
+        useAllNodes: false
+        useAllDevices: false
+        config:
+          osdsPerDevice: "1"
+        nodes:
+          - name: "stage-1"
+            devices:
+              - name: "sdb"
+          - name: "stage-2"
+            devices:
+              - name: "sdb"
+          - name: "stage-3"
+            devices:
+              - name: "sdb"
+
+    ingress:
+      dashboard:
+        ingressClassName: nginx
+        host:
+          name: rook.${SECRET_DOMAIN_NAME}
+          path: /
+
+    cephBlockPools:
+      - name: ceph-blockpool
+        spec:
+          failureDomain: host
+          replicated:
+            size: 3
+        storageClass:
+          enabled: true
+          name: ceph-block
+          isDefault: true
+          reclaimPolicy: Delete
+          allowVolumeExpansion: true
+          parameters:
+            imageFormat: "2"
+            imageFeatures: layering
+            csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
+            csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
+            csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
+            csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
+            csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
+            csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
+            csi.storage.k8s.io/fstype: ext4
+
+    cephBlockPoolsVolumeSnapshotClass:
+      enabled: true
+      name: csi-ceph-blockpool
+      isDefault: false
+      deletionPolicy: Delete
+
+    cephFileSystems:
+      - name: ceph-filesystem
+        spec:
+          metadataPool:
+            replicated:
+              size: 3
+          dataPools:
+            - failureDomain: host
+              replicated:
+                size: 3
+          metadataServer:
+            activeCount: 1
+            activeStandby: true
+        storageClass:
+          enabled: true
+          isDefault: false
+          name: ceph-filesystem
+          reclaimPolicy: Delete
+          allowVolumeExpansion: true
+          mountOptions: []
+          parameters:
+            csi.storage.k8s.io/provisioner-secret-name: rook-csi-cephfs-provisioner
+            csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
+            csi.storage.k8s.io/controller-expand-secret-name: rook-csi-cephfs-provisioner
+            csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
+            csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node
+            csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
+            csi.storage.k8s.io/fstype: ext4
+
+    cephFileSystemVolumeSnapshotClass:
+      enabled: true
+      name: csi-ceph-filesystem
+      isDefault: false
+      deletionPolicy: Delete
+
+    cephObjectStores: []
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/cluster/kustomization.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/cluster/kustomization.yaml
new file mode 100644
index 0000000000..51567a4234
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/cluster/kustomization.yaml
@@ -0,0 +1,6 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - helm-release.yaml
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/install.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/install.yaml
new file mode 100644
index 0000000000..eec2f5615a
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/install.yaml
@@ -0,0 +1,55 @@
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: rook-ceph-operator
+  namespace: flux-system
+  labels:
+    substitution.flux.home.arpa/enabled: "true"
+spec:
+  targetNamespace: rook-ceph
+  path: ./kubernetes/staging/apps/rook-ceph/rook-ceph/operator
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  prune: true
+  wait: true
+  interval: 10m
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: rook-ceph-cluster
+  namespace: flux-system
+  labels:
+    substitution.flux.home.arpa/enabled: "true"
+spec:
+  targetNamespace: rook-ceph
+  path: ./kubernetes/staging/apps/rook-ceph/rook-ceph/cluster
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  dependsOn:
+    - name: rook-ceph-operator
+  prune: true
+  wait: true
+  interval: 10m
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: rook-ceph-addons
+  namespace: flux-system
+  labels:
+    substitution.flux.home.arpa/enabled: "true"
+spec:
+  targetNamespace: rook-ceph
+  path: ./kubernetes/staging/apps/rook-ceph/rook-ceph/add-ons
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  dependsOn:
+    - name: rook-ceph-operator
+  prune: true
+  wait: true
+  interval: 10m
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/dashboard-secret.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/dashboard-secret.yaml
new file mode 100644
index 0000000000..98b5ea8190
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/dashboard-secret.yaml
@@ -0,0 +1,20 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/external-secrets.io/externalsecret_v1beta1.json
+apiVersion: external-secrets.io/v1beta1
+kind: ExternalSecret
+metadata:
+  name: rook-ceph-dashboard
+  namespace: rook-ceph
+spec:
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: onepassword-connect
+  target:
+    name: rook-ceph-dashboard-password # rook-ceph expects this name
+    template:
+      engineVersion: v2
+      data:
+        password: "{{ .password }}"
+  dataFrom:
+    - extract:
+        key: rook-dashboard
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/helm-release.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/helm-release.yaml
new file mode 100644
index 0000000000..87ec4c18aa
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/helm-release.yaml
@@ -0,0 +1,50 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.zinn.ca/helm.toolkit.fluxcd.io/helmrelease_v2.json
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: rook-ceph-operator
+spec:
+  interval: 15m
+  releaseName: rook-ceph
+  chart:
+    spec:
+      chart: rook-ceph
+      version: v1.16.1
+      sourceRef:
+        kind: HelmRepository
+        name: rook-ceph-charts
+        namespace: flux-system
+      interval: 15m
+  maxHistory: 3
+  install:
+    createNamespace: true
+    crds: CreateReplace
+    remediation:
+      retries: 3
+  upgrade:
+    crds: CreateReplace
+    remediation:
+      retries: 3
+  dependsOn:
+    - name: snapshot-controller
+      namespace: volsync-system
+  values:
+    crds:
+      enabled: true
+    pspEnable: false
+    csi:
+      enableVolumeGroupSnapshot: false # TODO: enable this when v1beta1 CRDs are available
+      csiAddons:
+        enabled: false
+      enableLiveness: true
+      serviceMonitor:
+        enabled: false
+    monitoring:
+      enabled: true
+    resources:
+      requests:
+        cpu: 109m
+        memory: 204Mi
+      limits:
+        memory: 453Mi
diff --git a/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/kustomization.yaml b/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/kustomization.yaml
new file mode 100644
index 0000000000..b67d007230
--- /dev/null
+++ b/kubernetes/staging/apps/rook-ceph/rook-ceph/operator/kustomization.yaml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - dashboard-secret.yaml
+  - helm-release.yaml
diff --git a/kubernetes/staging/apps/volsync-system/kustomization.yaml b/kubernetes/staging/apps/volsync-system/kustomization.yaml
new file mode 100644
index 0000000000..33d046e758
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/kustomization.yaml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - namespace.yaml
+  - snapshot-controller/install.yaml
+  - volsync/install.yaml
diff --git a/kubernetes/staging/apps/volsync-system/namespace.yaml b/kubernetes/staging/apps/volsync-system/namespace.yaml
new file mode 100644
index 0000000000..8c6bf6f2da
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/namespace.yaml
@@ -0,0 +1,7 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: volsync-system
+  annotations:
+    kustomize.toolkit.fluxcd.io/prune: disabled
diff --git a/kubernetes/staging/apps/volsync-system/snapshot-controller/app/helm-release.yaml b/kubernetes/staging/apps/volsync-system/snapshot-controller/app/helm-release.yaml
new file mode 100644
index 0000000000..a8471e4278
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/snapshot-controller/app/helm-release.yaml
@@ -0,0 +1,31 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.zinn.ca/helm.toolkit.fluxcd.io/helmrelease_v2.json
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: snapshot-controller
+spec:
+  interval: 15m
+  chart:
+    spec:
+      chart: snapshot-controller
+      version: 4.0.0
+      sourceRef:
+        kind: HelmRepository
+        name: piraeus-charts
+        namespace: flux-system
+      interval: 15m
+  maxHistory: 3
+  install:
+    createNamespace: true
+    crds: CreateReplace
+    remediation:
+      retries: 3
+  upgrade:
+    crds: CreateReplace
+    remediation:
+      retries: 3
+  values:
+    controller:
+      serviceMonitor:
+        create: true
diff --git a/kubernetes/staging/apps/volsync-system/snapshot-controller/app/kustomization.yaml b/kubernetes/staging/apps/volsync-system/snapshot-controller/app/kustomization.yaml
new file mode 100644
index 0000000000..51567a4234
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/snapshot-controller/app/kustomization.yaml
@@ -0,0 +1,6 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - helm-release.yaml
diff --git a/kubernetes/staging/apps/volsync-system/snapshot-controller/install.yaml b/kubernetes/staging/apps/volsync-system/snapshot-controller/install.yaml
new file mode 100644
index 0000000000..f7eb0d558a
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/snapshot-controller/install.yaml
@@ -0,0 +1,17 @@
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: kube-system-snapshot-controller
+  namespace: flux-system
+  labels:
+    substitution.flux.home.arpa/enabled: "true"
+spec:
+  targetNamespace: volsync-system
+  path: ./kubernetes/staging/apps/volsync-system/snapshot-controller/app
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  prune: true
+  wait: false
+  interval: 10m
diff --git a/kubernetes/staging/apps/volsync-system/volsync/app/helm-release.yaml b/kubernetes/staging/apps/volsync-system/volsync/app/helm-release.yaml
new file mode 100644
index 0000000000..a178f47776
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/volsync/app/helm-release.yaml
@@ -0,0 +1,31 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.zinn.ca/helm.toolkit.fluxcd.io/helmrelease_v2.json
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: volsync
+spec:
+  interval: 15m
+  chart:
+    spec:
+      chart: volsync
+      version: 0.11.0
+      sourceRef:
+        kind: HelmRepository
+        name: backube-charts
+        namespace: flux-system
+  maxHistory: 3
+  install:
+    createNamespace: true
+    remediation:
+      retries: 3
+  upgrade:
+    remediation:
+      retries: 3
+  dependsOn:
+    - name: snapshot-controller
+      namespace: volsync-system
+  values:
+    manageCRDs: true
+    metrics:
+      disableAuth: true
diff --git a/kubernetes/staging/apps/volsync-system/volsync/app/kustomization.yaml b/kubernetes/staging/apps/volsync-system/volsync/app/kustomization.yaml
new file mode 100644
index 0000000000..a5f227123f
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/volsync/app/kustomization.yaml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - helm-release.yaml
+  - prometheus-rule.yaml
diff --git a/kubernetes/staging/apps/volsync-system/volsync/app/prometheus-rule.yaml b/kubernetes/staging/apps/volsync-system/volsync/app/prometheus-rule.yaml
new file mode 100644
index 0000000000..caf73c1add
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/volsync/app/prometheus-rule.yaml
@@ -0,0 +1,27 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: volsync
+spec:
+  groups:
+    - name: volsync.rules
+      rules:
+        - alert: VolSyncComponentAbsent
+          annotations:
+            summary: VolSync component has disappeared from Prometheus target discovery.
+          expr: |
+            absent(up{job=~".*volsync.*"} == 1)
+          for: 15m
+          labels:
+            severity: critical
+        - alert: VolSyncVolumeOutOfSync
+          annotations:
+            summary: >-
+              {{ $labels.obj_namespace }}/{{ $labels.obj_name }} volume
+              is out of sync.
+          expr: |
+            volsync_volume_out_of_sync == 1
+          for: 15m
+          labels:
+            severity: critical
diff --git a/kubernetes/staging/apps/volsync-system/volsync/install.yaml b/kubernetes/staging/apps/volsync-system/volsync/install.yaml
new file mode 100644
index 0000000000..eef5e64364
--- /dev/null
+++ b/kubernetes/staging/apps/volsync-system/volsync/install.yaml
@@ -0,0 +1,19 @@
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: system-volsync
+  namespace: flux-system
+  labels:
+    substitution.flux.home.arpa/enabled: "true"
+spec:
+  targetNamespace: volsync-system
+  path: ./kubernetes/staging/apps/volsync-system/volsync/app
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  dependsOn:
+    - name: kube-system-snapshot-controller
+  prune: true
+  wait: true
+  interval: 10m
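With the Ceph storage classes, the snapshot-controller, and VolSync all reconciled, individual workloads can opt into snapshot-based backups. A hedged sketch of a per-application ReplicationSource that would use the `ceph-block` StorageClass and the `csi-ceph-blockpool` VolumeSnapshotClass defined above; the namespace, PVC name, schedule, and restic repository Secret are hypothetical placeholders, not part of this change:

```yaml
# Illustrative sketch: a VolSync ReplicationSource backing up a PVC that
# lives on the ceph-block StorageClass, via CSI snapshots.
apiVersion: volsync.backube/v1alpha1
kind: ReplicationSource
metadata:
  name: example-app # hypothetical
  namespace: default
spec:
  sourcePVC: example-app-data # hypothetical PVC name
  trigger:
    schedule: "0 * * * *" # hourly
  restic:
    repository: example-app-restic-secret # hypothetical Secret with restic config
    copyMethod: Snapshot
    storageClassName: ceph-block
    volumeSnapshotClassName: csi-ceph-blockpool
    pruneIntervalDays: 7
    retain:
      daily: 7
      weekly: 4
```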