diff --git a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml index a0de148657..12e29df3dc 100644 --- a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml +++ b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml @@ -43,3 +43,12 @@ spec: severity: warning annotations: message: "Kubelet health failure threshold reached" + - name: system-memory-exceeds-reservation + rules: + - alert: SystemMemoryExceedsReservation + expr: | + sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.9) + labels: + severity: warning + annotations: + message: "System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 90% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the node. The reservation may be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods."