Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# PrometheusRule for the OVN-Kubernetes control plane in the
# openshift-ovn-kubernetes namespace. It defines recording rules for OVN
# database cluster membership/connectivity and functional alerts for the
# cluster manager, the ovnkube controller and the NB/SB databases.
#
# This file is a Go template: the OvnkubeMasterReplicas value is substituted
# before the YAML is parsed, and the escaped brace sequences in the alert
# summaries render to literal Prometheus template delimiters.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  annotations:
    networkoperator.openshift.io/ignore-errors: ""
  name: master-rules
  namespace: openshift-ovn-kubernetes
spec:
  groups:
    - name: cluster-network-operator-master.rules
      rules:
        # Maximum of ovnkube_master_egress_routing_via_host across all series.
        - record: cluster:ovnkube_master_egress_routing_via_host:max
          expr: max(ovnkube_master_egress_routing_via_host)
        # Absolute difference between the number of servers reporting
        # "cluster member" status and the templated replica count, per database.
        - record: cluster:ovn_db_nbdb_not_cluster_member:abs
          expr: abs(count(ovn_db_cluster_server_status{db_name="OVN_Northbound", server_status="cluster member"}) - {{.OvnkubeMasterReplicas}})
        - record: cluster:ovn_db_sbdb_not_cluster_member:abs
          expr: abs(count(ovn_db_cluster_server_status{db_name="OVN_Southbound", server_status="cluster member"}) - {{.OvnkubeMasterReplicas}})
        # Absolute difference between the summed connection count and
        # N * (N - 1), where N is the templated replica count.
        - record: cluster:ovn_db_nbdb_missing_inbound_connections:abs
          expr: abs(sum(ovn_db_cluster_inbound_connections_total{db_name="OVN_Northbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))
        - record: cluster:ovn_db_sbdb_missing_inbound_connections:abs
          expr: abs(sum(ovn_db_cluster_inbound_connections_total{db_name="OVN_Southbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))
        - record: cluster:ovn_db_nbdb_missing_outbound_connections:abs
          expr: abs(sum(ovn_db_cluster_outbound_connections_total{db_name="OVN_Northbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))
        - record: cluster:ovn_db_sbdb_missing_outbound_connections:abs
          expr: abs(sum(ovn_db_cluster_outbound_connections_total{db_name="OVN_Southbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))

        # OVN kubernetes cluster manager functional alerts

        # Fires when the allocated/total v4 host-subnet ratio stays above 0.8
        # for 10 minutes.
        - alert: V4SubnetAllocationThresholdExceeded
          annotations:
            summary: More than 80% of v4 subnets available to assign to the nodes are allocated. Current v4 subnet allocation percentage is {{"{{"}} $value | humanizePercentage {{"}}"}}.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/V4SubnetAllocationThresholdExceeded.md
            description: More than 80% of IPv4 subnets are used. Insufficient IPv4 subnets could degrade provisioning of workloads.
          expr: ovnkube_clustermanager_allocated_v4_host_subnets / ovnkube_clustermanager_num_v4_host_subnets > 0.8
          for: 10m
          labels:
            severity: warning
        # Same check as above for v6 host subnets.
        # NOTE(review): unlike the v4 alert, no runbook_url is set here.
        - alert: V6SubnetAllocationThresholdExceeded
          annotations:
            summary: More than 80% of the v6 subnets available to assign to the nodes are allocated. Current v6 subnet allocation percentage is {{"{{"}} $value | humanizePercentage {{"}}"}}.
            description: More than 80% of IPv6 subnets are used. Insufficient IPv6 subnets could degrade provisioning of workloads.
          expr: ovnkube_clustermanager_allocated_v6_host_subnets / ovnkube_clustermanager_num_v6_host_subnets > 0.8
          for: 10m
          labels:
            severity: warning

        # OVN kubernetes master functional alerts

        # Fires when no "up" series with value 1 exists for the
        # ovnkube-controller container in this namespace for 5 minutes.
        - alert: NoRunningOvnController
          annotations:
            summary: There is no running ovn-kubernetes controller.
            # TODO: Update Runbook
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnMaster.md
            description: |
              Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
              implemented while there are no OVN Kubernetes pods.
          # NOTE(review): open question from the author - should this check for
          # ovn-controller or ovnkube-controller?
          expr: |
            absent(up{container="ovnkube-controller", namespace="openshift-ovn-kubernetes"} == 1)
          for: 5m
          labels:
            namespace: openshift-ovn-kubernetes
            severity: critical
        # Fires when, per namespace, the cluster manager leader gauge has been 0
        # over the whole 5m window for 5 minutes.
        - alert: NoOvnClusterManager
          annotations:
            summary: There is no ovn-kubernetes Cluster Manager leader.
            # TODO: Update runbook
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnMasterLeader.md
            description: |
              Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
              implemented while there is no OVN Kubernetes leader. Existing workloads should continue to have connectivity.
              OVN-Kubernetes control plane is not functional.
          # The queried metric was "ovnkube_clustermanager", which looks like a
          # truncated name; the leader gauge matches the alert summary.
          # NOTE(review): confirm the metric name against the deployed ovnkube.
          expr: |
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max by (namespace) (max_over_time(ovnkube_clustermanager_leader[5m])) == 0
          for: 5m
          labels:
            severity: critical

        # OVN northbound and southbound databases functional alerts

        # Fires when the newest NB end-to-end timestamp seen in the last 5m is
        # more than 120 seconds behind the current time, for 10 minutes.
        - alert: NorthboundStaleOnNode
          annotations:
            summary: ovn-kubernetes has not written anything to the northbound database on a node for too long.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NorthboundStaleAlert.md
            description: |
              Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
              implemented. Existing workloads should continue to have connectivity. OVN-Kubernetes control plane and/or
              OVN northbound database may not be functional.
          expr: |
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            time() - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) > 120
          for: 10m
          labels:
            # TODO: Determine severity here as this is not as big a deal anymore
            severity: critical
        # Fires when the SB end-to-end timestamp lags the NB one by more than
        # 120 seconds (both taken as 5m maxima), for 10 minutes.
        - alert: SouthboundStaleOnNode
          annotations:
            summary: ovn-northd has not successfully synced any changes to the southbound DB for too long.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/SouthboundStaleAlert.md
            description: |
              Networking control plane is degraded. Networking configuration updates may not be applied to the cluster or
              taking a long time to apply. This usually means there is a large load on OVN component 'northd' or it is not
              functioning.
          expr: |
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_controller_sb_e2e_timestamp[5m]) > 120
          for: 10m
          labels:
            severity: critical


Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app: ovnkube-master
app: ovnkube-controller
annotations:
networkoperator.openshift.io/ignore-errors: ""
name: monitor-ovn-master-metrics
Expand All @@ -23,26 +23,26 @@ spec:
- openshift-ovn-kubernetes
selector:
matchLabels:
app: ovnkube-master
app: ovnkube-controller
---
# Headless Service (clusterIP: None) that exposes the "metrics" port of pods
# labelled app=ovnkube-controller. The serving-cert annotation requests a
# certificate secret named ovn-master-metrics-cert.
# Each key appears exactly once; where the diff showed an old and a new line
# for the same key, the new value is kept.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: ovnkube-controller
  name: ovn-kubernetes-controller
  namespace: openshift-ovn-kubernetes
  annotations:
    service.beta.openshift.io/serving-cert-secret-name: ovn-master-metrics-cert
spec:
  selector:
    app: ovnkube-controller
  clusterIP: None
  # Publish endpoints even for pods that are not ready.
  publishNotReadyAddresses: true
  ports:
    - name: metrics
      port: 9102
      protocol: TCP
      # TODO it's now 9112
      targetPort: 9102
  sessionAffinity: None
  type: ClusterIP
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# The ovnkube control-plane components
#
# DaemonSet restricted (via nodeSelector) to master nodes. Each pod runs two
# containers on the host network:
#   - kube-rbac-proxy: TLS listener on :9106 proxying to 127.0.0.1:29104
#   - ovnkube-control-plane: ovnkube started with --init-cluster-manager,
#     serving metrics on 127.0.0.1:29104
# This file is a Go template; templated values are substituted before the
# YAML is parsed.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  name: ovnkube-master
  namespace: openshift-ovn-kubernetes
  annotations:
    kubernetes.io/description: |
      This daemonset launches the ovn-kubernetes controller (master) networking components.
    release.openshift.io/version: "{{.ReleaseVersion}}"
spec:
  selector:
    matchLabels:
      app: ovnkube-master
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      # Never start the replacement pod before the old one is terminated
      # (maxSurge: 0) - because ovsdb holds the lock.
      maxSurge: 0
      maxUnavailable: 3
  template:
    metadata:
      annotations:
        target.workload.openshift.io/management: '{"effect": "PreferredDuringScheduling"}'
      labels:
        app: ovnkube-master
        ovn-db-pod: "true"
        component: network
        type: infra
        openshift.io/component: network
        kubernetes.io/os: "linux"
    spec:
      serviceAccountName: ovn-kubernetes-controller
      hostNetwork: true
      dnsPolicy: Default
      priorityClassName: "system-cluster-critical"
      # volumes in all containers:
      # (container) -> (host)
      # /etc/openvswitch -> /var/lib/ovn/etc - ovsdb data
      # /var/lib/openvswitch -> /var/lib/ovn/data - ovsdb pki state
      # /run/openvswitch -> tmpfs - sockets
      # /env -> configmap env-overrides - debug overrides
      # NOTE(review): the openvswitch/ovn mounts listed above are currently
      # commented out in the ovnkube-control-plane container.
      containers:
        # Waits for the (optional) metrics certificate to be mounted, then
        # execs kube-rbac-proxy in front of the local ovnkube metrics endpoint.
        - name: kube-rbac-proxy
          image: "{{.KubeRBACProxyImage}}"
          command:
            - /bin/bash
            - -c
            - |
              #!/bin/bash
              set -euo pipefail
              TLS_PK=/etc/pki/tls/metrics-cert/tls.key
              TLS_CERT=/etc/pki/tls/metrics-cert/tls.crt
              # As the secret mount is optional we must wait for the files to be present.
              # The service is created in monitor.yaml and this is created in sdn.yaml.
              TS=$(date +%s)
              WARN_TS=$(( ${TS} + $(( 20 * 60)) ))
              HAS_LOGGED_INFO=0

              # Logs one INFO line on the first call, then a WARN line on every
              # call made after the 20 minute deadline has passed.
              log_missing_certs(){
                CUR_TS=$(date +%s)
                # WARN_TS is expanded explicitly, matching CUR_TS.
                if [[ "${CUR_TS}" -gt "${WARN_TS}" ]]; then
                  echo $(date -Iseconds) WARN: ovn-master-metrics-cert not mounted after 20 minutes.
                elif [[ "${HAS_LOGGED_INFO}" -eq 0 ]] ; then
                  echo $(date -Iseconds) INFO: ovn-master-metrics-cert not mounted. Waiting 20 minutes.
                  HAS_LOGGED_INFO=1
                fi
              }
              # Poll every 5 seconds until both the key and the cert exist.
              while [[ ! -f "${TLS_PK}" || ! -f "${TLS_CERT}" ]] ; do
                log_missing_certs
                sleep 5
              done

              echo $(date -Iseconds) INFO: ovn-master-metrics-certs mounted, starting kube-rbac-proxy
              exec /usr/bin/kube-rbac-proxy \
                --logtostderr \
                --secure-listen-address=:9106 \
                --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 \
                --upstream=http://127.0.0.1:29104/ \
                --tls-private-key-file=${TLS_PK} \
                --tls-cert-file=${TLS_CERT}
          ports:
            - containerPort: 9106
              name: https
          resources:
            requests:
              cpu: 10m
              memory: 20Mi
          terminationMessagePolicy: FallbackToLogsOnError
          volumeMounts:
            - name: ovn-master-metrics-cert
              mountPath: /etc/pki/tls/metrics-cert
              readOnly: true

        # ovnkube-control-plane: runs ovnkube with --init-cluster-manager for
        # the node this pod is scheduled on (K8S_NODE).
        - name: ovnkube-control-plane
          image: "{{.OvnImage}}"
          command:
            - /bin/bash
            - -c
            - |
              set -xe
              # Optional debug overrides: export everything defined in /env/_master.
              if [[ -f "/env/_master" ]]; then
                set -o allexport
                source "/env/_master"
                set +o allexport
              fi

              if [ "{{.OVN_GATEWAY_MODE}}" == "shared" ]; then
                gateway_mode_flags="--gateway-mode shared --gateway-interface br-ex"
              elif [ "{{.OVN_GATEWAY_MODE}}" == "local" ]; then
                gateway_mode_flags="--gateway-mode local --gateway-interface br-ex"
              else
                echo "Invalid OVN_GATEWAY_MODE: \"{{.OVN_GATEWAY_MODE}}\". Must be \"local\" or \"shared\"."
                exit 1
              fi

              multi_network_enabled_flag=
              if [[ "{{.OVN_MULTI_NETWORK_ENABLE}}" == "true" ]]; then
                multi_network_enabled_flag="--enable-multi-network"
              fi

              # The log line names the flag that is actually passed below.
              echo "I$(date "+%m%d %H:%M:%S.%N") - ovnkube-control-plane - start ovnkube --init-cluster-manager ${K8S_NODE}"
              # gateway_mode_flags and multi_network_enabled_flag are left
              # unquoted on purpose so they split into separate arguments
              # (or vanish when empty).
              exec /usr/bin/ovnkube \
                --init-cluster-manager "${K8S_NODE}" \
                --config-file=/run/ovnkube-config/ovnkube.conf \
                --ovn-empty-lb-events \
                --loglevel "${OVN_KUBE_LOG_LEVEL}" \
                --metrics-bind-address "127.0.0.1:29104" \
                --metrics-enable-pprof \
                --metrics-enable-config-duration \
                ${gateway_mode_flags} \
                --enable-multicast \
                --disable-snat-multiple-gws \
                ${multi_network_enabled_flag} \
                --enable-interconnect \
                --acl-logging-rate-limit "{{.OVNPolicyAuditRateLimit}}"
          volumeMounts:
            # for checking ovs-configuration service
            # - mountPath: /etc/systemd/system
            #   name: systemd-units
            #   readOnly: true
            # - mountPath: /etc/openvswitch/
            #   name: etc-openvswitch
            # - mountPath: /etc/ovn/
            #   name: etc-openvswitch
            # - mountPath: /var/lib/openvswitch/
            #   name: var-lib-openvswitch
            # - mountPath: /run/openvswitch/
            #   name: run-openvswitch
            # - mountPath: /run/ovn/
            #   name: run-ovn
            - mountPath: /run/ovnkube-config/
              name: ovnkube-config
            - mountPath: /env
              name: env-overrides
            - mountPath: /ovn-cert
              name: ovn-cert
            - mountPath: /ovn-ca
              name: ovn-ca
          resources:
            requests:
              cpu: 10m
              memory: 300Mi
          env:
            - name: OVN_KUBE_LOG_LEVEL
              value: "4"
            # Name of the node this pod runs on, passed to ovnkube above.
            - name: K8S_NODE
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          ports:
            - name: metrics-port
              containerPort: 29104
          terminationMessagePolicy: FallbackToLogsOnError
      nodeSelector:
        node-role.kubernetes.io/master: ""
        # NOTE(review): beta.kubernetes.io/os is the legacy form of the
        # kubernetes.io/os node label - confirm before switching.
        beta.kubernetes.io/os: "linux"
      volumes:
        # for checking ovs-configuration service
        # NOTE(review): systemd-units, etc-openvswitch, var-lib-openvswitch,
        # run-openvswitch and run-ovn are declared here but no container in
        # this pod mounts them.
        - name: systemd-units
          hostPath:
            path: /etc/systemd/system
        - name: etc-openvswitch
          hostPath:
            path: /var/lib/ovn/etc
        - name: var-lib-openvswitch
          hostPath:
            path: /var/lib/ovn/data
        - name: run-openvswitch
          hostPath:
            path: /var/run/openvswitch
        - name: run-ovn
          hostPath:
            path: /var/run/ovn
        - name: ovnkube-config
          configMap:
            name: ovnkube-config
        - name: env-overrides
          configMap:
            name: env-overrides
            optional: true
        - name: ovn-ca
          configMap:
            name: ovn-ca
        - name: ovn-cert
          secret:
            secretName: ovn-cert
        # Optional, so the pod can start before the secret exists; the
        # kube-rbac-proxy container waits for the files to appear.
        - name: ovn-master-metrics-cert
          secret:
            secretName: ovn-master-metrics-cert
            optional: true
      tolerations:
        - key: "node-role.kubernetes.io/master"
          operator: "Exists"
        - key: "node.kubernetes.io/not-ready"
          operator: "Exists"
        - key: "node.kubernetes.io/unreachable"
          operator: "Exists"
        - key: "node.kubernetes.io/network-unavailable"
          operator: "Exists"
Loading