openshift · bpickard22 · Jan 17, 2022 · Jun 22, 2023 · Jun 26, 2023 · Jun 26, 2023
diff --git a/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml b/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
diff --git a/...vn-kubernetes/self-hosted/004-config.yaml → ...rnetes/self-hosted/common/004-config.yaml b/...vn-kubernetes/self-hosted/004-config.yaml → ...rnetes/self-hosted/common/004-config.yaml
diff --git a/...ubernetes/self-hosted/007-flowschema.yaml → ...es/self-hosted/common/007-flowschema.yaml b/...ubernetes/self-hosted/007-flowschema.yaml → ...es/self-hosted/common/007-flowschema.yaml
diff --git a/bindata/network/ovn-kubernetes/self-hosted/common/alert-rules-control-plane.yaml b/bindata/network/ovn-kubernetes/self-hosted/common/alert-rules-control-plane.yaml
@@ -0,0 +1,108 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: master-rules
+  namespace: openshift-ovn-kubernetes
+  labels:
+    prometheus: k8s
+    role: alert-rules
+  annotations:
+    networkoperator.openshift.io/ignore-errors: ""
+spec:
+  groups:
+  - name: cluster-network-operator-master.rules
+    rules:
+    # Cluster-wide maximum of the ovnkube_master_egress_routing_via_host metric.
+    - record: cluster:ovnkube_master_egress_routing_via_host:max
+      expr: 'max(ovnkube_master_egress_routing_via_host)'
+    # Absolute difference between the number of Northbound DB servers reporting the
+    # "cluster member" status and the templated OvnkubeMasterReplicas value.
+    - record: cluster:ovn_db_nbdb_not_cluster_member:abs
+      expr: 'abs(count(ovn_db_cluster_server_status{db_name="OVN_Northbound", server_status="cluster member"}) - {{.OvnkubeMasterReplicas}})'
+    # Same comparison as above, for the Southbound DB.
+    - record: cluster:ovn_db_sbdb_not_cluster_member:abs
+      expr: 'abs(count(ovn_db_cluster_server_status{db_name="OVN_Southbound", server_status="cluster member"}) - {{.OvnkubeMasterReplicas}})'
+    # Absolute difference between the summed Northbound inbound connection count and
+    # the expected total, computed as replicas * (replicas - 1).
+    - record: cluster:ovn_db_nbdb_missing_inbound_connections:abs
+      expr: 'abs(sum(ovn_db_cluster_inbound_connections_total{db_name="OVN_Northbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))'
+    # Same comparison for Southbound inbound connections.
+    - record: cluster:ovn_db_sbdb_missing_inbound_connections:abs
+      expr: 'abs(sum(ovn_db_cluster_inbound_connections_total{db_name="OVN_Southbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))'
+    # Same comparison for Northbound outbound connections.
+    - record: cluster:ovn_db_nbdb_missing_outbound_connections:abs
+      expr: 'abs(sum(ovn_db_cluster_outbound_connections_total{db_name="OVN_Northbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))'
+    # Same comparison for Southbound outbound connections.
+    - record: cluster:ovn_db_sbdb_missing_outbound_connections:abs
+      expr: 'abs(sum(ovn_db_cluster_outbound_connections_total{db_name="OVN_Southbound"}) - ({{.OvnkubeMasterReplicas}} * ({{.OvnkubeMasterReplicas}}-1)))'
+    # OVN kubernetes cluster manager functional alerts
+    # Warns when the ratio of allocated to total v4 host subnets stays above 0.8 for 10 minutes.
+    - alert: V4SubnetAllocationThresholdExceeded
+      expr: ovnkube_clustermanager_allocated_v4_host_subnets / ovnkube_clustermanager_num_v4_host_subnets > 0.8
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: More than 80% of v4 subnets available to assign to the nodes are allocated. Current v4 subnet allocation percentage is {{"{{"}} $value | humanizePercentage {{"}}"}}.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/V4SubnetAllocationThresholdExceeded.md
+        description: More than 80% of IPv4 subnets are used. Insufficient IPv4 subnets could degrade provisioning of workloads.
+    # Warns when the ratio of allocated to total v6 host subnets stays above 0.8 for 10 minutes.
+    # No runbook_url is set for this alert.
+    - alert: V6SubnetAllocationThresholdExceeded
+      expr: ovnkube_clustermanager_allocated_v6_host_subnets / ovnkube_clustermanager_num_v6_host_subnets > 0.8
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: More than 80% of the v6 subnets available to assign to the nodes are allocated. Current v6 subnet allocation percentage is {{"{{"}} $value | humanizePercentage {{"}}"}}.
+        description: More than 80% of IPv6 subnets are used. Insufficient IPv6 subnets could degrade provisioning of workloads.
+    # OVN kubernetes master functional alerts
+    # Fires when, for 5 minutes, no "up" series for the ovnkube-controller container in the
+    # openshift-ovn-kubernetes namespace has the value 1.
+    - alert: NoRunningOvnController
+      annotations:
+        summary: There is no running ovn-kubernetes controller.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnMaster.md  # TODO: Update runbook
+        description: |
+          Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
+          implemented while there are no OVN Kubernetes pods.
+      # TODO(review): confirm whether this should check ovn-controller or ovnkube-controller.
+      expr: |
+        absent(up{container="ovnkube-controller", namespace="openshift-ovn-kubernetes"} == 1)
+      for: 5m
+      labels:
+        # The namespace label is set explicitly here.
+        # NOTE(review): presumably because absent() over a comparison yields no labels - confirm.
+        namespace: openshift-ovn-kubernetes
+        severity: critical
+    # Fires when, per namespace, the highest ovnkube_clustermanager_leader value observed over
+    # the last 5 minutes is 0, and that condition holds for 5 minutes.
+    # NOTE(review): the expression previously queried "ovnkube_clustermanager", which is only the
+    # prefix shared by the cluster manager metrics used elsewhere in this file; the leader gauge
+    # name is assumed to be ovnkube_clustermanager_leader - confirm against the exported metrics.
+    - alert: NoOvnClusterManager
+      annotations:
+        summary: There is no ovn-kubernetes Cluster Manager leader.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnMasterLeader.md  # TODO: Update runbook
+        description: |
+          Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
+          implemented while there is no OVN Kubernetes leader. Existing workloads should continue to have connectivity.
+          OVN-Kubernetes control plane is not functional.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max by (namespace) (max_over_time(ovnkube_clustermanager_leader[5m])) == 0
+      for: 5m
+      labels:
+        severity: critical
+    # OVN northbound and southbound databases functional alerts
+    # Fires when, for 10 minutes, the largest ovnkube_controller_nb_e2e_timestamp value seen in
+    # the last 5 minutes is more than 120 seconds older than the current time.
+    - alert: NorthboundStaleOnNode
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        time() - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) > 120
+      for: 10m
+      labels:
+        # TODO: determine the right severity here, as this is not as big a deal anymore.
+        severity: critical
+      annotations:
+        summary: ovn-kubernetes has not written anything to the northbound database on a node for too long.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NorthboundStaleAlert.md
+        description: |
+          Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
+          implemented. Existing workloads should continue to have connectivity. OVN-Kubernetes control plane and/or
+          OVN northbound database may not be functional.
+    # Fires when, for 10 minutes, the largest northbound e2e timestamp of the last 5 minutes is
+    # more than 120 seconds ahead of the largest southbound e2e timestamp of the same window.
+    - alert: SouthboundStaleOnNode
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_controller_sb_e2e_timestamp[5m]) > 120
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: ovn-northd has not successfully synced any changes to the southbound DB for too long.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/SouthboundStaleAlert.md
+        description: |
+          Networking control plane is degraded. Networking configuration updates may not be applied to the cluster or
+          taking a long time to apply. This usually means there is a large load on OVN component 'northd' or it is not
+          functioning.
+
+
diff --git a/...ubernetes/self-hosted/monitor-master.yaml → ...es/self-hosted/common/monitor-master.yaml b/...ubernetes/self-hosted/monitor-master.yaml → ...es/self-hosted/common/monitor-master.yaml
@@ -3,7 +3,7 @@ apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
   labels:
-    app: ovnkube-master
+    app: ovnkube-controller
   annotations:
     networkoperator.openshift.io/ignore-errors: ""
   name: monitor-ovn-master-metrics
@@ -23,26 +23,26 @@ spec:
     - openshift-ovn-kubernetes
   selector:
     matchLabels:
-      app: ovnkube-master
+      app: ovnkube-controller
 ---
 apiVersion: v1
 kind: Service
+# Service in openshift-ovn-kubernetes that selects pods labelled app: ovnkube-controller
+# and exposes a single TCP port named "metrics".
 metadata:
   labels:
-    app: ovnkube-master
-  name: ovn-kubernetes-master
+    app: ovnkube-controller
+  name: ovn-kubernetes-controller
   namespace: openshift-ovn-kubernetes
   annotations:
+    # Names the secret (ovn-master-metrics-cert) associated with this Service's serving certificate.
     service.beta.openshift.io/serving-cert-secret-name: ovn-master-metrics-cert
 spec:
   selector:
-    app: ovnkube-master
+    app: ovnkube-controller
+  # clusterIP: None makes this a headless Service.
   clusterIP: None
+  # Endpoint addresses are published even for pods that are not ready.
   publishNotReadyAddresses: true
   ports:
   - name: metrics
     port: 9102
     protocol: TCP
-    targetPort: 9102
+    targetPort: 9102  # TODO: confirm the target port; per the original note it is now 9112
   sessionAffinity: None
   type: ClusterIP
diff --git a/...ata/network/ovn-kubernetes/self-hosted/multi-zone-interconnect/ovnkube-control-plane.yaml b/...ata/network/ovn-kubernetes/self-hosted/multi-zone-interconnect/ovnkube-control-plane.yaml
@@ -0,0 +1,222 @@
+# The ovnkube control-plane components
+# DaemonSet "ovnkube-master" in openshift-ovn-kubernetes. Its pod template runs two containers:
+# kube-rbac-proxy and ovnkube-control-plane.
+kind: DaemonSet
+apiVersion: apps/v1
+metadata:
+  name: ovnkube-master
+  namespace: openshift-ovn-kubernetes
+  annotations:
+    kubernetes.io/description: |
+      This daemonset launches the ovn-kubernetes controller (master) networking components.
+    release.openshift.io/version: "{{.ReleaseVersion}}"
+spec:
+  selector:
+    matchLabels:
+      app: ovnkube-master
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate:
+      # maxSurge: 0 means a replacement pod is never started before the old pod is terminated.
+      # Original rationale: we don't want the new pod up first, because ovsdb holds the lock.
+      # NOTE(review): the original comment spoke of Deployments, but this object is a
+      # DaemonSet - confirm the rationale still applies here.
+      maxSurge: 0
+      # Up to three pods may be unavailable at the same time during a rollout.
+      maxUnavailable: 3
+  template:
+    metadata:
+      annotations:
+        target.workload.openshift.io/management: '{"effect": "PreferredDuringScheduling"}'
+      labels:
+        app: ovnkube-master
+        ovn-db-pod: "true"
+        component: network
+        type: infra
+        openshift.io/component: network
+        kubernetes.io/os: "linux"
+    spec:
+      serviceAccountName: ovn-kubernetes-controller
+      # Pods share the node's network namespace.
+      hostNetwork: true
+      dnsPolicy: Default
+      priorityClassName: "system-cluster-critical"
+      # volumes in all containers:
+      # (container) -> (host)
+      # /etc/openvswitch -> /var/lib/ovn/etc - ovsdb data
+      # /var/lib/openvswitch -> /var/lib/ovn/data - ovsdb pki state
+      # /run/openvswitch -> tmpfs - sockets
+      # /env -> configmap env-overrides - debug overrides
+      # NOTE(review): in this manifest the openvswitch mounts listed above are commented out in
+      # the ovnkube-control-plane container; only /env is actually mounted. This list looks
+      # stale - confirm and trim.
+      containers:
+      # kube-rbac-proxy: waits until the TLS key and certificate files exist under
+      # /etc/pki/tls/metrics-cert, then serves on :9106 and proxies to http://127.0.0.1:29104/.
+      - name: kube-rbac-proxy
+        image: "{{.KubeRBACProxyImage}}"
+        command:
+        - /bin/bash
+        - -c
+        - |
+          #!/bin/bash
+          set -euo pipefail
+          TLS_PK=/etc/pki/tls/metrics-cert/tls.key
+          TLS_CERT=/etc/pki/tls/metrics-cert/tls.crt
+          # As the secret mount is optional we must wait for the files to be present.
+          # The service is created in monitor.yaml and this is created in sdn.yaml.
+          TS=$(date +%s)
+          WARN_TS=$(( ${TS} + $(( 20 * 60)) ))
+          HAS_LOGGED_INFO=0
+
+          # Logs one INFO line on the first call, then a WARN line on every call made
+          # more than 20 minutes after startup.
+          log_missing_certs(){
+              CUR_TS=$(date +%s)
+              if [[ "${CUR_TS}" -gt "${WARN_TS}"  ]]; then
+                echo $(date -Iseconds) WARN: ovn-master-metrics-cert not mounted after 20 minutes.
+              elif [[ "${HAS_LOGGED_INFO}" -eq 0 ]] ; then
+                echo $(date -Iseconds) INFO: ovn-master-metrics-cert not mounted. Waiting 20 minutes.
+                HAS_LOGGED_INFO=1
+              fi
+          }
+          # Poll every 5 seconds until both the key and the certificate are present.
+          while [[ ! -f "${TLS_PK}" ||  ! -f "${TLS_CERT}" ]] ; do
+            log_missing_certs
+            sleep 5
+          done
+
+          echo $(date -Iseconds) INFO: ovn-master-metrics-certs mounted, starting kube-rbac-proxy
+          exec /usr/bin/kube-rbac-proxy \
+            --logtostderr \
+            --secure-listen-address=:9106 \
+            --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 \
+            --upstream=http://127.0.0.1:29104/ \
+            --tls-private-key-file=${TLS_PK} \
+            --tls-cert-file=${TLS_CERT}
+        ports:
+        - containerPort: 9106
+          name: https
+        resources:
+          requests:
+            cpu: 10m
+            memory: 20Mi
+        terminationMessagePolicy: FallbackToLogsOnError
+        volumeMounts:
+        # Secret volume declared as optional in the pod's volumes; hence the wait loop above.
+        - name: ovn-master-metrics-cert
+          mountPath: /etc/pki/tls/metrics-cert
+          readOnly: true
+
+      # ovnkube-control-plane: runs /usr/bin/ovnkube with --init-cluster-manager for the node this
+      # pod is scheduled on (K8S_NODE), with metrics bound to 127.0.0.1:29104.
+      - name: ovnkube-control-plane
+        image: "{{.OvnImage}}"
+        command:
+        - /bin/bash
+        - -c
+        - |
+          set -xe
+          # Optional overrides: export every variable defined in /env/_master if the file exists.
+          if [[ -f "/env/_master" ]]; then
+            set -o allexport
+            source "/env/_master"
+            set +o allexport
+          fi
+
+          # Only "shared" and "local" gateway modes are accepted; anything else aborts startup.
+          if [ "{{.OVN_GATEWAY_MODE}}" == "shared" ]; then
+            gateway_mode_flags="--gateway-mode shared --gateway-interface br-ex"
+          elif [ "{{.OVN_GATEWAY_MODE}}" == "local" ]; then
+            gateway_mode_flags="--gateway-mode local --gateway-interface br-ex"
+          else
+            echo "Invalid OVN_GATEWAY_MODE: \"{{.OVN_GATEWAY_MODE}}\". Must be \"local\" or \"shared\"."
+            exit 1
+          fi
+
+          multi_network_enabled_flag=
+          if [[ "{{.OVN_MULTI_NETWORK_ENABLE}}" == "true" ]]; then
+            multi_network_enabled_flag="--enable-multi-network"
+          fi
+
+          # The log line names the flag that is actually passed below.
+          echo "I$(date "+%m%d %H:%M:%S.%N") - ovnkube-control-plane - start ovnkube --init-cluster-manager ${K8S_NODE}"
+          exec /usr/bin/ovnkube \
+            --init-cluster-manager "${K8S_NODE}" \
+            --config-file=/run/ovnkube-config/ovnkube.conf \
+            --ovn-empty-lb-events \
+            --loglevel "${OVN_KUBE_LOG_LEVEL}" \
+            --metrics-bind-address "127.0.0.1:29104" \
+            --metrics-enable-pprof \
+            --metrics-enable-config-duration \
+            ${gateway_mode_flags} \
+            --enable-multicast \
+            --disable-snat-multiple-gws \
+            ${multi_network_enabled_flag} \
+            --enable-interconnect \
+            --acl-logging-rate-limit "{{.OVNPolicyAuditRateLimit}}"
+        volumeMounts:
+        # The mounts below are intentionally left commented out; they are not active.
+        # for checking ovs-configuration service
+        # - mountPath: /etc/systemd/system
+        #   name: systemd-units
+        #   readOnly: true
+        # - mountPath: /etc/openvswitch/
+        #   name: etc-openvswitch
+        # - mountPath: /etc/ovn/
+        #   name: etc-openvswitch
+        # - mountPath: /var/lib/openvswitch/
+        #   name: var-lib-openvswitch
+        # - mountPath: /run/openvswitch/
+        #   name: run-openvswitch
+        # - mountPath: /run/ovn/
+        #   name: run-ovn
+        - mountPath: /run/ovnkube-config/
+          name: ovnkube-config
+        - mountPath: /env
+          name: env-overrides
+        - mountPath: /ovn-cert
+          name: ovn-cert
+        - mountPath: /ovn-ca
+          name: ovn-ca
+        resources:
+          requests:
+            cpu: 10m
+            memory: 300Mi
+        env:
+        # Default log level; passed to ovnkube via --loglevel.
+        - name: OVN_KUBE_LOG_LEVEL
+          value: "4"
+        # Name of the node the pod runs on, taken from the pod spec.
+        - name: K8S_NODE
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        ports:
+        - name: metrics-port
+          containerPort: 29104
+        terminationMessagePolicy: FallbackToLogsOnError
+      # Schedule only onto Linux nodes carrying the master role label.
+      nodeSelector:
+        node-role.kubernetes.io/master: ""
+        # kubernetes.io/os replaces the deprecated beta.kubernetes.io/os node label.
+        kubernetes.io/os: "linux"
+      volumes:
+      # NOTE(review): systemd-units, etc-openvswitch, var-lib-openvswitch, run-openvswitch and
+      # run-ovn have no active volumeMounts in this manifest (the matching mounts are commented
+      # out in the ovnkube-control-plane container) - consider removing them.
+      # for checking ovs-configuration service
+      - name: systemd-units
+        hostPath:
+          path: /etc/systemd/system
+      - name: etc-openvswitch
+        hostPath:
+          path: /var/lib/ovn/etc
+      - name: var-lib-openvswitch
+        hostPath:
+          path: /var/lib/ovn/data
+      - name: run-openvswitch
+        hostPath:
+          path: /var/run/openvswitch
+      - name: run-ovn
+        hostPath:
+          path: /var/run/ovn
+      - name: ovnkube-config
+        configMap:
+          name: ovnkube-config
+      # Optional: the pod starts even when the env-overrides ConfigMap does not exist.
+      - name: env-overrides
+        configMap:
+          name: env-overrides
+          optional: true
+      - name: ovn-ca
+        configMap:
+          name: ovn-ca
+      - name: ovn-cert
+        secret:
+          secretName: ovn-cert
+      # Optional: the pod starts even when the metrics certificate secret does not exist yet.
+      - name: ovn-master-metrics-cert
+        secret:
+          secretName: ovn-master-metrics-cert
+          optional: true
+      # Tolerate the master taint and the not-ready / unreachable / network-unavailable node taints.
+      tolerations:
+      - key: "node-role.kubernetes.io/master"
+        operator: "Exists"
+      - key: "node.kubernetes.io/not-ready"
+        operator: "Exists"
+      - key: "node.kubernetes.io/unreachable"
+        operator: "Exists"
+      - key: "node.kubernetes.io/network-unavailable"
+        operator: "Exists"