diff --git a/.gitignore b/.gitignore
index 1fd1730a08..4373669dcf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -106,5 +106,7 @@ Session.vim
 !.vscode/extensions.json
 .history
 
+### GoLand files ###
+.idea
 
 # End of https://www.gitignore.io/api/go,vim,emacs,visualstudiocode
diff --git a/bindata/network/ovn-kubernetes/ovnkube-master.yaml b/bindata/network/ovn-kubernetes/ovnkube-master.yaml
index d27fb54707..df49fbcd70 100644
--- a/bindata/network/ovn-kubernetes/ovnkube-master.yaml
+++ b/bindata/network/ovn-kubernetes/ovnkube-master.yaml
@@ -151,9 +151,12 @@ spec:
               - /bin/bash
               - -c
               - |
+                set -x
                 MASTER_IP="{{.OVN_MASTER_IP}}"
                 if [[ "${K8S_NODE_IP}" == "${MASTER_IP}" ]]; then
                   echo "$(date -Iseconds) - nbdb - postStart - waiting for master to be selected"
+
+                  # set the connection and disable inactivity probe
                   retries=0
                   while ! ovn-nbctl --no-leader-only -t 5 set-connection pssl:{{.OVN_NB_PORT}}{{.LISTEN_DUAL_STACK}} -- set connection . inactivity_probe=60000; do
                     (( retries += 1 ))
@@ -164,6 +167,61 @@ spec:
                     sleep 2
                   done
                 fi
+
+                election_timer="${OVN_NB_RAFT_ELECTION_TIMER}"
+                echo "Setting nb-db raft election timer to ${election_timer} ms"
+                # Poll cluster/status until it reports an election timer; give up after 10 retries.
+                retries=0
+                while true; do
+                  current_election_timer=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \
+                    | grep -oP '(?<=Election timer:\s)[[:digit:]]+')
+                  if [[ -n "${current_election_timer}" ]]; then
+                    break
+                  fi
+                  (( retries += 1 ))
+                  if [[ "${retries}" -gt 10 ]]; then
+                    echo "Failed to get current nb-db raft election timer value after multiple attempts. Exiting..."
+                    exit 1
+                  fi
+                  sleep 2
+                done
+
+                if [[ ${election_timer} -ne ${current_election_timer} ]]; then
+                  # Wait a bounded time for an in-progress election to settle before checking the role.
+                  retries=0
+                  while ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \
+                    | grep -q "Role: candidate"; do
+                    (( retries += 1 ))
+                    if [[ "${retries}" -gt 10 ]]; then
+                      echo "Cluster node (nb-db raft) is in candidate role for prolonged time. Continuing..."
+                      break
+                    fi
+                    sleep 2
+                  done
+
+                  is_leader=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \
+                    | grep "Role: leader")
+                  if [[ ! -z "${is_leader}" ]]; then
+                    # Only the leader changes the timer; each call requests at most double the current value.
+                    while [[ ${current_election_timer} != ${election_timer} ]]; do
+                      max_election_timer=$((${current_election_timer} * 2))
+                      if [[ ${election_timer} -le ${max_election_timer} ]]; then
+                        if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${election_timer}; then
+                          echo "Failed to set nb-db raft election timer ${election_timer}. Exiting..."
+                          exit 2
+                        fi
+                        current_election_timer=${election_timer}
+                      else
+                        if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${max_election_timer}; then
+                          echo "Failed to set nb-db raft election timer ${max_election_timer}. Exiting..."
+                          exit 2
+                        fi
+                        current_election_timer=${max_election_timer}
+                      fi
+                    done
+                  fi
+                fi
+
         readinessProbe:
           initialDelaySeconds: 30
           timeoutSeconds: 5
@@ -177,6 +235,8 @@ spec:
         env:
         - name: OVN_LOG_LEVEL
           value: info
+        - name: OVN_NB_RAFT_ELECTION_TIMER
+          value: "{{.OVN_NB_RAFT_ELECTION_TIMER}}"
         - name: K8S_NODE_IP
           valueFrom:
             fieldRef:
@@ -216,7 +276,7 @@ spec:
         - /bin/bash
         - -c
         - |
-          set -xe
+          set -x
           if [[ -f /env/_master ]]; then
             set -o allexport
             source /env/_master
@@ -261,9 +321,12 @@ spec:
               - /bin/bash
               - -c
               - |
+                set -x
                 MASTER_IP="{{.OVN_MASTER_IP}}"
                 if [[ "${K8S_NODE_IP}" == "${MASTER_IP}" ]]; then
                   echo "$(date -Iseconds) - sdb - postStart - waiting for master to be selected"
+
+                  # set the connection and disable inactivity probe
                   retries=0
                   while ! ovn-sbctl --no-leader-only -t 5 set-connection pssl:{{.OVN_SB_PORT}}{{.LISTEN_DUAL_STACK}} -- set connection . inactivity_probe=60000; do
                     (( retries += 1 ))
@@ -274,6 +337,61 @@ spec:
                     sleep 2
                   done
                 fi
+
+                election_timer="${OVN_SB_RAFT_ELECTION_TIMER}"
+                echo "Setting sb-db raft election timer to ${election_timer} ms"
+                # Poll cluster/status until it reports an election timer; give up after 10 retries.
+                retries=0
+                while true; do
+                  current_election_timer=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \
+                    | grep -oP '(?<=Election timer:\s)[[:digit:]]+')
+                  if [[ -n "${current_election_timer}" ]]; then
+                    break
+                  fi
+                  (( retries += 1 ))
+                  if [[ "${retries}" -gt 10 ]]; then
+                    echo "Failed to get current sb-db raft election timer value after multiple attempts. Exiting..."
+                    exit 1
+                  fi
+                  sleep 2
+                done
+
+                if [[ ${election_timer} -ne ${current_election_timer} ]]; then
+                  # Wait a bounded time for an in-progress election to settle before checking the role.
+                  retries=0
+                  while ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \
+                    | grep -q "Role: candidate"; do
+                    (( retries += 1 ))
+                    if [[ "${retries}" -gt 10 ]]; then
+                      echo "Cluster node (sb-db raft) is in candidate role for prolonged time. Continuing..."
+                      break
+                    fi
+                    sleep 2
+                  done
+
+                  is_leader=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \
+                    | grep "Role: leader")
+                  if [[ ! -z "${is_leader}" ]]; then
+                    # Only the leader changes the timer; each call requests at most double the current value.
+                    while [[ ${current_election_timer} != ${election_timer} ]]; do
+                      max_election_timer=$((${current_election_timer} * 2))
+                      if [[ ${election_timer} -le ${max_election_timer} ]]; then
+                        if ! ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${election_timer}; then
+                          echo "Failed to set sb-db raft election timer ${election_timer}. Exiting..."
+                          exit 2
+                        fi
+                        current_election_timer=${election_timer}
+                      else
+                        if ! ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${max_election_timer}; then
+                          echo "Failed to set sb-db raft election timer ${max_election_timer}. Exiting..."
+                          exit 2
+                        fi
+                        current_election_timer=${max_election_timer}
+                      fi
+                    done
+                  fi
+                fi
+
         readinessProbe:
           initialDelaySeconds: 30
           timeoutSeconds: 5
@@ -287,6 +405,8 @@ spec:
         env:
         - name: OVN_LOG_LEVEL
           value: info
+        - name: OVN_SB_RAFT_ELECTION_TIMER
+          value: "{{.OVN_SB_RAFT_ELECTION_TIMER}}"
         - name: K8S_NODE_IP
           valueFrom:
             fieldRef:
diff --git a/bindata/network/ovn-kubernetes/ovnkube-node.yaml b/bindata/network/ovn-kubernetes/ovnkube-node.yaml
index 61ea850280..9528c6a866 100644
--- a/bindata/network/ovn-kubernetes/ovnkube-node.yaml
+++ b/bindata/network/ovn-kubernetes/ovnkube-node.yaml
@@ -134,6 +134,7 @@ spec:
             --sb-client-cacert /ovn-ca/ca-bundle.crt \
             --config-file=/run/ovnkube-config/ovnkube.conf \
             --loglevel "${OVN_KUBE_LOG_LEVEL}" \
+            --inactivity-probe="${OVN_CONTROLLER_INACTIVITY_PROBE}" \
             ${hybrid_overlay_flags} \
             --metrics-bind-address "0.0.0.0:9103"
         env:
@@ -142,6 +143,8 @@ spec:
           value: "{{.KUBERNETES_SERVICE_PORT}}"
         - name: KUBERNETES_SERVICE_HOST
           value: "{{.KUBERNETES_SERVICE_HOST}}"
+        - name: OVN_CONTROLLER_INACTIVITY_PROBE
+          value: "{{.OVN_CONTROLLER_INACTIVITY_PROBE}}"
         - name: OVN_KUBE_LOG_LEVEL
           value: "4"
         - name: K8S_NODE
diff --git a/manifests/0000_70_cluster-network-operator_03_deployment.yaml b/manifests/0000_70_cluster-network-operator_03_deployment.yaml
index e45d91bb10..2d39d86f17 100644
--- a/manifests/0000_70_cluster-network-operator_03_deployment.yaml
+++ b/manifests/0000_70_cluster-network-operator_03_deployment.yaml
@@ -45,6 +45,12 @@ spec:
           value: "quay.io/openshift/origin-multus-route-override-cni:4.4"
         - name: OVN_IMAGE
           value: "quay.io/openshift/origin-ovn-kubernetes:4.3"
+        - name: OVN_NB_RAFT_ELECTION_TIMER
+          value: "5000"
+        - name: OVN_SB_RAFT_ELECTION_TIMER
+          value: "5000"
+        - name: OVN_CONTROLLER_INACTIVITY_PROBE
+          value: "30000"
         - name: KURYR_DAEMON_IMAGE
           value: "quay.io/openshift/origin-kuryr-cni:4.3"
         - name: KURYR_CONTROLLER_IMAGE
diff --git a/pkg/network/ovn_kubernetes.go b/pkg/network/ovn_kubernetes.go
index 7b32345eef..4310630a67 100644
--- a/pkg/network/ovn_kubernetes.go
+++ b/pkg/network/ovn_kubernetes.go
@@ -58,6 +58,9 @@ func renderOVNKubernetes(conf *operv1.NetworkSpec, bootstrapResult *bootstrap.Bo
 	data.Data["OVN_SB_PORT"] = OVN_SB_PORT
 	data.Data["OVN_NB_RAFT_PORT"] = OVN_NB_RAFT_PORT
 	data.Data["OVN_SB_RAFT_PORT"] = OVN_SB_RAFT_PORT
+	data.Data["OVN_NB_RAFT_ELECTION_TIMER"] = os.Getenv("OVN_NB_RAFT_ELECTION_TIMER")
+	data.Data["OVN_SB_RAFT_ELECTION_TIMER"] = os.Getenv("OVN_SB_RAFT_ELECTION_TIMER")
+	data.Data["OVN_CONTROLLER_INACTIVITY_PROBE"] = os.Getenv("OVN_CONTROLLER_INACTIVITY_PROBE")
 	data.Data["OVN_NB_DB_LIST"] = dbList(bootstrapResult.OVN.MasterIPs, OVN_NB_PORT)
 	data.Data["OVN_SB_DB_LIST"] = dbList(bootstrapResult.OVN.MasterIPs, OVN_SB_PORT)
 	data.Data["OVN_MASTER_IP"] = bootstrapResult.OVN.MasterIPs[0]