From d65ec5c878730e8e441256b1709e49b02a36ca62 Mon Sep 17 00:00:00 2001 From: Martin Kennelly Date: Tue, 15 Jul 2025 07:34:29 +0100 Subject: [PATCH 1/4] Fix ovnkube-controller-with-node shutdown sequence Currently, we are force exiting with the trap before the background processes can end, container is removed and the orphaned processes end early causing our config to go into an unknown state because we don't end in an orderly manner. Wait until the pid file for ovnkube controller with node is removed, which shows the process has completed. Signed-off-by: Martin Kennelly (cherry picked from commit 8b294197b9313e230f45c7143f3aff9a79e979dd) --- dist/images/ovnkube.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/dist/images/ovnkube.sh b/dist/images/ovnkube.sh index 3b41e7103c..833cb93fd4 100755 --- a/dist/images/ovnkube.sh +++ b/dist/images/ovnkube.sh @@ -418,6 +418,19 @@ wait_for_event() { done } +# wait_ovnkube_controller_with_node_done - Wait for ovnkube-controller-with-node process to complete +# Checks if the ovnkube-controller-with-node process is running by looking for its PID file. +# If the PID file exists, waits for that process to finish before continuing. +# If the PID file doesn't exist, it means the process has already exited. +wait_ovnkube_controller_with_node_done() { + local pid_file=${OVN_RUNDIR}/ovnkube-controller-with-node.pid + if [[ -f ${pid_file} ]]; then + echo "info: waiting on ovnkube-controller-with-node process to end" + wait $(cat $pid_file) + echo "info: done waiting for ovn-controller-with-node to end" + fi +} + # The ovnkube-db kubernetes service must be populated with OVN DB service endpoints # before various OVN K8s containers can come up. This functions checks for that. 
# If OVN dbs are configured to listen only on unix sockets, then there will not be @@ -1732,7 +1745,10 @@ ovnkube-controller() { } ovnkube-controller-with-node() { - trap 'kill $(jobs -p) ; rm -f /etc/cni/net.d/10-ovn-kubernetes.conf ; exit 0' TERM + # send sig term to background job (ovnkube-node process), remove CNI conf and resume background job until it ends. + # currently we send the process to the background, therefore wait until that process removes its pid file on exit. + # if the pid file doesn't exist, we exit immediately. + trap 'kill $(jobs -p) ; rm -f /etc/cni/net.d/10-ovn-kubernetes.conf ; wait_ovnkube_controller_with_node_done; exit 0' TERM check_ovn_daemonset_version "1.1.0" rm -f ${OVN_RUNDIR}/ovnkube-controller-with-node.pid From 50a94e1e7c040aca5aee86e9fb769bfe10eb8638 Mon Sep 17 00:00:00 2001 From: Martin Kennelly Date: Tue, 15 Jul 2025 07:32:00 +0100 Subject: [PATCH 2/4] ovn-controller: block GARP during startup Prevent ovn-controller from sending stale GARP by adding drop flows on external bridge patch ports until ovnkube-controller synchronizes the southbound database - henceforth known as "drop flows". This addresses race conditions where ovn-controller processes outdated SB DB state before ovnkube-controller updates it, particularly affecting EIP SNAT configurations attached to logical router ports. Fixes: https://issues.redhat.com/browse/FDP-1537 ovnkube-controller controls the lifecycle of the drop flows. ovs / ovn-controller running is required to configure external bridge. Downstream, the external bridge may be precreated and ovn-controller will use this. This fix considers three primary scenarios: node, container and pod restart. On Node restart, the ovs flows installed prior to reboot on the node are cleared but the external bridge exists. Add the flows before ovnkube controller with node starts. The reason to add it here is that our gateway code depends on ovn-controller started and running... 
There is now a race here between ovn-controller starting (and GARPing) before we set this flow but I think the risk is low however it needs serious testing. The reason I did not naturally add the drop flows before ovn-controller started is because I have no way to detect if it's a node reboot or pod reboot and I don't want to inject drop flows for a simple ovn-controller container restart which could disrupt traffic. ovnkube-controller starts, we create a new gateway and apply the same flows in order to ensure we always drop GARP when ovnkube controller hasn't synced. Remove the flows when ovnkube-controller has synced. There is also a race here between ovnkube-controller removing the flows and ovn-controller GARPing with stale SB DB info. There is no easy way to detect what SB DB data ovn-controller has consumed. On Pod restart, we add the drop flows before exit. ovnkube-controller-with-node will also add it before it starts the go code. Container restart: - ovnkube-controller: adds flows upon start and exit - ovn-controller: no changes While the drop flows are set, OVN may not be able to resolve IPs it doesn't know about in its Logical Router pipelines generation. Following removal of the drop flows, OVN may resolve the IPs using GARP requests. OVN-Controller always sends out GARPs with op code 1 on startup. 
Signed-off-by: Martin Kennelly (cherry picked from commit 82fc3bf7aac4f15ff7659f703c03c92947016a49) --- dist/images/ovnkube.sh | 47 +++++++++++ go-controller/cmd/ovnkube/ovnkube.go | 26 +++++- .../node_controller_manager.go | 38 ++++++++- .../pkg/libovsdb/util/northd_sync.go | 65 +++++++++++++++ .../pkg/node/bridgeconfig/bridgeconfig.go | 17 ++++ .../pkg/node/bridgeconfig/bridgeflows.go | 82 +++++++++++++++++++ go-controller/pkg/node/gateway.go | 10 +++ go-controller/pkg/node/openflow_manager.go | 9 ++ go-controller/pkg/node/types/const.go | 4 + 9 files changed, 292 insertions(+), 6 deletions(-) create mode 100644 go-controller/pkg/libovsdb/util/northd_sync.go diff --git a/dist/images/ovnkube.sh b/dist/images/ovnkube.sh index 833cb93fd4..1e0661f501 100755 --- a/dist/images/ovnkube.sh +++ b/dist/images/ovnkube.sh @@ -505,6 +505,36 @@ ovs_ready() { return 0 } +# get_bridge_name_for_physnet - Extract OVS bridge name for a given OVN physical network +# Takes an OVN network name for physical networks (physnet) and returns the corresponding +# OVS bridge name from the ovn-bridge-mappings configuration. +# Return empty string if not found. +get_bridge_name_for_physnet() { + local physnet="$1" + local mappings + mappings=$(ovs-vsctl --if-exists get open_vswitch . external_ids:ovn-bridge-mappings) + # Extract bridge name after physnet: and before next comma (or end) + # regex matches zero or more non-comma characters + # cut on colon and return field number 2 + echo "$mappings" | tr -d "\"" | grep -o "$physnet:[^,]*" | cut -d: -f2 +} + +# Adds drop flows for GARPs on patch port to br-int for specified bridge. +add_garp_drop_flow() { + local bridge="$1" + local cookie="0x0305" + local priority="498" + # if bridge exists, and the patch port is created, we expect to add at least one flow to a patch port ending in to-br-int. + # FIXME: can we generate the exact name. 
It's possible we add these flows to the incorrect port when selecting on substring + for port_name in $(ovs-vsctl list-ports $bridge); do + if [[ "$port_name" == *to-br-int ]]; then + local of_port=$(ovs-vsctl get interface $port_name ofport) + ovs-ofctl add-flow $bridge "cookie=$cookie,table=0,priority=$priority,in_port=$of_port,arp,arp_op=1,actions=drop" > /dev/null + break + fi + done +} + # Verify that the process is running either by checking for the PID in `ps` output # or by using `ovs-appctl` utility for the processes that support it. # $1 is the name of the process @@ -1773,6 +1803,23 @@ ovnkube-controller-with-node() { wait_for_event process_ready ovn-controller fi + # start temp work around + # remove when https://issues.redhat.com/browse/FDP-1537 is available + if [[ ${ovnkube_node_mode} == "full" && ${ovn_enable_interconnect} == "true" && ${ovn_egressip_enable} == "true" ]]; then + echo "=============== ovnkube-controller-with-node - (add GARP drop flows if external bridge exists)" + # bridge may not yet exist + local bridge_name="$(get_bridge_name_for_physnet 'physnet')" + if [[ "$bridge_name" != "" ]]; then + echo "=============== ovnkube-controller-with-node - found bridge mapping for physnet: $bridge_name" + # nothing to do if the external bridge isn't created. 
+ if ovs-vsctl br-exists $bridge_name; then + echo "=============== ovnkube-controller-with-node - found bridge $bridge_name" + add_garp_drop_flow "$bridge_name" + echo "=============== ovnkube-controller-with-node - (finished adding GARP drop flows)" + fi + fi + fi + ovn_routable_mtu_flag= if [[ -n "${routable_mtu}" ]]; then routable_mtu_flag="--routable-mtu ${routable_mtu}" diff --git a/go-controller/cmd/ovnkube/ovnkube.go b/go-controller/cmd/ovnkube/ovnkube.go index 6ce5de42c9..7d0e40af81 100644 --- a/go-controller/cmd/ovnkube/ovnkube.go +++ b/go-controller/cmd/ovnkube/ovnkube.go @@ -8,6 +8,7 @@ import ( "os/signal" "strings" "sync" + "sync/atomic" "syscall" "text/tabwriter" "text/template" @@ -29,6 +30,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/controllermanager" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb" + libovsdbutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/util" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" ovnnode "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/routemanager" @@ -486,6 +488,14 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util clusterManager.Stop() }() } + // when ovnkube is running in ovnkube-controller and ovnkube node mode in the same process, bool is used to inform ovnkube-node that ovnkube-controller + // has sync'd once and changes have propagated to SB DB. ovnkube-node will then remove flows for dropping GARPs. + // Remove when OVN supports native silencing of GARPs on startup: https://issues.redhat.com/browse/FDP-1537 + // isOVNKubeControllerSyncd is true when ovnkube controller has sync and changes are in OVN Southbound database. 
+ var isOVNKubeControllerSyncd *atomic.Bool + if runMode.ovnkubeController && runMode.node && config.OVNKubernetesFeature.EnableEgressIP && config.OVNKubernetesFeature.EnableInterconnect && config.OvnKubeNode.Mode == types.NodeModeFull { + isOVNKubeControllerSyncd = &atomic.Bool{} + } if runMode.ovnkubeController { wg.Add(1) @@ -522,10 +532,20 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util controllerErr = fmt.Errorf("failed to start network controller: %w", err) return } - // record delay until ready metrics.MetricOVNKubeControllerReadyDuration.Set(time.Since(startTime).Seconds()) + if isOVNKubeControllerSyncd != nil { + klog.Infof("Waiting for OVN northbound database changes to sync to OVN Southbound database") + if err = libovsdbutil.WaitUntilNorthdSyncOnce(ctx, libovsdbOvnNBClient, libovsdbOvnSBClient); err != nil { + controllerErr = fmt.Errorf("failed waiting for northd to sync OVN Northbound DB to Southbound: %v", err) + return + } else { + klog.Infof("OVN northbound database changes synced to OVN Southbound database") + isOVNKubeControllerSyncd.Store(true) + } + } + <-ctx.Done() controllerManager.Stop() }() @@ -569,7 +589,7 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util return } - err = nodeControllerManager.Start(ctx) + err = nodeControllerManager.Start(ctx, isOVNKubeControllerSyncd) if err != nil { nodeErr = fmt.Errorf("failed to start node network controller: %w", err) return @@ -579,7 +599,7 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util metrics.MetricNodeReadyDuration.Set(time.Since(startTime).Seconds()) <-ctx.Done() - nodeControllerManager.Stop() + nodeControllerManager.Stop(isOVNKubeControllerSyncd) }() } diff --git a/go-controller/pkg/controllermanager/node_controller_manager.go b/go-controller/pkg/controllermanager/node_controller_manager.go index 94fbf18fd2..495cb840a6 100644 --- 
a/go-controller/pkg/controllermanager/node_controller_manager.go +++ b/go-controller/pkg/controllermanager/node_controller_manager.go @@ -5,6 +5,7 @@ import ( "fmt" "strings" "sync" + "sync/atomic" "time" "k8s.io/apimachinery/pkg/util/sets" @@ -147,7 +148,7 @@ func (ncm *NodeControllerManager) initDefaultNodeNetworkController(ctx context.C } // Start the node network controller manager -func (ncm *NodeControllerManager) Start(ctx context.Context) (err error) { +func (ncm *NodeControllerManager) Start(ctx context.Context, isOVNKubeControllerSyncd *atomic.Bool) (err error) { klog.Infof("Starting the node network controller manager, Mode: %s", config.OvnKubeNode.Mode) // Initialize OVS exec runner; find OVS binaries that the CNI code uses. @@ -166,7 +167,7 @@ func (ncm *NodeControllerManager) Start(ctx context.Context) (err error) { defer func() { if err != nil { klog.Errorf("Stopping node network controller manager, err=%v", err) - ncm.Stop() + ncm.Stop(isOVNKubeControllerSyncd) } }() @@ -224,15 +225,46 @@ func (ncm *NodeControllerManager) Start(ctx context.Context) (err error) { return fmt.Errorf("failed to own priority %d for IP rules: %v", node.UDNMasqueradeIPRulePriority, err) } } + + // start workaround and remove when ovn has native support for silencing GARPs for LRPs + // https://issues.redhat.com/browse/FDP-1537 + // when in mode ovnkube controller with node, wait until ovnkube controller is syncd before removing drop flows for GARPs +waitForControllerSyncLoop: + for { + select { + case <-ctx.Done(): + return nil + default: + if isOVNKubeControllerSyncd != nil && !isOVNKubeControllerSyncd.Load() { + klog.V(5).Infof("Waiting for ovnkube controller to start before removing GARP drop flows") + time.Sleep(200 * time.Millisecond) + continue + } + klog.Infof("Removing flows to drop GARP") + ncm.defaultNodeNetworkController.Gateway.SetDefaultBridgeGARPDropFlows(false) + if err := ncm.defaultNodeNetworkController.Gateway.Reconcile(); err != nil { + return 
fmt.Errorf("failed to reconcile gateway after removing GARP drop flows for ext bridge: %v", err) + } + break waitForControllerSyncLoop + } + } + // end workaround + return nil } // Stop gracefully stops all managed controllers -func (ncm *NodeControllerManager) Stop() { +func (ncm *NodeControllerManager) Stop(isOVNKubeControllerSyncd *atomic.Bool) { // stop stale ovs ports cleanup close(ncm.stopChan) if ncm.defaultNodeNetworkController != nil { + if isOVNKubeControllerSyncd != nil { + ncm.defaultNodeNetworkController.Gateway.SetDefaultBridgeGARPDropFlows(true) + if err := ncm.defaultNodeNetworkController.Gateway.Reconcile(); err != nil { + klog.Errorf("Failed to reconcile gateway after attempting to add flows to the external bridge to drop GARPs: %v", err) + } + } ncm.defaultNodeNetworkController.Stop() } diff --git a/go-controller/pkg/libovsdb/util/northd_sync.go b/go-controller/pkg/libovsdb/util/northd_sync.go new file mode 100644 index 0000000000..40aa212680 --- /dev/null +++ b/go-controller/pkg/libovsdb/util/northd_sync.go @@ -0,0 +1,65 @@ +package util + +import ( + "context" + "errors" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/ovn-kubernetes/libovsdb/client" + "github.com/ovn-kubernetes/libovsdb/model" + "github.com/ovn-kubernetes/libovsdb/ovsdb" + + libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/sbdb" +) + +// WaitUntilNorthdSyncOnce ensures northd has sync'd at least once by increments nb_cfg value in NB DB and waiting +// for northd to copy it to SB DB. Poll SB DB until context is cancelled. +// The expectation is that the data you wish to be sync'd to SB DB has already been written to NB DB so when we get the initial +// nb_cfg value, we know that if we increment that by one and see that value or greater in SB DB, then the data has sync'd. 
+// All other processes interacting with nb_cfg increment it. This function depends on other processes respecting that. +// No guarantee of any changes in SB DB made after this func. +func WaitUntilNorthdSyncOnce(ctx context.Context, nbClient, sbClient client.Client) error { + // 1. Get value of nb_cfg + // 2. Increment value of nb_cfg + // 3. Wait until value appears in SB DB after northd copies it. + nbGlobal := &nbdb.NBGlobal{} + nbGlobal, err := libovsdbops.GetNBGlobal(nbClient, nbGlobal) + if err != nil { + return fmt.Errorf("failed to find OVN Northbound NB_Global table"+ + " entry: %w", err) + } + // increment nb_cfg value by 1. When northd consumes updates from NB DB, it will copy this value to SB DBs SB_Global table nb_cfg field. + ops, err := nbClient.Where(nbGlobal).Mutate(nbGlobal, model.Mutation{ + Field: &nbGlobal.NbCfg, + Mutator: ovsdb.MutateOperationAdd, + Value: 1, + }) + if err != nil { + return fmt.Errorf("failed to generate ops to mutate nb_cfg: %w", err) + } + expectedNbCfgValue := nbGlobal.NbCfg + 1 + if _, err = libovsdbops.TransactAndCheck(nbClient, ops); err != nil { + return fmt.Errorf("failed to transact to increment nb_cfg: %w", err) + } + sbGlobal := &sbdb.SBGlobal{} + // poll until we see the expected value in SB DB every 5 milliseconds until context is cancelled. 
+ err = wait.PollUntilContextCancel(ctx, time.Millisecond*5, true, func(_ context.Context) (done bool, err error) { + if sbGlobal, err = libovsdbops.GetSBGlobal(sbClient, sbGlobal); err != nil { + // northd hasn't added an entry yet + if errors.Is(err, client.ErrNotFound) { + return false, nil + } + return false, fmt.Errorf("failed to get sb_global table entry from SB DB: %w", err) + } + return sbGlobal.NbCfg >= expectedNbCfgValue, nil // we only need to ensure it is greater than or equal to the expected value + }) + if err != nil { + return fmt.Errorf("failed while waiting for nb_cfg value greater than or equal %d in sb db sb_global table: %w", expectedNbCfgValue, err) + } + return nil +} diff --git a/go-controller/pkg/node/bridgeconfig/bridgeconfig.go b/go-controller/pkg/node/bridgeconfig/bridgeconfig.go index c8b4ac647e..849740e58c 100644 --- a/go-controller/pkg/node/bridgeconfig/bridgeconfig.go +++ b/go-controller/pkg/node/bridgeconfig/bridgeconfig.go @@ -82,6 +82,7 @@ type BridgeConfiguration struct { ofPortPhys string netConfig map[string]*BridgeUDNConfiguration eipMarkIPs *egressip.MarkIPsCache + dropGARP bool } func NewBridgeConfiguration(intfName, nodeName, @@ -110,6 +111,16 @@ func NewBridgeConfiguration(intfName, nodeName, } res.netConfig[types.DefaultNetworkName].Advertised.Store(advertised) + // temp workaround for https://issues.redhat.com/browse/FDP-1537 + // we need to ensure we continue dropping GARPs for any new bridge config if the run mode is ovnkube controller + ovnkube node + IC + single zone node + // FIXME: only add if run mode is ovnkube controller + node in single process + if config.OVNKubernetesFeature.EnableEgressIP && config.OVNKubernetesFeature.EnableInterconnect && config.OvnKubeNode.Mode == types.NodeModeFull { + // drop by default - set to false later when ovnkube controller has sync'd and changes propagated to OVN southbound database + // we should also match on run mode here to ensure ovnkube controller + ovnkube node are running 
in the same process + res.dropGARP = true + } + // end temp work around + if config.Gateway.GatewayAcceleratedInterface != "" { // Try to get representor for the specified gateway device. // If function succeeds, then it is either a valid switchdev VF or SF, and we can use this accelerated device @@ -477,6 +488,12 @@ func (b *BridgeConfiguration) SetEIPMarkIPs(eipMarkIPs *egressip.MarkIPsCache) { b.eipMarkIPs = eipMarkIPs } +func (b *BridgeConfiguration) SetDropGARP(drop bool) { + b.mutex.Lock() + defer b.mutex.Unlock() + b.dropGARP = drop +} + func gatewayReady(patchPort string) bool { // Get ofport of patchPort ofport, _, err := util.GetOVSOfPort("--if-exists", "get", "interface", patchPort, "ofport") diff --git a/go-controller/pkg/node/bridgeconfig/bridgeflows.go b/go-controller/pkg/node/bridgeconfig/bridgeflows.go index 8a858c30e9..8d6f248c59 100644 --- a/go-controller/pkg/node/bridgeconfig/bridgeflows.go +++ b/go-controller/pkg/node/bridgeconfig/bridgeflows.go @@ -57,6 +57,19 @@ func (b *BridgeConfiguration) flowsForDefaultBridge(extraIPs []net.IP) ([]string mod_vlan_id = fmt.Sprintf("mod_vlan_vid:%d,", config.Gateway.VLANID) } + // Problem: ovn-controller connects to SB DB and then GARPs for any EIPs configured however for IC, SB DB maybe stale if + // ovnkube-controller is not processing. + // Solution: add a logical flow on startup to allow GARPs from Node IPs but drop other GARPs and remove when ovnkube-controller + // has sync'd and changes propagated to OVN SB DB. + // remove when ovn contains native support for logical router ports to contain an option to silence GARPs on startup of ovn-controller. + // https://issues.redhat.com/browse/FDP-1537 + if b.dropGARP { + // priority 499 flows to allow GARP pkts when src IP is a Node IP + dftFlows = append(dftFlows, b.allowNodeIPGARPFlows(extraIPs)...) + // priority 498 flows to drop GARP pkts with no regards to src IP + dftFlows = append(dftFlows, b.dropGARPFlows()...) 
+ } + if config.IPv4Mode { // table0, Geneve packets coming from external. Skip conntrack and go directly to host // if dest mac is the shared mac send directly to host. @@ -498,6 +511,26 @@ func generateIPFragmentReassemblyFlow(ofPortPhys string) []string { return flows } +// generateGratuitousARPDropFlow returns a single flow to drop GARPs +// Remove when https://issues.redhat.com/browse/FDP-1537 available +func generateGratuitousARPDropFlow(inPort string, priority int) string { + // set to op code 1 - see rfc5227 particularly section: + // Why Are ARP Announcements Performed Using ARP Request Packets and Not ARP Reply Packets? + // ovn follows this practise of using op code 1 + return fmt.Sprintf("cookie=%s,table=0,priority=%d,in_port=%s,dl_dst=ff:ff:ff:ff:ff:ff,arp,arp_op=1,actions=drop", + nodetypes.GARPCookie, priority, inPort) +} + +// generateGratuitousARPAllowFlow returns a single flow to allow GARP only for a specific source IP. +// Remove when https://issues.redhat.com/browse/FDP-1537 available +func generateGratuitousARPAllowFlow(inPort string, ip net.IP, priority int) string { + // set to op code 1 - see rfc5227 particularly section: + // Why Are ARP Announcements Performed Using ARP Request Packets and Not ARP Reply Packets? + // ovn follows this practise of using op code 1 + return fmt.Sprintf("cookie=%s,table=0,priority=%d,in_port=%s,dl_dst=ff:ff:ff:ff:ff:ff,arp,arp_op=1,arp_spa=%s,actions=output:NORMAL", + nodetypes.GARPCookie, priority, inPort, ip) +} + // must be called with bridge.mutex held func (b *BridgeConfiguration) commonFlows(hostSubnets []*net.IPNet) ([]string, error) { // CAUTION: when adding new flows where the in_port is ofPortPatch and the out_port is ofPortPhys, ensure @@ -910,6 +943,55 @@ func (b *BridgeConfiguration) PMTUDDropFlows(ipAddrs []string) []string { return flows } +// dropGARPFlows generates the ovs flows for dropping gratuitous ARPs for cluster default network traffic only. 
+// bridgeConfiguration lock must be held by caller +func (b *BridgeConfiguration) dropGARPFlows() []string { + if config.Gateway.Mode != config.GatewayModeShared || !config.IPv4Mode { + return nil + } + const priority = 498 + var flows []string + + defaultNetInfo := util.DefaultNetInfo{} + defaultNetPatchPortName := defaultNetInfo.GetNetworkScopedPatchPortName(b.bridgeName, b.nodeName) + + for _, netConfig := range b.patchedNetConfigs() { + if netConfig.PatchPort != defaultNetPatchPortName { + continue + } + flows = append(flows, generateGratuitousARPDropFlow(netConfig.OfPortPatch, priority)) + } + return flows +} + +// allowNodeIPGARPFlows generates the OVS flows to allow gratuitous ARPs for Node IP(s) for the cluster default network traffic only. +// bridgeConfiguration lock must be held by caller. +// Remove when https://issues.redhat.com/browse/FDP-1537 is available +func (b *BridgeConfiguration) allowNodeIPGARPFlows(nodeIPs []net.IP) []string { + if config.Gateway.Mode != config.GatewayModeShared || !config.IPv4Mode { + return nil + } + const priority = 499 + var flows []string + + defaultNetInfo := util.DefaultNetInfo{} + defaultNetPatchPortName := defaultNetInfo.GetNetworkScopedPatchPortName(b.bridgeName, b.nodeName) + + for _, netConfig := range b.patchedNetConfigs() { + if netConfig.PatchPort != defaultNetPatchPortName { + continue + } + for _, nodeIP := range nodeIPs { + if nodeIP == nil || nodeIP.IsUnspecified() || utilnet.IsIPv6(nodeIP) { + continue + } + flows = append(flows, generateGratuitousARPAllowFlow(netConfig.OfPortPatch, nodeIP, priority)) + } + + } + return flows +} + func getIPv(ipnet *net.IPNet) string { prefix := "ip" if utilnet.IsIPv6CIDR(ipnet) { diff --git a/go-controller/pkg/node/gateway.go b/go-controller/pkg/node/gateway.go index 35267261f2..97e7baeecb 100644 --- a/go-controller/pkg/node/gateway.go +++ b/go-controller/pkg/node/gateway.go @@ -37,6 +37,7 @@ type Gateway interface { GetGatewayIface() string 
SetDefaultGatewayBridgeMAC(addr net.HardwareAddr) SetDefaultPodNetworkAdvertised(bool) + SetDefaultBridgeGARPDropFlows(bool) Reconcile() error } @@ -482,6 +483,15 @@ func (g *gateway) GetDefaultPodNetworkAdvertised() bool { return g.openflowManager.defaultBridge.GetNetworkConfig(types.DefaultNetworkName).Advertised.Load() } +// SetDefaultBridgeGARPDropFlows will enable flows to drop GARPs if the openflow +// manager has been initialized. +func (g *gateway) SetDefaultBridgeGARPDropFlows(isDropped bool) { + if g.openflowManager == nil { + return + } + g.openflowManager.setDefaultBridgeGARPDrop(isDropped) +} + // Reconcile handles triggering updates to different components of a gateway, like OFM, Services func (g *gateway) Reconcile() error { klog.Info("Reconciling gateway with updates") diff --git a/go-controller/pkg/node/openflow_manager.go b/go-controller/pkg/node/openflow_manager.go index 47a397766b..b55fff21cd 100644 --- a/go-controller/pkg/node/openflow_manager.go +++ b/go-controller/pkg/node/openflow_manager.go @@ -78,6 +78,12 @@ func (c *openflowManager) setDefaultBridgeMAC(macAddr net.HardwareAddr) { c.defaultBridge.SetMAC(macAddr) } +// setDefaultBridgeGARPDrop is used to enable or disable whether openflow manager generates ovs flows and adds them to +// the default ext bridge to drop GARP +func (c *openflowManager) setDefaultBridgeGARPDrop(isDropped bool) { + c.defaultBridge.SetDropGARP(isDropped) +} + func (c *openflowManager) updateFlowCacheEntry(key string, flows []string) { c.flowMutex.Lock() defer c.flowMutex.Unlock() @@ -193,6 +199,9 @@ func (c *openflowManager) Run(stopChan <-chan struct{}, doneWg *sync.WaitGroup) c.syncFlows() timer.Reset(syncPeriod) case <-stopChan: + // sync before shutting down because flows maybe added, and theres a race between flow channel (req sync) + // and stop chan on shutdown. 
ensure flows are sync before shut down + c.syncFlows() return } } diff --git a/go-controller/pkg/node/types/const.go b/go-controller/pkg/node/types/const.go index bdf9c388bf..a07f4b166f 100644 --- a/go-controller/pkg/node/types/const.go +++ b/go-controller/pkg/node/types/const.go @@ -17,6 +17,10 @@ const ( // PmtudOpenFlowCookie identifies the flows used to drop ICMP type (3) destination unreachable, // fragmentation-needed (4) PmtudOpenFlowCookie = "0x0304" + // GARPCookie identifies the flows used to allow node IPs and drop other GARPs from CDN. + // Temp workaround until OVN has native supported for silencing GARPs on startup. + // https://issues.redhat.com/browse/FDP-1537 + GARPCookie = "0x0305" // CtMarkHost is the conntrack mark value for host traffic CtMarkHost = "0x2" ) From a7869b2c9dcf46332262fae482b351099e42b28e Mon Sep 17 00:00:00 2001 From: Martin Kennelly Date: Tue, 9 Sep 2025 15:41:27 +0100 Subject: [PATCH 3/4] Node controller shutdown: do not ref gateway if not set PR 5373 to drop the GARP flows didnt consider that we set the default network controller and later we set the gateway obj. In-between this period, ovnkube node may receive a stop signal and we do not guard against accessing the gateway if its not yet set. OVNKube controller may have sync'd before the gateway obj is set. There is nothing to reconcile if the gateway is not set. 
Signed-off-by: Martin Kennelly (cherry picked from commit e60220a637e083b09360d8a8c9f8959593600f07) --- go-controller/pkg/controllermanager/node_controller_manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go-controller/pkg/controllermanager/node_controller_manager.go b/go-controller/pkg/controllermanager/node_controller_manager.go index 495cb840a6..716cb76869 100644 --- a/go-controller/pkg/controllermanager/node_controller_manager.go +++ b/go-controller/pkg/controllermanager/node_controller_manager.go @@ -259,7 +259,7 @@ func (ncm *NodeControllerManager) Stop(isOVNKubeControllerSyncd *atomic.Bool) { close(ncm.stopChan) if ncm.defaultNodeNetworkController != nil { - if isOVNKubeControllerSyncd != nil { + if isOVNKubeControllerSyncd != nil && ncm.defaultNodeNetworkController.Gateway != nil { ncm.defaultNodeNetworkController.Gateway.SetDefaultBridgeGARPDropFlows(true) if err := ncm.defaultNodeNetworkController.Gateway.Reconcile(); err != nil { klog.Errorf("Failed to reconcile gateway after attempting to add flows to the external bridge to drop GARPs: %v", err) From a4776fb3518e7751d9c6e3ef858da60fe165cadc Mon Sep 17 00:00:00 2001 From: Martin Kennelly Date: Mon, 6 Oct 2025 12:17:35 +0100 Subject: [PATCH 4/4] [IC][EIP] OCPBUGS-42303: Fix race between SB DB and OVN-Controller Ensure ovn-controller has processed the SB DB updates before removing the GARP drop flows by utilizing the hv_cfg field in NB_Global [1] OVNKube controller increments the nb_cfg value post sync, which is copied to SB DB by northd. OVN-Controllers copy this nb_cfg value from SB DB and write it to their chassis_private tables nb_cfg field after they have processed the SB DB changes. Northd will then look at all the chassis_private tables nb_cfg value and set the NB DBs Nb_global hv_cfg value to the min integer found. 
Since IC currently only supports one node per zone, we can be sure ovn-controller is running locally and therefore its ok to block removing the drop GARP flows. [1] https://man7.org/linux/man-pages/man5/ovn-nb.5.html Signed-off-by: Martin Kennelly (cherry picked from commit 3b5da01cb280067b065eca7738f5f502e9faa3a0) --- go-controller/cmd/ovnkube/ovnkube.go | 8 ++-- .../pkg/libovsdb/util/northd_sync.go | 45 +++++++++++-------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/go-controller/cmd/ovnkube/ovnkube.go b/go-controller/cmd/ovnkube/ovnkube.go index 7d0e40af81..742ec8ed12 100644 --- a/go-controller/cmd/ovnkube/ovnkube.go +++ b/go-controller/cmd/ovnkube/ovnkube.go @@ -536,12 +536,12 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util metrics.MetricOVNKubeControllerReadyDuration.Set(time.Since(startTime).Seconds()) if isOVNKubeControllerSyncd != nil { - klog.Infof("Waiting for OVN northbound database changes to sync to OVN Southbound database") - if err = libovsdbutil.WaitUntilNorthdSyncOnce(ctx, libovsdbOvnNBClient, libovsdbOvnSBClient); err != nil { - controllerErr = fmt.Errorf("failed waiting for northd to sync OVN Northbound DB to Southbound: %v", err) + klog.Infof("Waiting for OVN northbound database changes to be processed by ovn-controller") + if err = libovsdbutil.WaitUntilFlowsInstalled(ctx, libovsdbOvnNBClient); err != nil { + controllerErr = fmt.Errorf("failed waiting for OVN northbound database changes to be processed by ovn-controller: %v", err) return } else { - klog.Infof("OVN northbound database changes synced to OVN Southbound database") + klog.Infof("Finished waiting for OVN northbound database changes to be processed by ovn-controller") isOVNKubeControllerSyncd.Store(true) } } diff --git a/go-controller/pkg/libovsdb/util/northd_sync.go b/go-controller/pkg/libovsdb/util/northd_sync.go index 40aa212680..9712f0b05d 100644 --- a/go-controller/pkg/libovsdb/util/northd_sync.go +++ 
b/go-controller/pkg/libovsdb/util/northd_sync.go @@ -14,26 +14,26 @@ import ( libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/sbdb" ) -// WaitUntilNorthdSyncOnce ensures northd has sync'd at least once by increments nb_cfg value in NB DB and waiting -// for northd to copy it to SB DB. Poll SB DB until context is cancelled. -// The expectation is that the data you wish to be sync'd to SB DB has already been written to NB DB so when we get the initial -// nb_cfg value, we know that if we increment that by one and see that value or greater in SB DB, then the data has sync'd. -// All other processes interacting with nb_cfg increment it. This function depends on other processes respecting that. -// No guarantee of any changes in SB DB made after this func. -func WaitUntilNorthdSyncOnce(ctx context.Context, nbClient, sbClient client.Client) error { +// WaitUntilFlowsInstalled ensures that ovn-controller has sync'd at least once by incrementing nb_cfg value in NB DB +// and waiting for northd to write back a value equal or greater to the hv_cfg field in NB_Global. +// See https://www.ovn.org/support/dist-docs/ovn-nb.5.html for more info regarding nb_cfg / hv_cfg fields. +// The expectation is that the data you wish to be sync'd is already written to NB DB. +// Note: if the ovn-controller is down, this will block until it comes back up, therefore this func should only +// be used with IC mode and one node per zone. +func WaitUntilFlowsInstalled(ctx context.Context, nbClient client.Client) error { // 1. Get value of nb_cfg // 2. Increment value of nb_cfg - // 3. Wait until value appears in SB DB after northd copies it. + // 3. 
Wait until value appears in hv_cfg field thus ensuring ovn-controller has processed the changes nbGlobal := &nbdb.NBGlobal{} nbGlobal, err := libovsdbops.GetNBGlobal(nbClient, nbGlobal) if err != nil { return fmt.Errorf("failed to find OVN Northbound NB_Global table"+ " entry: %w", err) } - // increment nb_cfg value by 1. When northd consumes updates from NB DB, it will copy this value to SB DBs SB_Global table nb_cfg field. + // increment nb_cfg value by 1. When northd consumes updates from NB DB, it will copy this value to SB DB's SB_Global + // table nb_cfg field. ops, err := nbClient.Where(nbGlobal).Mutate(nbGlobal, model.Mutation{ Field: &nbGlobal.NbCfg, Mutator: ovsdb.MutateOperationAdd, @@ -42,24 +42,31 @@ func WaitUntilNorthdSyncOnce(ctx context.Context, nbClient, sbClient client.Clie if err != nil { return fmt.Errorf("failed to generate ops to mutate nb_cfg: %w", err) } - expectedNbCfgValue := nbGlobal.NbCfg + 1 if _, err = libovsdbops.TransactAndCheck(nbClient, ops); err != nil { return fmt.Errorf("failed to transact to increment nb_cfg: %w", err) } - sbGlobal := &sbdb.SBGlobal{} - // poll until we see the expected value in SB DB every 5 milliseconds until context is cancelled. - err = wait.PollUntilContextCancel(ctx, time.Millisecond*5, true, func(_ context.Context) (done bool, err error) { - if sbGlobal, err = libovsdbops.GetSBGlobal(sbClient, sbGlobal); err != nil { + expectedHvCfgValue := nbGlobal.NbCfg + 1 + if expectedHvCfgValue < 0 { // handle overflow + expectedHvCfgValue = 0 + } + nbGlobal = &nbdb.NBGlobal{} + // ovn-northd sets hv_cfg to the lowest int value found for all chassis in the system (IC mode, + // we support a single chassis per zone) as reported in the Chassis_Private table in the southbound database. + // Thus, hv_cfg equals nb_cfg for the single chassis once it has caught up with the NB DB changes we want. + // poll until we see the expected value in NB DB every 5 milliseconds until context is cancelled. 
+ err = wait.PollUntilContextCancel(ctx, time.Millisecond*5, true, func(_ context.Context) (done bool, err2 error) { + if nbGlobal, err2 = libovsdbops.GetNBGlobal(nbClient, nbGlobal); err2 != nil { // northd hasn't added an entry yet - if errors.Is(err, client.ErrNotFound) { + if errors.Is(err2, client.ErrNotFound) { return false, nil } - return false, fmt.Errorf("failed to get sb_global table entry from SB DB: %w", err) + return false, fmt.Errorf("failed to get nb_global table entry from NB DB: %w", err2) } - return sbGlobal.NbCfg >= expectedNbCfgValue, nil // we only need to ensure it is greater than or equal to the expected value + return nbGlobal.HvCfg >= expectedHvCfgValue, nil // we only need to ensure it is greater than or equal to the expected value }) if err != nil { - return fmt.Errorf("failed while waiting for nb_cfg value greater than or equal %d in sb db sb_global table: %w", expectedNbCfgValue, err) + return fmt.Errorf("failed while waiting for hv_cfg value greater than or equal %d in NB DB nb_global table: %w", + expectedHvCfgValue, err) } return nil }