From d8a1cb70c7336aed3b5cfd1e8d6489730826388b Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Wed, 1 Mar 2023 08:36:29 +0800 Subject: [PATCH] [dualtor] Fix neighbor miss when mux is not ready (#2676) What I did The issue is that MuxOrch::m_syncdNeighbors assumes all cached zero-mac neighbors have tunnel routes installed. The check before adding tunnel routes ignores the following zero-mac neighbor events. Let's ensure that, if a zero-mac neighbor is present in MuxOrch::m_syncdNeighbors, it has a tunnel route installed. So let's check the tunnel route install success before adding a neighbor to MuxOrch::m_syncdNeighbors. Why I did it To fix #2675 If MuxOrch is not fully initialized, and there is a FAILED neighbor added to kernel, the tunnel route creation will fail. But the subsequent FAILED neighbor events cannot trigger tunnel route creation because MuxOrch::m_syncdNeighbors caches the first event and regard the tunnel as already installed. How I verified it UT and verify on testbed. Signed-off-by: Longxiang Lyu --- orchagent/muxorch.cpp | 11 ++++++++--- orchagent/muxorch.h | 2 ++ orchagent/neighorch.cpp | 42 +++++++++++++++++++++++++++++++---------- orchagent/neighorch.h | 2 +- tests/test_mux.py | 37 +++++++++++++++++++++++++++++++++++- 5 files changed, 79 insertions(+), 15 deletions(-) diff --git a/orchagent/muxorch.cpp b/orchagent/muxorch.cpp index 3fffd18946e7..07b015616195 100644 --- a/orchagent/muxorch.cpp +++ b/orchagent/muxorch.cpp @@ -1104,7 +1104,7 @@ void MuxOrch::updateNeighbor(const NeighborUpdate& update) return; } - auto standalone_tunnel_neigh_it = standalone_tunnel_neighbors_.find(update.entry.ip_address); + bool is_tunnel_route_installed = isStandaloneTunnelRouteInstalled(update.entry.ip_address); // Handling zero MAC neighbor updates if (!update.mac) { @@ -1115,7 +1115,7 @@ void MuxOrch::updateNeighbor(const NeighborUpdate& update) if (update.add) { - if (standalone_tunnel_neigh_it == standalone_tunnel_neighbors_.end()) + if (!is_tunnel_route_installed) { createStandaloneTunnelRoute(update.entry.ip_address); } @@ -1130,7 +1130,7 @@ void MuxOrch::updateNeighbor(const NeighborUpdate& update) * make sure to remove any existing tunnel routes to prevent conflicts. * This block also covers the case of neighbor deletion. */ - if (standalone_tunnel_neigh_it != standalone_tunnel_neighbors_.end()) + if (is_tunnel_route_installed) { removeStandaloneTunnelRoute(update.entry.ip_address); } @@ -1474,6 +1474,11 @@ void MuxOrch::removeStandaloneTunnelRoute(IpAddress neighborIp) standalone_tunnel_neighbors_.erase(neighborIp); } +bool MuxOrch::isStandaloneTunnelRouteInstalled(const IpAddress& neighborIp) +{ + return standalone_tunnel_neighbors_.find(neighborIp) != standalone_tunnel_neighbors_.end(); +} + MuxCableOrch::MuxCableOrch(DBConnector *db, DBConnector *sdb, const std::string& tableName): Orch2(db, tableName, request_), app_tunnel_route_table_(db, APP_TUNNEL_ROUTE_TABLE_NAME), diff --git a/orchagent/muxorch.h b/orchagent/muxorch.h index f4126979f656..d2590168cc12 100644 --- a/orchagent/muxorch.h +++ b/orchagent/muxorch.h @@ -202,6 +202,8 @@ class MuxOrch : public Orch2, public Observer, public Subject bool removeNextHopTunnel(std::string tunnelKey, IpAddress& ipAddr); sai_object_id_t getNextHopTunnelId(std::string tunnelKey, IpAddress& ipAddr); + bool isStandaloneTunnelRouteInstalled(const IpAddress& neighborIp); + private: virtual bool addOperation(const Request& request); virtual bool delOperation(const Request& request); diff --git a/orchagent/neighorch.cpp b/orchagent/neighorch.cpp index 5207b2430c82..8e4c668a57e5 100644 --- a/orchagent/neighorch.cpp +++ b/orchagent/neighorch.cpp @@ -739,17 +739,33 @@ void NeighOrch::doTask(Consumer &consumer) mac_address = MacAddress(fvValue(*i)); } - if (m_syncdNeighbors.find(neighbor_entry) == m_syncdNeighbors.end() - || m_syncdNeighbors[neighbor_entry].mac != mac_address) + bool nbr_not_found = (m_syncdNeighbors.find(neighbor_entry) == m_syncdNeighbors.end()); + if (nbr_not_found || m_syncdNeighbors[neighbor_entry].mac != mac_address) { - // only for unresolvable neighbors that are new - if (!mac_address) + if (!mac_address) { - if (m_syncdNeighbors.find(neighbor_entry) == m_syncdNeighbors.end()) + if (nbr_not_found) { - addZeroMacTunnelRoute(neighbor_entry, mac_address); + // only for unresolvable neighbors that are new + if (addZeroMacTunnelRoute(neighbor_entry, mac_address)) + { + it = consumer.m_toSync.erase(it); + } + else + { + it++; + continue; + } + } + else + { + /* + * For neighbors that were previously resolvable but are now unresolvable, + * we expect such neighbor entries to be deleted prior to a zero MAC update + * arriving for that same neighbor. + */ + it = consumer.m_toSync.erase(it); } - it = consumer.m_toSync.erase(it); } else if (addNeighbor(neighbor_entry, mac_address)) { @@ -1755,12 +1771,18 @@ void NeighOrch::updateSrv6Nexthop(const NextHopKey &nh, const sai_object_id_t &n m_syncdNextHops.erase(nh); } } -void NeighOrch::addZeroMacTunnelRoute(const NeighborEntry& entry, const MacAddress& mac) + +bool NeighOrch::addZeroMacTunnelRoute(const NeighborEntry& entry, const MacAddress& mac) { SWSS_LOG_INFO("Creating tunnel route for neighbor %s", entry.ip_address.to_string().c_str()); MuxOrch* mux_orch = gDirectory.get(); NeighborUpdate update = {entry, mac, true}; mux_orch->update(SUBJECT_TYPE_NEIGH_CHANGE, static_cast(&update)); - m_syncdNeighbors[entry] = { mac, false }; -} + if (mux_orch->isStandaloneTunnelRouteInstalled(entry.ip_address)) + { + m_syncdNeighbors[entry] = { mac, false }; + return true; + } + return false; +} diff --git a/orchagent/neighorch.h b/orchagent/neighorch.h index 727797757f4e..7dc5d386b47b 100644 --- a/orchagent/neighorch.h +++ b/orchagent/neighorch.h @@ -116,7 +116,7 @@ class NeighOrch : public Orch, public Subject, public Observer bool resolveNeighborEntry(const NeighborEntry &, const MacAddress &); void clearResolvedNeighborEntry(const NeighborEntry &); - void addZeroMacTunnelRoute(const NeighborEntry &, const MacAddress &); + bool addZeroMacTunnelRoute(const NeighborEntry &, const MacAddress &); }; #endif /* SWSS_NEIGHORCH_H */ diff --git a/tests/test_mux.py b/tests/test_mux.py index 8313980130ad..15892963cc79 100644 --- a/tests/test_mux.py +++ b/tests/test_mux.py @@ -1173,12 +1173,16 @@ def test_Route(self, dvs, dvs_route, testlog): self.create_and_test_route(appdb, asicdb, dvs, dvs_route) - def test_NH(self, dvs, dvs_route, intf_fdb_map, setup_peer_switch, setup_tunnel, testlog): + def test_NH(self, dvs, dvs_route, intf_fdb_map, setup, setup_mux_cable, + setup_peer_switch, setup_tunnel, testlog): """ test NH routes and mux state change """ appdb = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0) asicdb = dvs.get_asic_db() mac = intf_fdb_map["Ethernet0"] + # get tunnel nexthop + self.check_tnl_nexthop_in_asic_db(asicdb) + self.create_and_test_NH_routes(appdb, asicdb, dvs, dvs_route, mac) def test_acl(self, dvs, dvs_acl, testlog): @@ -1226,6 +1230,37 @@ def test_neighbor_miss( expected_mac=mac if exp_result[REAL_MAC] else '00:00:00:00:00:00' ) + def test_neighbor_miss_no_mux( + self, dvs, dvs_route, setup_vlan, setup_tunnel, setup, + setup_peer_switch, neighbor_cleanup, testlog + ): + config_db = dvs.get_config_db() + appdb = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0) + + test_ip = self.SERV1_SOC_IPV4 + self.ping_ip(dvs, test_ip) + + # no mux present, no standalone tunnel route installed + self.check_neighbor_state(dvs, dvs_route, test_ip, expect_route=False) + + # setup the mux + config_db = dvs.get_config_db() + self.create_mux_cable(config_db) + # tunnel route should be installed immediately after mux setup + self.check_neighbor_state(dvs, dvs_route, test_ip, expect_route=True) + + # set port state as standby + self.set_mux_state(appdb, "Ethernet0", "standby") + self.check_neighbor_state(dvs, dvs_route, test_ip, expect_route=True) + + # set port state as active + self.set_mux_state(appdb, "Ethernet0", "active") + self.check_neighbor_state(dvs, dvs_route, test_ip, expect_route=True) + + # clear the FAILED neighbor + self.clear_neighbors(dvs) + self.check_neighbor_state(dvs, dvs_route, test_ip, expect_route=False) + def test_neighbor_miss_no_peer( self, dvs, dvs_route, setup_vlan, setup_mux_cable, setup_tunnel, remove_peer_switch, neighbor_cleanup, testlog