From 7a92100c1c1a344e67d4501cb7b98de557d0ea52 Mon Sep 17 00:00:00 2001 From: Shi Su <67605788+shi-su@users.noreply.github.com> Date: Mon, 16 Nov 2020 14:15:31 -0800 Subject: [PATCH] [Routeorch] Fix next hop group reference count in bulk operation (#1501) What I did Remove next-hop groups after updating the reference counter for a bulk of routes instead of removing next-hop groups in the loop of updating the reference counter. Fix Azure/sonic-buildimage#5813 Why I did it The bulk route API has two loops of updating the reference counter: 1. update the sairedis reference counter 2. update the orchagent reference counter Before this commit, the removal of next-hop groups is triggered in the second loop of updating the orchagent reference counter when the reference counter decreases to zero. This may result in a reference counter mismatch between orchagent and sairedis since the sairedis reference counter has already included the operation of the whole bulk but the orchagent reference counter has not. Therefore, the removal of next-hop group may fail due to the mismatch in reference counter (e.g., there are some other routes point to the next-hop group but has not been counted in orchagent yet). To fix this problem, the next-hop group removal operation should be done after updating the reference counter of the whole bulk to make sure the reference counters sairedis and orchagent matches. --- orchagent/routeorch.cpp | 17 ++++++++++++----- orchagent/routeorch.h | 2 ++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/orchagent/routeorch.cpp b/orchagent/routeorch.cpp index ef34cc9b046a..7166effcd5e7 100644 --- a/orchagent/routeorch.cpp +++ b/orchagent/routeorch.cpp @@ -640,6 +640,7 @@ void RouteOrch::doTask(Consumer& consumer) // Go through the bulker results auto it_prev = consumer.m_toSync.begin(); + m_bulkNhgReducedRefCnt.clear(); while (it_prev != it) { KeyOpFieldsValuesTuple t = it_prev->second; @@ -708,6 +709,15 @@ void RouteOrch::doTask(Consumer& consumer) it_prev++; } } + + /* Remove next hop group if the reference count decreases to zero */ + for (auto it_nhg = m_bulkNhgReducedRefCnt.begin(); it_nhg != m_bulkNhgReducedRefCnt.end(); it_nhg++) + { + if (m_syncdNextHopGroups[*it_nhg].ref_count == 0) + { + removeNextHopGroup(*it_nhg); + } + } } } @@ -1395,7 +1405,7 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey if (it_route->second.getSize() > 1 && m_syncdNextHopGroups[it_route->second].ref_count == 0) { - removeNextHopGroup(it_route->second); + m_bulkNhgReducedRefCnt.emplace(it_route->second); } SWSS_LOG_INFO("Post set route %s with next hop(s) %s", ipPrefix.to_string().c_str(), nextHops.to_string().c_str()); @@ -1533,15 +1543,12 @@ bool RouteOrch::removeRoutePost(const RouteBulkContext& ctx) /* * Decrease the reference count only when the route is pointing to a next hop. - * Decrease the reference count when the route is pointing to a next hop group, - * and check whether the reference count decreases to zero. If yes, then we need - * to remove the next hop group. */ decreaseNextHopRefCount(it_route->second); if (it_route->second.getSize() > 1 && m_syncdNextHopGroups[it_route->second].ref_count == 0) { - removeNextHopGroup(it_route->second); + m_bulkNhgReducedRefCnt.emplace(it_route->second); } SWSS_LOG_INFO("Remove route %s with next hop(s) %s", diff --git a/orchagent/routeorch.h b/orchagent/routeorch.h index 7c5a6a483f8a..4618b72804fc 100644 --- a/orchagent/routeorch.h +++ b/orchagent/routeorch.h @@ -126,6 +126,8 @@ class RouteOrch : public Orch, public Subject RouteTables m_syncdRoutes; NextHopGroupTable m_syncdNextHopGroups; + std::set m_bulkNhgReducedRefCnt; + NextHopObserverTable m_nextHopObservers; EntityBulker gRouteBulker;