From fef2d0dffc1fa8bab0fc5cae0e53a8fb65f3e2d8 Mon Sep 17 00:00:00 2001 From: Sudharsan Dhamal Gopalarathnam Date: Wed, 4 Dec 2024 13:49:38 -0800 Subject: [PATCH] [202405][FRR] Fixing FRR to make route node lock (#20990) To fix a statistical issue. The original fix was done in FRRouting/frr#17297. However to accommodate 8.5.4 the patch in the PR was added. [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". Core was generated by `/usr/lib/frr/zebra -A 127.0.0.1 -s 90000000 -M dplane_fpm_nl -M snmp'. Program terminated with signal SIGABRT, Aborted. #0 0x00007fccd7351e2c in ?? () from /lib/x86_64-linux-gnu/libc.so.6 [Current thread is 1 (Thread 0x7fccd6faf7c0 (LWP 36))] (gdb) bt #0 0x00007fccd7351e2c in ?? () from /lib/x86_64-linux-gnu/libc.so.6 #1 0x00007fccd7302fb2 in raise () from /lib/x86_64-linux-gnu/libc.so.6 #2 0x00007fccd72ed472 in abort () from /lib/x86_64-linux-gnu/libc.so.6 #3 0x00007fccd75bb3a9 in _zlog_assert_failed (xref=xref@entry=0x7fccd7652380 <_xref.16>, extra=extra@entry=0x0) at ../lib/zlog.c:678 #4 0x00007fccd759b2fe in route_node_delete (node=) at ../lib/table.c:352 #5 0x00007fccd759b445 in route_unlock_node (node=0x0) at ../lib/table.h:258 #6 route_next (node=) at ../lib/table.c:436 #7 route_next (node=node@entry=0x56029d89e560) at ../lib/table.c:410 #8 0x000056029b6b6b7a in if_lookup_by_name_per_ns (ns=ns@entry=0x56029d873d90, ifname=ifname@entry=0x7fccc0029340 "PortChannel1020") at ../zebra/interface.c:312 #9 0x000056029b6b8b36 in zebra_if_dplane_ifp_handling (ctx=0x7fccc0029310) at ../zebra/interface.c:1867 #10 zebra_if_dplane_result (ctx=0x7fccc0029310) at ../zebra/interface.c:2221 #11 0x000056029b7137a9 in rib_process_dplane_results (thread=) at ../zebra/zebra_rib.c:4810 #12 0x00007fccd75a0e0d in thread_call (thread=thread@entry=0x7ffe8e553cc0) at ../lib/thread.c:1990 #13 0x00007fccd7559368 in frr_run (master=0x56029d65a040) at ../lib/libfrr.c:1198 #14 0x000056029b6ac317 in main (argc=9, argv=0x7ffe8e5540d8) at ../zebra/main.c:478 --- .../0054-make-route-node-lock-atomic.patch | 95 +++++++++++++++++++ src/sonic-frr/patch/series | 1 + 2 files changed, 96 insertions(+) create mode 100644 src/sonic-frr/patch/0054-make-route-node-lock-atomic.patch diff --git a/src/sonic-frr/patch/0054-make-route-node-lock-atomic.patch b/src/sonic-frr/patch/0054-make-route-node-lock-atomic.patch new file mode 100644 index 000000000000..1cb2485804bd --- /dev/null +++ b/src/sonic-frr/patch/0054-make-route-node-lock-atomic.patch @@ -0,0 +1,95 @@ +From 402665769dc5563c246003720e75c3d6244a88eb Mon Sep 17 00:00:00 2001 +From: "Barry Friedman (friedman)" +Date: Wed, 9 Oct 2024 18:01:00 +0000 +Subject: [PATCH 1/2] lib: make route_node lock atomic + +Multiple threads walking a route table perform read-modify-write +operations on the lock variable. This can result in a wrong lock count +which wrongly triggers a route_node_delete(). Making the access to the +lock variable atomic should ensure the lock is correct. + +Signed-off-by: Barry Friedman (friedman) +--- + lib/table.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/table.h b/lib/table.h +index 5dec69ee7e..89993dd279 100644 +--- a/lib/table.h ++++ b/lib/table.h +@@ -129,7 +129,7 @@ struct route_table { + struct route_node *table_rdonly(link[2]); \ + \ + /* Lock of this radix */ \ +- unsigned int table_rdonly(lock); \ ++ _Atomic unsigned int table_rdonly(lock); \ + \ + struct rn_hash_node_item nodehash; \ + /* Each node of route. */ \ +@@ -244,7 +244,7 @@ extern void route_table_iter_cleanup(route_table_iter_t *iter); + /* Lock node. */ + static inline struct route_node *route_lock_node(struct route_node *node) + { +- (*(unsigned *)&node->lock)++; ++ atomic_fetch_add_explicit(&node->lock, 1, memory_order_relaxed); + return node; + } + +@@ -252,7 +252,7 @@ static inline struct route_node *route_lock_node(struct route_node *node) + static inline void route_unlock_node(struct route_node *node) + { + assert(node->lock > 0); +- (*(unsigned *)&node->lock)--; ++ atomic_fetch_sub_explicit(&node->lock, 1, memory_order_relaxed); + + if (node->lock == 0) + route_node_delete(node); +-- +2.43.2 + + +From bf96a5a334ea1e953bf98a92f6a942335f49700b Mon Sep 17 00:00:00 2001 +From: "Barry Friedman (friedman)" +Date: Fri, 11 Oct 2024 22:02:51 +0000 +Subject: [PATCH 2/2] lib: Fix atomic node lock warnings with enable-dev-build + +The route_node lock variable type adds a const qualifier when +--enable-dev-build is configured. This is to generate warnings +in case client code tries to modify this variable. As a result +the functions route_lock_node() and route_unlock_node() need to +cast away the const qualifier to prevent the compile warnings. +When they _Atomic qualifier was added I wrongly removed the +cast. This change adds it back to prevent the warnings. + +Signed-off-by: Barry Friedman (friedman) +--- + lib/table.h | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/lib/table.h b/lib/table.h +index 89993dd279..8023f8aa30 100644 +--- a/lib/table.h ++++ b/lib/table.h +@@ -244,7 +244,8 @@ extern void route_table_iter_cleanup(route_table_iter_t *iter); + /* Lock node. */ + static inline struct route_node *route_lock_node(struct route_node *node) + { +- atomic_fetch_add_explicit(&node->lock, 1, memory_order_relaxed); ++ atomic_fetch_add_explicit((_Atomic unsigned *)&node->lock, 1, ++ memory_order_relaxed); + return node; + } + +@@ -252,7 +253,8 @@ static inline struct route_node *route_lock_node(struct route_node *node) + static inline void route_unlock_node(struct route_node *node) + { + assert(node->lock > 0); +- atomic_fetch_sub_explicit(&node->lock, 1, memory_order_relaxed); ++ atomic_fetch_sub_explicit((_Atomic unsigned *)&node->lock, 1, ++ memory_order_relaxed); + + if (node->lock == 0) + route_node_delete(node); +-- +2.43.2 + diff --git a/src/sonic-frr/patch/series b/src/sonic-frr/patch/series index 980e8f42d6d7..8f9e09f4a86b 100644 --- a/src/sonic-frr/patch/series +++ b/src/sonic-frr/patch/series @@ -51,3 +51,4 @@ 0051-bgpd-backpressure-fix-ret-value-evpn_route_select_in.patch 0052-bgpd-backpressure-log-error-for-evpn-when-route-inst.patch 0053-Patch-to-send-tag-value-associated-with-route-via-ne.patch +0054-make-route-node-lock-atomic.patch