forked from sonic-net/sonic-buildimage
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FRR] Zebra BGP enhancements to better handle memory during route chu…
…rns (sonic-net#19717) Added the below patches which are part of BGP Zebra back pressure feature required to keep the memory usage in check during route churns How I did it New patches that were added: Patch FRR Pull request 0030-zebra-backpressure-Zebra-push-back-on-Buffer-Stream-.patch FRRouting/frr#15411 0031-bgpd-backpressure-Add-a-typesafe-list-for-Zebra-Anno.patch FRRouting/frr#15524 0032-bgpd-fix-flushing-ipv6-flowspec-entries-when-peering.patch FRRouting/frr#15326 0033-bgpd-backpressure-cleanup-bgp_zebra_XX-func-args.patch FRRouting/frr#15524 0034-gpd-backpressure-Handle-BGP-Zebra-Install-evt-Creat.patch FRRouting/frr#15524 0035-bgpd-backpressure-Handle-BGP-Zebra-EPVN-Install-evt-.patch FRRouting/frr#15624 0036-zebra-backpressure-Fix-Null-ptr-access-Coverity-Issu.patch FRRouting/frr#15728 0037-bgpd-Increase-install-uninstall-speed-of-evpn-vpn-vn.patch FRRouting/frr#15727 0038-zebra-Actually-display-I-O-buffer-sizes.patch FRRouting/frr#15708 0039-zebra-Actually-display-I-O-buffer-sizes-part-2.patch FRRouting/frr#15769 0040-bgpd-backpressure-Fix-to-withdraw-evpn-type-5-routes.patch FRRouting/frr#16034 0041-bgpd-backpressure-Fix-to-avoid-CPU-hog.patch FRRouting/frr#16035 0042-zebra-Use-built-in-data-structure-counter.patch FRRouting/frr#16221 0043-zebra-Use-the-ctx-queue-counters.patch FRRouting/frr#16220 0044-zebra-Modify-dplane-loop-to-allow-backpressure-to-fi.patch FRRouting/frr#16220 0045-zebra-Limit-queue-depth-in-dplane_fpm_nl.patch FRRouting/frr#16220 0046-zebra-Modify-show-zebra-dplane-providers-to-give-mor.patch FRRouting/frr#16220 0047-bgpd-backpressure-fix-evpn-route-sync-to-zebra.patch FRRouting/frr#16234 0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch FRRouting/frr#16368 0049-bgpd-backpressure-Improve-debuggability.patch FRRouting/frr#16368 0050-bgpd-backpressure-Avoid-use-after-free.patch FRRouting/frr#16437 0051-bgpd-backpressure-fix-ret-value-evpn_route_select_in.patch FRRouting/frr#16416 0052-bgpd-backpressure-log-error-for-evpn-when-route-inst.patch FRRouting/frr#16416
- Loading branch information
1 parent
57cfb86
commit 38aa337
Showing
24 changed files
with
3,513 additions
and
0 deletions.
There are no files selected for viewing
174 changes: 174 additions & 0 deletions
174
src/sonic-frr/patch/0030-zebra-backpressure-Zebra-push-back-on-Buffer-Stream-.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
From 7c711ff437985b23a4dd919a98b22b8ea14ef553 Mon Sep 17 00:00:00 2001 | ||
From: Rajasekar Raja <rajasekarr@nvidia.com> | ||
Date: Mon, 12 Feb 2024 10:44:18 -0800 | ||
Subject: [PATCH 02/11] zebra: backpressure - Zebra push back on Buffer/Stream | ||
creation | ||
|
||
Currently, the way zebra works is it creates pthread per client (BGP is | ||
of interest in this case) and this thread loops itself in zserv_read() | ||
to check for any incoming data. If there is one, then it reads, | ||
validates and adds it in the ibuf_fifo signalling the main thread to | ||
process the message. The main thread when it gets a change, processes | ||
the message, and invokes the function pointer registered in the header | ||
command. (Ex: zserv_handlers). | ||
|
||
Finally, if all of this was successful, this task reschedules itself and | ||
loops in zserv_read() again | ||
|
||
However, if there are already items on ibuf FIFO, that means zebra is | ||
slow in processing. And with the current mechanism if Zebra main is | ||
busy, the ibuf FIFO keeps growing holding up the memory. | ||
|
||
Show memory zebra:(Example: 15k streams hoarding ~160 MB of data) | ||
--- qmem libfrr --- | ||
Stream : 44 variable 3432352 15042 161243800 | ||
|
||
Fix: | ||
Client IO Thread: (zserv_read) | ||
- Stop doing the read events when we know there are X number of items | ||
on the FIFO already.(X - zebra zapi-packets <1-10000> (Default-1000) | ||
|
||
- Determine the number of items on the zserv->ibuf_fifo. Subtract this | ||
from the work items and only pull the number of items off that would | ||
take us to X items on the ibuf_fifo again. | ||
|
||
- If the number of items in the ibuf_fifo has reached to the maximum | ||
* Either initially when zserv_read() is called (or) | ||
* when processing the remainders of the incoming buffer | ||
the client IO thread is woken by the the zebra main. | ||
|
||
Main thread: (zserv_process_message) | ||
If the client ibuf always schedules a wakeup to the client IO to read | ||
more items from the socked buffer. This way we ensure | ||
- Client IO thread always tries to read the socket buffer and add more | ||
items to the ibuf_fifo (until max limit) | ||
- hidden config change (zebra zapi-packets <>) is taken into account | ||
|
||
Ticket: #3390099 | ||
|
||
Signed-off-by: Donald Sharp <sharpd@nvidia.com> | ||
Signed-off-by: Rajasekar Raja <rajasekarr@nvidia.com> | ||
|
||
diff --git a/zebra/zserv.c b/zebra/zserv.c | ||
index 2024f34534..de6e404fc4 100644 | ||
--- a/zebra/zserv.c | ||
+++ b/zebra/zserv.c | ||
@@ -318,6 +318,14 @@ zwrite_fail: | ||
* this task reschedules itself. | ||
* | ||
* Any failure in any of these actions is handled by terminating the client. | ||
+ * | ||
+ * The client's input buffer ibuf_fifo can have a maximum items as configured | ||
+ * in the packets_to_process. This way we are not filling up the FIFO more | ||
+ * than the maximum when the zebra main is busy. If the fifo has space, we | ||
+ * reschedule ourselves to read more. | ||
+ * | ||
+ * The main thread processes the items in ibuf_fifo and always signals the | ||
+ * client IO thread. | ||
*/ | ||
static void zserv_read(struct thread *thread) | ||
{ | ||
@@ -325,15 +333,25 @@ static void zserv_read(struct thread *thread) | ||
int sock; | ||
size_t already; | ||
struct stream_fifo *cache; | ||
- uint32_t p2p_orig; | ||
- | ||
- uint32_t p2p; | ||
+ uint32_t p2p; /* Temp p2p used to process */ | ||
+ uint32_t p2p_orig; /* Configured p2p (Default-1000) */ | ||
+ int p2p_avail; /* How much space is available for p2p */ | ||
struct zmsghdr hdr; | ||
+ size_t client_ibuf_fifo_cnt = stream_fifo_count_safe(client->ibuf_fifo); | ||
|
||
p2p_orig = atomic_load_explicit(&zrouter.packets_to_process, | ||
memory_order_relaxed); | ||
+ p2p_avail = p2p_orig - client_ibuf_fifo_cnt; | ||
+ | ||
+ /* | ||
+ * Do nothing if ibuf_fifo count has reached its max limit. Otherwise | ||
+ * proceed and reschedule ourselves if there is space in the ibuf_fifo. | ||
+ */ | ||
+ if (p2p_avail <= 0) | ||
+ return; | ||
+ | ||
+ p2p = p2p_avail; | ||
cache = stream_fifo_new(); | ||
- p2p = p2p_orig; | ||
sock = THREAD_FD(thread); | ||
|
||
while (p2p) { | ||
@@ -433,7 +451,7 @@ static void zserv_read(struct thread *thread) | ||
p2p--; | ||
} | ||
|
||
- if (p2p < p2p_orig) { | ||
+ if (p2p < (uint32_t)p2p_avail) { | ||
uint64_t time_now = monotime(NULL); | ||
|
||
/* update session statistics */ | ||
@@ -447,19 +465,23 @@ static void zserv_read(struct thread *thread) | ||
while (cache->head) | ||
stream_fifo_push(client->ibuf_fifo, | ||
stream_fifo_pop(cache)); | ||
+ /* Need to update count as main thread could have processed few */ | ||
+ client_ibuf_fifo_cnt = | ||
+ stream_fifo_count_safe(client->ibuf_fifo); | ||
} | ||
|
||
/* Schedule job to process those packets */ | ||
zserv_event(client, ZSERV_PROCESS_MESSAGES); | ||
- | ||
} | ||
|
||
if (IS_ZEBRA_DEBUG_PACKET) | ||
- zlog_debug("Read %d packets from client: %s", p2p_orig - p2p, | ||
- zebra_route_string(client->proto)); | ||
+ zlog_debug("Read %d packets from client: %s. Current ibuf fifo count: %zu. Conf P2p %d", | ||
+ p2p_avail - p2p, zebra_route_string(client->proto), | ||
+ client_ibuf_fifo_cnt, p2p_orig); | ||
|
||
- /* Reschedule ourselves */ | ||
- zserv_client_event(client, ZSERV_CLIENT_READ); | ||
+ /* Reschedule ourselves since we have space in ibuf_fifo */ | ||
+ if (client_ibuf_fifo_cnt < p2p_orig) | ||
+ zserv_client_event(client, ZSERV_CLIENT_READ); | ||
|
||
stream_fifo_free(cache); | ||
|
||
@@ -495,14 +517,20 @@ static void zserv_client_event(struct zserv *client, | ||
* as the task argument. | ||
* | ||
* Each message is popped off the client's input queue and the action associated | ||
- * with the message is executed. This proceeds until there are no more messages, | ||
- * an error occurs, or the processing limit is reached. | ||
+ * with the message is executed. This proceeds until an error occurs, or the | ||
+ * processing limit is reached. | ||
* | ||
* The client's I/O thread can push at most zrouter.packets_to_process messages | ||
* onto the input buffer before notifying us there are packets to read. As long | ||
* as we always process zrouter.packets_to_process messages here, then we can | ||
* rely on the read thread to handle queuing this task enough times to process | ||
* everything on the input queue. | ||
+ * | ||
+ * If the client ibuf always schedules a wakeup to the client IO to read more | ||
+ * items from the socked buffer. This way we ensure | ||
+ * - Client IO thread always tries to read the socket buffer and add more | ||
+ * items to the ibuf_fifo (until max limit) | ||
+ * - the hidden config change (zebra zapi-packets <>) is taken into account. | ||
*/ | ||
static void zserv_process_messages(struct thread *thread) | ||
{ | ||
@@ -538,6 +566,9 @@ static void zserv_process_messages(struct thread *thread) | ||
/* Reschedule ourselves if necessary */ | ||
if (need_resched) | ||
zserv_event(client, ZSERV_PROCESS_MESSAGES); | ||
+ | ||
+ /* Ensure to include the read socket in the select/poll/etc.. */ | ||
+ zserv_client_event(client, ZSERV_CLIENT_READ); | ||
} | ||
|
||
int zserv_send_message(struct zserv *client, struct stream *msg) | ||
-- | ||
2.17.1 | ||
|
90 changes: 90 additions & 0 deletions
90
src/sonic-frr/patch/0031-bgpd-backpressure-Add-a-typesafe-list-for-Zebra-Anno.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
From 7796ce2bb6eb1650ae1bec41ab2f270807b33c62 Mon Sep 17 00:00:00 2001 | ||
From: Donald Sharp <sharpd@nvidia.com> | ||
Date: Thu, 25 Jan 2024 12:53:24 -0500 | ||
Subject: [PATCH 03/11] bgpd: backpressure - Add a typesafe list for Zebra | ||
Announcement | ||
|
||
Modify the bgp master to hold a type safe list for bgp_dests that need | ||
to be passed to zebra. | ||
|
||
Future commits will use this. | ||
|
||
Ticket: #3390099 | ||
|
||
Signed-off-by: Donald Sharp <sharpd@nvidia.com> | ||
Signed-off-by: Rajasekar Raja <rajasekarr@nvidia.com> | ||
|
||
diff --git a/bgpd/bgp_main.c b/bgpd/bgp_main.c | ||
index 90ae580bab..e28dde5d16 100644 | ||
--- a/bgpd/bgp_main.c | ||
+++ b/bgpd/bgp_main.c | ||
@@ -214,6 +214,8 @@ static __attribute__((__noreturn__)) void bgp_exit(int status) | ||
bgp_evpn_mh_finish(); | ||
bgp_l3nhg_finish(); | ||
|
||
+ zebra_announce_fini(&bm->zebra_announce_head); | ||
+ | ||
/* reverse bgp_dump_init */ | ||
bgp_dump_finish(); | ||
|
||
diff --git a/bgpd/bgp_table.h b/bgpd/bgp_table.h | ||
index 121afc481f..d43bf86eb9 100644 | ||
--- a/bgpd/bgp_table.h | ||
+++ b/bgpd/bgp_table.h | ||
@@ -101,6 +101,8 @@ struct bgp_node { | ||
|
||
STAILQ_ENTRY(bgp_dest) pq; | ||
|
||
+ struct zebra_announce_item zai; | ||
+ | ||
uint64_t version; | ||
|
||
mpls_label_t local_label; | ||
@@ -121,6 +123,8 @@ struct bgp_node { | ||
enum bgp_path_selection_reason reason; | ||
}; | ||
|
||
+DECLARE_LIST(zebra_announce, struct bgp_dest, zai); | ||
+ | ||
extern void bgp_delete_listnode(struct bgp_dest *dest); | ||
/* | ||
* bgp_table_iter_t | ||
diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c | ||
index 023047050b..392423e028 100644 | ||
--- a/bgpd/bgpd.c | ||
+++ b/bgpd/bgpd.c | ||
@@ -8017,6 +8017,8 @@ void bgp_master_init(struct thread_master *master, const int buffer_size, | ||
memset(&bgp_master, 0, sizeof(bgp_master)); | ||
|
||
bm = &bgp_master; | ||
+ | ||
+ zebra_announce_init(&bm->zebra_announce_head); | ||
bm->bgp = list_new(); | ||
bm->listen_sockets = list_new(); | ||
bm->port = BGP_PORT_DEFAULT; | ||
diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h | ||
index 72b5b50fb4..55f53bf9d3 100644 | ||
--- a/bgpd/bgpd.h | ||
+++ b/bgpd/bgpd.h | ||
@@ -32,6 +32,8 @@ | ||
#include "srv6.h" | ||
#include "iana_afi.h" | ||
|
||
+PREDECL_LIST(zebra_announce); | ||
+ | ||
/* For union sockunion. */ | ||
#include "queue.h" | ||
#include "sockunion.h" | ||
@@ -180,6 +182,9 @@ struct bgp_master { | ||
uint32_t inq_limit; | ||
uint32_t outq_limit; | ||
|
||
+ /* To preserve ordering of installations into zebra across all Vrfs */ | ||
+ struct zebra_announce_head zebra_announce_head; | ||
+ | ||
QOBJ_FIELDS; | ||
}; | ||
DECLARE_QOBJ_TYPE(bgp_master); | ||
-- | ||
2.17.1 | ||
|
Oops, something went wrong.