11
11
How:
12
12
NOTE: The flow from APPL-DB to ASIC-DB takes a non-zero number of milliseconds.
13
13
1) Initiate subscribe for ASIC-DB updates.
14
- 2) Read APPL-DB & ASIC-DB
14
+ 2) Read APPL-DB & ASIC-DB
15
15
3) Get the diff.
16
- 4) If any diff,
16
+ 4) If any diff,
17
17
4.1) Collect subscribe messages for a second
18
- 4.2) check diff against the subscribe messages
18
+ 4.2) check diff against the subscribe messages
19
19
5) Rule out local interfaces & default routes
20
20
6) If still outstanding diffs, report failure.
21
21
29
29
down to ensure failure.
30
30
Analyze the reported failures to match expected.
31
31
You may use the exit code to verify the result as success or not.
32
-
32
+
33
33
34
34
35
35
"""
45
45
import time
46
46
import signal
47
47
import traceback
48
+ import subprocess
48
49
49
50
from swsscommon import swsscommon
50
51
from utilities_common import chassis
71
72
72
73
PRINT_MSG_LEN_MAX = 1000
73
74
75
+ FRR_CHECK_RETRIES = 3
76
+ FRR_WAIT_TIME = 15
77
+
74
78
class Level (Enum ):
75
79
ERR = 'ERR'
76
80
INFO = 'INFO'
@@ -293,7 +297,7 @@ def get_routes():
293
297
294
298
def get_route_entries ():
295
299
"""
296
- helper to read present route entries from ASIC-DB and
300
+ helper to read present route entries from ASIC-DB and
297
301
as well initiate selector for ASIC-DB:ASIC-state updates.
298
302
:return (selector, subscriber, <list of sorted routes>)
299
303
"""
@@ -309,14 +313,39 @@ def get_route_entries():
309
313
res , e = checkout_rt_entry (k )
310
314
if res :
311
315
rt .append (e )
312
-
316
+
313
317
print_message (syslog .LOG_DEBUG , json .dumps ({"ASIC_ROUTE_ENTRY" : sorted (rt )}, indent = 4 ))
314
318
315
319
selector = swsscommon .Select ()
316
320
selector .addSelectable (subs )
317
321
return (selector , subs , sorted (rt ))
318
322
319
323
324
def is_suppress_fib_pending_enabled():
    """
    Tell whether FIB suppression is turned on for this device.

    Opens a ConfigDBConnector, reads the 'localhost' entry of the
    DEVICE_METADATA table and inspects its 'suppress-fib-pending' field.

    :return True only when the field is present and equal to 'enabled',
            False in every other case (missing field or any other value).
    """
    config_db = swsscommon.ConfigDBConnector()
    config_db.connect()

    # A missing field yields None from .get(), which compares unequal below.
    localhost_metadata = config_db.get_entry('DEVICE_METADATA', 'localhost')
    return localhost_metadata.get('suppress-fib-pending') == 'enabled'
334
+
335
+
336
def get_frr_routes():
    """
    Read IPv4 and IPv6 routes from zebra through the 'show' CLI command.

    Runs "show ip route json" followed by "show ipv6 route json" and merges
    both JSON objects into one dictionary (IPv6 keys are added on top of the
    IPv4 ones).

    :return frr routes dictionary, as parsed from the JSON output
    :raises subprocess.CalledProcessError if a command exits non-zero
    :raises json.JSONDecodeError if a command prints invalid JSON
    """
    # Commands are passed as argv lists so that no shell is spawned to
    # interpret the command line.
    commands = (
        ['show', 'ip', 'route', 'json'],
        ['show', 'ipv6', 'route', 'json'],
    )

    routes = {}
    for command in commands:
        output = subprocess.check_output(command)
        routes.update(json.loads(output))
    return routes
347
+
348
+
320
349
def get_interfaces ():
321
350
"""
322
351
helper to read interface table from APPL-DB.
@@ -354,7 +383,7 @@ def filter_out_local_interfaces(keys):
354
383
355
384
chassis_local_intfs = chassis .get_chassis_local_interfaces ()
356
385
local_if_lst .update (set (chassis_local_intfs ))
357
-
386
+
358
387
db = swsscommon .DBConnector (APPL_DB_NAME , 0 )
359
388
tbl = swsscommon .Table (db , 'ROUTE_TABLE' )
360
389
@@ -493,6 +522,61 @@ def filter_out_standalone_tunnel_routes(routes):
493
522
return updated_routes
494
523
495
524
525
def check_frr_pending_routes():
    """
    Check FRR routes for offload flag presence by executing "show ip route json"
    (through get_frr_routes()).

    Only BGP routes in the 'default' VRF are examined. The check is repeated up
    to FRR_CHECK_RETRIES times, sleeping FRR_WAIT_TIME seconds between two
    consecutive attempts, and stops early as soon as no route is missing the
    flag.

    :return list of route entries from the last attempt that have no
            (or a false) 'offloaded' flag; empty list when all are offloaded.
    """
    missed_rt = []

    for attempt in range(FRR_CHECK_RETRIES):
        missed_rt = []
        frr_routes = get_frr_routes()

        for entries in frr_routes.values():
            for entry in entries:
                if entry['protocol'] != 'bgp':
                    continue

                # TODO: Also handle VRF routes. Currently this script does not check for VRF routes so it would be incorrect for us
                # to assume they are installed in ASIC_DB, so we don't handle them.
                if entry['vrfName'] != 'default':
                    continue

                if not entry.get('offloaded', False):
                    missed_rt.append(entry)

        if not missed_rt:
            break

        # Wait only when another attempt follows; sleeping after the final
        # attempt would just delay reporting the failure.
        if attempt < FRR_CHECK_RETRIES - 1:
            time.sleep(FRR_WAIT_TIME)

    return missed_rt
557
+
558
+
559
def mitigate_installed_not_offloaded_frr_routes(missed_frr_rt, rt_appl):
    """
    Mitigate installed but not offloaded FRR routes.

    In case route exists in APPL_DB, this function will manually send a notification to fpmsyncd
    to trigger the flow that sends offload flag to zebra.

    It is designed to mitigate a problem when orchagent fails to send notification about installed route to fpmsyncd
    or fpmsyncd not being able to read the notification or in case zebra fails to receive offload update due to variety of reasons.
    All of the above mentioned cases must be considered as a bug, but even in that case we will report an error in the log but
    given that this script ensures the route is installed in the hardware it will auto-mitigate such a bug.

    :param missed_frr_rt: iterable of FRR route entries; each entry is indexed
                          by 'prefix' and 'protocol'.
    :param rt_appl: container of route prefixes used for membership tests
                    (presumably the routes read from APPL_DB - confirm with caller).
    :return None
    """
    db = swsscommon.DBConnector('APPL_STATE_DB', 0)

    # The channel name is built from the two identifiers joined by '_' only;
    # any whitespace inside it would yield a channel nobody listens on.
    channel = f'{APPL_DB_NAME}_{swsscommon.APP_ROUTE_TABLE_NAME}_RESPONSE_CHANNEL'
    response_producer = swsscommon.NotificationProducer(db, channel)

    # Only routes that are also known to rt_appl are eligible for mitigation.
    eligible = [entry for entry in missed_frr_rt if entry['prefix'] in rt_appl]
    for entry in eligible:
        prefix = entry['prefix']
        fvs = swsscommon.FieldValuePairs([('err_str', 'SWSS_RC_SUCCESS'), ('protocol', entry['protocol'])])
        response_producer.send('SWSS_RC_SUCCESS', prefix, fvs)

        print_message(syslog.LOG_ERR, f'Mitigated route {prefix} ')
578
+
579
+
496
580
def get_soc_ips (config_db ):
497
581
mux_table = config_db .get_table ('MUX_CABLE' )
498
582
soc_ips = []
@@ -536,7 +620,7 @@ def check_routes():
536
620
"""
537
621
The heart of this script which runs the checks.
538
622
Read APPL-DB & ASIC-DB, the relevant tables for route checking.
539
- Checkout routes in ASIC-DB to match APPL-DB, discounting local &
623
+ Checkout routes in ASIC-DB to match APPL-DB, discounting local &
540
624
default routes. In case of missed / unexpected entries in ASIC,
541
625
it might be due to update latency between APPL & ASIC DBs. So collect
542
626
ASIC-DB subscribe updates for a second, and checkout if you see SET
@@ -545,12 +629,16 @@ def check_routes():
545
629
If there are still some unjustifiable diffs, between APPL & ASIC DB,
546
630
related to routes report failure, else all good.
547
631
632
+ If there are FRR routes that aren't marked offloaded but all APPL & ASIC DB
633
+ routes are in sync report failure and perform a mitigation action.
634
+
548
635
:return (0, None) on success, else (-1, results) where results holds
549
636
the unjustifiable entries.
550
637
"""
551
638
intf_appl_miss = []
552
639
rt_appl_miss = []
553
640
rt_asic_miss = []
641
+ rt_frr_miss = []
554
642
555
643
results = {}
556
644
adds = []
@@ -599,11 +687,22 @@ def check_routes():
599
687
if rt_asic_miss :
600
688
results ["Unaccounted_ROUTE_ENTRY_TABLE_entries" ] = rt_asic_miss
601
689
690
+ rt_frr_miss = check_frr_pending_routes ()
691
+
692
+ if rt_frr_miss :
693
+ results ["missed_FRR_routes" ] = rt_frr_miss
694
+
602
695
if results :
603
696
print_message (syslog .LOG_WARNING , "Failure results: {" , json .dumps (results , indent = 4 ), "}" )
604
697
print_message (syslog .LOG_WARNING , "Failed. Look at reported mismatches above" )
605
698
print_message (syslog .LOG_WARNING , "add: " , json .dumps (adds , indent = 4 ))
606
699
print_message (syslog .LOG_WARNING , "del: " , json .dumps (deletes , indent = 4 ))
700
+
701
+ if rt_frr_miss and not rt_appl_miss and not rt_asic_miss :
702
+ print_message (syslog .LOG_ERR , "Some routes are not set offloaded in FRR but all routes in APPL_DB and ASIC_DB are in sync" )
703
+ if is_suppress_fib_pending_enabled ():
704
+ mitigate_installed_not_offloaded_frr_routes (rt_frr_miss , rt_appl )
705
+
607
706
return - 1 , results
608
707
else :
609
708
print_message (syslog .LOG_INFO , "All good!" )
@@ -649,7 +748,7 @@ def main():
649
748
return ret , res
650
749
else :
651
750
return ret , res
652
-
751
+
653
752
654
753
655
754
if __name__ == "__main__" :
0 commit comments