Skip to content

Commit 90d7015

Browse files
[route_check] implement a check for FRR routes not marked offloaded (sonic-net#2531)
* [route_check] implement a check for FRR routes not marked offloaded * Implemented a route_check functioality that will check "show ip route json" output from FRR and will ensure that all routes are marked as offloaded. If some routes are not offloaded for 15 sec, this is considered as an issue and a mitigation logic is invoked.
1 parent c2bc150 commit 90d7015

File tree

4 files changed

+234
-14
lines changed

4 files changed

+234
-14
lines changed

scripts/route_check.py

+108-9
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111
How:
1212
NOTE: The flow from APPL-DB to ASIC-DB takes non zero milliseconds.
1313
1) Initiate subscribe for ASIC-DB updates.
14-
2) Read APPL-DB & ASIC-DB
14+
2) Read APPL-DB & ASIC-DB
1515
3) Get the diff.
16-
4) If any diff,
16+
4) If any diff,
1717
4.1) Collect subscribe messages for a second
18-
4.2) check diff against the subscribe messages
18+
4.2) check diff against the subscribe messages
1919
5) Rule out local interfaces & default routes
2020
6) If still outstanding diffs, report failure.
2121
@@ -29,7 +29,7 @@
2929
down to ensure failure.
3030
Analyze the reported failures to match expected.
3131
You may use the exit code to verify the result as success or not.
32-
32+
3333
3434
3535
"""
@@ -45,6 +45,7 @@
4545
import time
4646
import signal
4747
import traceback
48+
import subprocess
4849

4950
from swsscommon import swsscommon
5051
from utilities_common import chassis
@@ -71,6 +72,9 @@
7172

7273
PRINT_MSG_LEN_MAX = 1000
7374

75+
FRR_CHECK_RETRIES = 3
76+
FRR_WAIT_TIME = 15
77+
7478
class Level(Enum):
7579
ERR = 'ERR'
7680
INFO = 'INFO'
@@ -293,7 +297,7 @@ def get_routes():
293297

294298
def get_route_entries():
295299
"""
296-
helper to read present route entries from ASIC-DB and
300+
helper to read present route entries from ASIC-DB and
297301
as well initiate selector for ASIC-DB:ASIC-state updates.
298302
:return (selector, subscriber, <list of sorted routes>)
299303
"""
@@ -309,14 +313,39 @@ def get_route_entries():
309313
res, e = checkout_rt_entry(k)
310314
if res:
311315
rt.append(e)
312-
316+
313317
print_message(syslog.LOG_DEBUG, json.dumps({"ASIC_ROUTE_ENTRY": sorted(rt)}, indent=4))
314318

315319
selector = swsscommon.Select()
316320
selector.addSelectable(subs)
317321
return (selector, subs, sorted(rt))
318322

319323

324+
def is_suppress_fib_pending_enabled():
325+
"""
326+
Returns True if FIB suppression is enabled, False otherwise
327+
"""
328+
cfg_db = swsscommon.ConfigDBConnector()
329+
cfg_db.connect()
330+
331+
state = cfg_db.get_entry('DEVICE_METADATA', 'localhost').get('suppress-fib-pending')
332+
333+
return state == 'enabled'
334+
335+
336+
def get_frr_routes():
337+
"""
338+
Read routes from zebra through CLI command
339+
:return frr routes dictionary
340+
"""
341+
342+
output = subprocess.check_output('show ip route json', shell=True)
343+
routes = json.loads(output)
344+
output = subprocess.check_output('show ipv6 route json', shell=True)
345+
routes.update(json.loads(output))
346+
return routes
347+
348+
320349
def get_interfaces():
321350
"""
322351
helper to read interface table from APPL-DB.
@@ -354,7 +383,7 @@ def filter_out_local_interfaces(keys):
354383

355384
chassis_local_intfs = chassis.get_chassis_local_interfaces()
356385
local_if_lst.update(set(chassis_local_intfs))
357-
386+
358387
db = swsscommon.DBConnector(APPL_DB_NAME, 0)
359388
tbl = swsscommon.Table(db, 'ROUTE_TABLE')
360389

@@ -493,6 +522,61 @@ def filter_out_standalone_tunnel_routes(routes):
493522
return updated_routes
494523

495524

525+
def check_frr_pending_routes():
526+
"""
527+
Check FRR routes for offload flag presence by executing "show ip route json"
528+
Returns a list of routes that have no offload flag.
529+
"""
530+
531+
missed_rt = []
532+
533+
retries = FRR_CHECK_RETRIES
534+
for i in range(retries):
535+
missed_rt = []
536+
frr_routes = get_frr_routes()
537+
538+
for _, entries in frr_routes.items():
539+
for entry in entries:
540+
if entry['protocol'] != 'bgp':
541+
continue
542+
543+
# TODO: Also handle VRF routes. Currently this script does not check for VRF routes so it would be incorrect for us
544+
# to assume they are installed in ASIC_DB, so we don't handle them.
545+
if entry['vrfName'] != 'default':
546+
continue
547+
548+
if not entry.get('offloaded', False):
549+
missed_rt.append(entry)
550+
551+
if not missed_rt:
552+
break
553+
554+
time.sleep(FRR_WAIT_TIME)
555+
556+
return missed_rt
557+
558+
559+
def mitigate_installed_not_offloaded_frr_routes(missed_frr_rt, rt_appl):
560+
"""
561+
Mitigate installed but not offloaded FRR routes.
562+
563+
In case route exists in APPL_DB, this function will manually send a notification to fpmsyncd
564+
to trigger the flow that sends offload flag to zebra.
565+
566+
It is designed to mitigate a problem when orchagent fails to send notification about installed route to fpmsyncd
567+
or fpmsyncd not being able to read the notification or in case zebra fails to receive offload update due to variety of reasons.
568+
All of the above mentioned cases must be considered as a bug, but even in that case we will report an error in the log but
569+
given that this script ensures the route is installed in the hardware it will automitigate such a bug.
570+
"""
571+
db = swsscommon.DBConnector('APPL_STATE_DB', 0)
572+
response_producer = swsscommon.NotificationProducer(db, f'{APPL_DB_NAME}_{swsscommon.APP_ROUTE_TABLE_NAME}_RESPONSE_CHANNEL')
573+
for entry in [entry for entry in missed_frr_rt if entry['prefix'] in rt_appl]:
574+
fvs = swsscommon.FieldValuePairs([('err_str', 'SWSS_RC_SUCCESS'), ('protocol', entry['protocol'])])
575+
response_producer.send('SWSS_RC_SUCCESS', entry['prefix'], fvs)
576+
577+
print_message(syslog.LOG_ERR, f'Mitigated route {entry["prefix"]}')
578+
579+
496580
def get_soc_ips(config_db):
497581
mux_table = config_db.get_table('MUX_CABLE')
498582
soc_ips = []
@@ -536,7 +620,7 @@ def check_routes():
536620
"""
537621
The heart of this script which runs the checks.
538622
Read APPL-DB & ASIC-DB, the relevant tables for route checking.
539-
Checkout routes in ASIC-DB to match APPL-DB, discounting local &
623+
Checkout routes in ASIC-DB to match APPL-DB, discounting local &
540624
default routes. In case of missed / unexpected entries in ASIC,
541625
it might be due to update latency between APPL & ASIC DBs. So collect
542626
ASIC-DB subscribe updates for a second, and checkout if you see SET
@@ -545,12 +629,16 @@ def check_routes():
545629
If there are still some unjustifiable diffs, between APPL & ASIC DB,
546630
related to routes report failure, else all good.
547631
632+
If there are FRR routes that aren't marked offloaded but all APPL & ASIC DB
633+
routes are in sync report failure and perform a mitigation action.
634+
548635
:return (0, None) on sucess, else (-1, results) where results holds
549636
the unjustifiable entries.
550637
"""
551638
intf_appl_miss = []
552639
rt_appl_miss = []
553640
rt_asic_miss = []
641+
rt_frr_miss = []
554642

555643
results = {}
556644
adds = []
@@ -599,11 +687,22 @@ def check_routes():
599687
if rt_asic_miss:
600688
results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss
601689

690+
rt_frr_miss = check_frr_pending_routes()
691+
692+
if rt_frr_miss:
693+
results["missed_FRR_routes"] = rt_frr_miss
694+
602695
if results:
603696
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
604697
print_message(syslog.LOG_WARNING, "Failed. Look at reported mismatches above")
605698
print_message(syslog.LOG_WARNING, "add: ", json.dumps(adds, indent=4))
606699
print_message(syslog.LOG_WARNING, "del: ", json.dumps(deletes, indent=4))
700+
701+
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
702+
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR but all routes in APPL_DB and ASIC_DB are in sync")
703+
if is_suppress_fib_pending_enabled():
704+
mitigate_installed_not_offloaded_frr_routes(rt_frr_miss, rt_appl)
705+
607706
return -1, results
608707
else:
609708
print_message(syslog.LOG_INFO, "All good!")
@@ -649,7 +748,7 @@ def main():
649748
return ret, res
650749
else:
651750
return ret, res
652-
751+
653752

654753

655754
if __name__ == "__main__":

tests/mock_tables/config_db.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,8 @@
831831
"mac": "1d:34:db:16:a6:00",
832832
"platform": "x86_64-mlnx_msn3800-r0",
833833
"peer_switch": "sonic-switch",
834-
"type": "ToRRouter"
834+
"type": "ToRRouter",
835+
"suppress-fib-pending": "enabled"
835836
},
836837
"SNMP_COMMUNITY|msft": {
837838
"TYPE": "RO"

tests/route_check_test.py

+21-3
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import time
88
from sonic_py_common import device_info
99
from unittest.mock import MagicMock, patch
10-
from tests.route_check_test_data import APPL_DB, ARGS, ASIC_DB, CONFIG_DB, DEFAULT_CONFIG_DB, DESCR, OP_DEL, OP_SET, PRE, RESULT, RET, TEST_DATA, UPD
10+
from tests.route_check_test_data import APPL_DB, ARGS, ASIC_DB, CONFIG_DB, DEFAULT_CONFIG_DB, DESCR, OP_DEL, OP_SET, PRE, RESULT, RET, TEST_DATA, UPD, FRR_ROUTES
1111

1212
import pytest
1313

@@ -239,6 +239,7 @@ def setup(self):
239239

240240
def init(self):
241241
route_check.UNIT_TESTING = 1
242+
route_check.FRR_WAIT_TIME = 0
242243

243244
@pytest.fixture
244245
def force_hang(self):
@@ -258,7 +259,8 @@ def mock_dbs(self):
258259
patch("route_check.swsscommon.Table") as mock_table, \
259260
patch("route_check.swsscommon.Select") as mock_sel, \
260261
patch("route_check.swsscommon.SubscriberStateTable") as mock_subs, \
261-
patch("route_check.swsscommon.ConfigDBConnector", return_value=mock_config_db):
262+
patch("route_check.swsscommon.ConfigDBConnector", return_value=mock_config_db), \
263+
patch("route_check.swsscommon.NotificationProducer"):
262264
device_info.get_platform = MagicMock(return_value='unittest')
263265
set_mock(mock_table, mock_conn, mock_sel, mock_subs, mock_config_db)
264266
yield
@@ -272,7 +274,21 @@ def test_route_check(self, mock_dbs, test_num):
272274
set_test_case_data(ct_data)
273275
logger.info("Running test case {}: {}".format(test_num, ct_data[DESCR]))
274276

275-
with patch('sys.argv', ct_data[ARGS].split()):
277+
with patch('sys.argv', ct_data[ARGS].split()), \
278+
patch('route_check.subprocess.check_output') as mock_check_output:
279+
280+
check_frr_patch = patch('route_check.check_frr_pending_routes', lambda: [])
281+
282+
if FRR_ROUTES in ct_data:
283+
routes = ct_data[FRR_ROUTES]
284+
285+
def side_effect(*args, **kwargs):
286+
return json.dumps(routes)
287+
288+
mock_check_output.side_effect = side_effect
289+
else:
290+
check_frr_patch.start()
291+
276292
ret, res = route_check.main()
277293
expect_ret = ct_data[RET] if RET in ct_data else 0
278294
expect_res = ct_data[RESULT] if RESULT in ct_data else None
@@ -283,6 +299,8 @@ def test_route_check(self, mock_dbs, test_num):
283299
assert ret == expect_ret
284300
assert res == expect_res
285301

302+
check_frr_patch.stop()
303+
286304
def test_timeout(self, mock_dbs, force_hang):
287305
# Test timeout
288306
ex_raised = False

0 commit comments

Comments
 (0)