Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[system-health] Fix error log system_service'state' while doing confi… #11225

Merged
merged 2 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 37 additions & 6 deletions src/system-health/health_checker/sysmonitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import sys
import time
import glob
import multiprocessing
from datetime import datetime
Expand Down Expand Up @@ -145,12 +146,7 @@ def get_all_service_list(self):
dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))]

#add the enabled docker services from config db feature table
feature_table = self.config_db.get_table("FEATURE")
for srv in feature_table.keys():
if feature_table[srv]["state"] not in ["disabled", "always_disabled"]:
srvext = srv + ".service"
if srvext not in dir_list:
dir_list.append(srvext)
self.get_service_from_feature_table(dir_list)

self.config.load_config()
if self.config and self.config.ignore_services:
Expand All @@ -161,6 +157,41 @@ def get_all_service_list(self):
dir_list.sort()
return dir_list

def get_service_from_feature_table(self, dir_list, max_retry=3, retry_delay=1):
    """Append enabled services from the CONFIG DB FEATURE table to ``dir_list``.

    During the "config reload" command, filling the FEATURE table is not an
    atomic operation; sonic-cfggen does it in two steps:
        1. Add an empty table entry to CONFIG DB
        2. Add all fields to the table

    So, if system health reads the DB between step 1 and step 2, it may see
    an entry without a 'state' field. When that happens the table is read
    again after a short delay, up to ``max_retry`` reads in total.

    Args:
        dir_list (list): service list; extended in place with
            "<feature>.service" for every feature whose state is neither
            "disabled" nor "always_disabled" and that is not already listed.
        max_retry (int): maximum number of times the FEATURE table is read.
        retry_delay (int or float): seconds to wait between two reads.
    """
    disabled_states = ("disabled", "always_disabled")
    attempts_left = max_retry
    table_ready = True
    feature_table = {}

    while attempts_left > 0:
        table_ready = True
        feature_table = self.config_db.get_table("FEATURE")
        for srv, fields in feature_table.items():
            if 'state' not in fields:
                # Entry created but not yet populated: this snapshot is unusable.
                table_ready = False
                logger.log_warning("FEATURE table is not fully ready: {}, retrying".format(feature_table))
                break
            if fields["state"] not in disabled_states:
                srvext = srv + ".service"
                if srvext not in dir_list:
                    dir_list.append(srvext)
        if table_ready:
            break
        attempts_left -= 1
        # Only wait when another read will follow; sleeping after the last
        # attempt would just delay the error report and the caller.
        if attempts_left > 0:
            time.sleep(retry_delay)

    if not table_ready:
        logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table))

#Checks the FEATURE table in config db for the service's check_up_status flag.
#If it is marked true, then reads the service up_status from the FEATURE table of state db.
Expand Down
20 changes: 20 additions & 0 deletions src/system-health/tests/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,3 +720,23 @@ def test_system_service():
sysmon.task_run()
assert sysmon._task_process is not None
sysmon.task_stop()


def test_get_service_from_feature_table():
    """Check that enabled features are collected once the FEATURE table is complete.

    The first ``get_table`` call returns entries with no fields; the second
    returns entries carrying a 'state' field.
    """
    half_written = {
        'bgp': {},
        'swss': {}
    }
    complete = {
        'bgp': {'state': 'enabled'},
        'swss': {'state': 'disabled'}
    }

    monitor = Sysmonitor()
    monitor.config_db = MagicMock()
    monitor.config_db.get_table = MagicMock(side_effect=[half_written, complete])

    services = []
    monitor.get_service_from_feature_table(services)

    assert 'bgp.service' in services
    assert 'swss.service' not in services