From 426c6cf1ef7389a4a2a1e2b7ffa37e4cb7662304 Mon Sep 17 00:00:00 2001 From: sg893052 Date: Wed, 6 Apr 2022 00:56:16 -0700 Subject: [PATCH 1/2] sysready Bring in all global functions under Sysmonitor class Add unit test code coverage for sysmonitor code Formatted the code --- files/build_templates/init_cfg.json.j2 | 6 + .../build_templates/sonic_debian_extension.j2 | 3 + ...rvices-data.determine-reboot-cause.service | 1 + .../yang-models/sonic-feature.yang | 7 + .../health_checker/sysmonitor.py | 437 ++++++++++++++++++ src/system-health/scripts/healthd | 5 + src/system-health/tests/test_system_health.py | 211 +++++++++ 7 files changed, 670 insertions(+) create mode 100755 src/system-health/health_checker/sysmonitor.py diff --git a/files/build_templates/init_cfg.json.j2 b/files/build_templates/init_cfg.json.j2 index e1320214e02c..5bbb8c11732c 100644 --- a/files/build_templates/init_cfg.json.j2 +++ b/files/build_templates/init_cfg.json.j2 @@ -56,6 +56,12 @@ "has_global_scope": {% if feature + '.service' in installer_services.split(' ') %}true{% else %}false{% endif %}, "has_per_asic_scope": {% if feature + '@.service' in installer_services.split(' ') %}true{% else %}false{% endif %}, "auto_restart": "{{autorestart}}", +{# Set check_up_status to true here when app readiness will be marked in state db #} +{# For now, to support the infrastructure, setting the check_up_status to false for bgp,swss,pmon #} +{# Once apps like bgp,syncd support app readiness, then bgp,syncd can set check_up_status to true #} +{%- if feature in ["bgp", "swss", "pmon"] %} + "check_up_status" : "false", +{%- endif %} {%- if include_kubernetes == "y" %} {%- if feature in ["lldp", "pmon", "radv", "snmp", "telemetry"] %} "set_owner": "kube", {% else %} diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index a49e29406eeb..ee34fd586a4e 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ 
b/files/build_templates/sonic_debian_extension.j2 @@ -864,3 +864,6 @@ sudo cp $BUILD_SCRIPTS_DIR/mask_disabled_services.py $FILESYSTEM_ROOT/tmp/ sudo chmod a+x $FILESYSTEM_ROOT/tmp/mask_disabled_services.py sudo LANG=C chroot $FILESYSTEM_ROOT /tmp/mask_disabled_services.py sudo rm -rf $FILESYSTEM_ROOT/tmp/mask_disabled_services.py + + +sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install python3-dbus diff --git a/src/sonic-host-services-data/debian/sonic-host-services-data.determine-reboot-cause.service b/src/sonic-host-services-data/debian/sonic-host-services-data.determine-reboot-cause.service index f0d9e91fe991..50d79b3e7639 100644 --- a/src/sonic-host-services-data/debian/sonic-host-services-data.determine-reboot-cause.service +++ b/src/sonic-host-services-data/debian/sonic-host-services-data.determine-reboot-cause.service @@ -5,6 +5,7 @@ After=rc-local.service database.service [Service] Type=simple +RemainAfterExit=yes ExecStart=/usr/local/bin/determine-reboot-cause [Install] diff --git a/src/sonic-yang-models/yang-models/sonic-feature.yang b/src/sonic-yang-models/yang-models/sonic-feature.yang index be46bef19853..54133ef10612 100644 --- a/src/sonic-yang-models/yang-models/sonic-feature.yang +++ b/src/sonic-yang-models/yang-models/sonic-feature.yang @@ -86,6 +86,13 @@ module sonic-feature{ type feature-owner; default "local"; } + + leaf check_up_status { + description "This configuration controls the system ready tool to check + the app ready/up status"; + type boolean; + default false; + } } } } diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py new file mode 100755 index 000000000000..0d4479f1fc02 --- /dev/null +++ b/src/system-health/health_checker/sysmonitor.py @@ -0,0 +1,437 @@ +#!/usr/bin/python3 + +import os +import sys +import glob +import multiprocessing +from datetime import datetime +from swsscommon import swsscommon +from sonic_py_common.logger import Logger 
+from . import utils +from sonic_py_common.task_base import ProcessTaskBase +from .config import Config + +SYSLOG_IDENTIFIER = "system#monitor" +REDIS_TIMEOUT_MS = 0 +system_allsrv_state = "DOWN" +spl_srv_list = ['database-chassis', 'gbsyncd'] +SELECT_TIMEOUT_MSECS = 1000 +QUEUE_TIMEOUT = 15 +TASK_STOP_TIMEOUT = 10 +mpmgr = multiprocessing.Manager() +logger = Logger(log_identifier=SYSLOG_IDENTIFIER) + + +#Subprocess which subscribes to STATE_DB FEATURE table for any update +#and push service events to main process via queue +class MonitorStateDbTask(ProcessTaskBase): + + def __init__(self,myQ): + ProcessTaskBase.__init__(self) + self.task_queue = myQ + + def subscribe_statedb(self): + state_db = swsscommon.DBConnector("STATE_DB", REDIS_TIMEOUT_MS, True) + sel = swsscommon.Select() + cst = swsscommon.SubscriberStateTable(state_db, "FEATURE") + sel.addSelectable(cst) + + while not self.task_stopping_event.is_set(): + (state, c) = sel.select(SELECT_TIMEOUT_MSECS) + if state == swsscommon.Select.TIMEOUT: + continue + if state != swsscommon.Select.OBJECT: + logger.log_warning("sel.select() did not return swsscommon.Select.OBJECT") + continue + (key, op, cfvs) = cst.pop() + key_ext = key+".service" + timestamp = "{}".format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")) + msg={"unit": key_ext,"evt_src":"feature","time":timestamp} + self.task_notify(msg) + + + def task_worker(self): + if self.task_stopping_event.is_set(): + return + try: + self.subscribe_statedb() + except Exception as e: + logger.log_error("subscribe_statedb exited- {}".format(str(e))) + + def task_notify(self, msg): + if self.task_stopping_event.is_set(): + return + self.task_queue.put(msg) + + +#Subprocess which subscribes to system dbus to listen for systemd events +#and push service events to main process via queue +class MonitorSystemBusTask(ProcessTaskBase): + + def __init__(self,myQ): + ProcessTaskBase.__init__(self) + self.task_queue = myQ + + def on_job_removed(self, id, job, unit, result): + 
if result == "done": + timestamp = "{}".format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")) + msg={"unit": unit,"evt_src":"sysbus","time":timestamp} + self.task_notify(msg) + return + + #Function for listening the systemd event on dbus + def subscribe_sysbus(self): + import dbus + from gi.repository import GLib + from dbus.mainloop.glib import DBusGMainLoop + + DBusGMainLoop(set_as_default=True) + bus = dbus.SystemBus() + systemd = bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1') + manager = dbus.Interface(systemd, 'org.freedesktop.systemd1.Manager') + manager.Subscribe() + manager.connect_to_signal('JobRemoved', self.on_job_removed) + + loop = GLib.MainLoop() + loop.run() + + def task_worker(self): + if self.task_stopping_event.is_set(): + return + logger.log_info("Start Listening to systemd bus (pid {0})".format(os.getpid())) + self.subscribe_sysbus() + + def task_notify(self, msg): + if self.task_stopping_event.is_set(): + return + self.task_queue.put(msg) + +#Mainprocess which launches 2 subtasks - systembus task and statedb task +#and on receiving events, checks and updates the system ready status to state db +class Sysmonitor(ProcessTaskBase): + + def __init__(self): + ProcessTaskBase.__init__(self) + self._stop_timeout_secs = TASK_STOP_TIMEOUT + self.dnsrvs_name = set() + self.state_db = None + self.config_db = None + self.config = Config() + + #Sets system ready status to state db + def post_system_status(self, state): + try: + if not self.state_db: + self.state_db = swsscommon.SonicV2Connector(host='127.0.0.1') + self.state_db.connect(self.state_db.STATE_DB) + + self.state_db.set(self.state_db.STATE_DB, "SYSTEM_READY|SYSTEM_STATE", "Status", state) + logger.log_info("Posting system ready status {} to statedb".format(state)) + + except Exception as e: + logger.log_error("Unable to post system ready status: {}".format(str(e))) + + #Forms the service list to be monitored + def get_all_service_list(self): + + if not self.config_db: + 
self.config_db = swsscommon.ConfigDBConnector() + self.config_db.connect() + + dir_list = [] + #add the services from the below targets + targets= ["/etc/systemd/system/multi-user.target.wants", "/etc/systemd/system/sonic.target.wants"] + for path in targets: + dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))] + + #add the enabled docker services from config db feature table + feature_table = self.config_db.get_table("FEATURE") + for srv in feature_table.keys(): + if feature_table[srv]["state"] not in ["disabled", "always_disabled"]: + srvext=srv+".service" + if srvext not in dir_list: + dir_list.append(srvext) + + self.config.load_config() + if self.config and self.config.ignore_services: + for srv in self.config.ignore_services: + if srv in dir_list: + dir_list.remove(srv) + + dir_list.sort() + return dir_list + + + #Checks FEATURE table from config db for the service's check_up_status flag + #if marked to true, then read the service up_status from FEATURE table of state db. 
+ #else, just return Up + def get_app_ready_status(self, service): + if not self.state_db: + self.state_db = swsscommon.SonicV2Connector(host='127.0.0.1') + self.state_db.connect(self.state_db.STATE_DB) + if not self.config_db: + self.config_db = swsscommon.ConfigDBConnector() + self.config_db.connect() + + fail_reason = "" + check_app_up_status = "" + up_status_flag = "" + configdb_feature_table = self.config_db.get_table('FEATURE') + update_time = "-" + + if service not in configdb_feature_table.keys(): + pstate = "Up" + else: + check_app_up_status = configdb_feature_table[service].get('check_up_status') + if check_app_up_status is not None and (check_app_up_status.lower()) == "true": + up_status_flag = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'up_status') + if up_status_flag is not None and (up_status_flag.lower()) == "true": + pstate = "Up" + else: + fail_reason = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'fail_reason') + if fail_reason is None: + fail_reason = "NA" + pstate = "Down" + + update_time = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'update_time') + if update_time is None: + update_time = "-" + else: + #Either check_up_status marked False or entry does not exist + pstate = "Up" + + return pstate,fail_reason,update_time + + #Gets the service properties + def run_systemctl_show(self, service): + command = ('systemctl show {} --property=Id,LoadState,UnitFileState,Type,ActiveState,SubState,Result'.format(service)) + output = utils.run_command(command) + srv_properties = output.split('\n') + prop_dict = {} + for prop in srv_properties: + kv = prop.split("=", 1) + if len(kv) == 2: + prop_dict[kv[0]] = kv[1] + + return prop_dict + + #Sets the service status to state db + def post_unit_status(self, srv_name, srv_status, app_status, fail_reason, update_time): + if not self.state_db: + self.state_db = swsscommon.SonicV2Connector(host='127.0.0.1') + 
self.state_db.connect(self.state_db.STATE_DB) + + key = 'ALL_SERVICE_STATUS|{}'.format(srv_name) + statusvalue = {} + statusvalue['service_status'] = srv_status + statusvalue['app_ready_status'] = app_status + statusvalue['fail_reason'] = fail_reason + statusvalue['update_time'] = update_time + self.state_db.hmset(self.state_db.STATE_DB, key, statusvalue) + statusvalue.clear() + + #Reads the current status of the service and posts it to state db + def get_unit_status(self, event): + """ Get a unit status""" + global spl_srv_list + unit_status = "NOT OK" + update_time = "-" + + try: + service_status = "Down" + service_up_status = "Down" + service_name,last_name = event.split('.') + + sysctl_show = self.run_systemctl_show(event) + + load_state = sysctl_show['LoadState'] + if load_state == "loaded": + status = sysctl_show['UnitFileState'] + fail_reason = sysctl_show['Result'] + active_state = sysctl_show['ActiveState'] + sub_state = sysctl_show['SubState'] + srv_type = sysctl_show['Type'] + + #Raise syslog for service state change + logger.log_info("{} service state changed to [{}/{}]".format(event, active_state, sub_state)) + + if status == "enabled" or status == "enabled-runtime" or status == "static": + if fail_reason == "success": + fail_reason = "-" + if (active_state == "active" and sub_state == "exited"): + service_status = "OK" + service_up_status = "OK" + unit_status = "OK" + elif active_state == "active" and sub_state == "running": + service_status = "OK" + init_state,app_fail_reason,update_time = self.get_app_ready_status(service_name) + if init_state == "Up": + service_up_status = "OK" + unit_status = "OK" + else: + fail_reason = app_fail_reason + unit_status = "NOT OK" + if fail_reason == "docker start": + service_up_status = "Starting" + fail_reason = "-" + elif active_state == "activating": + service_status = "Starting" + service_up_status = "Starting" + elif active_state == "deactivating": + service_status = "Stopping" + service_up_status = "Stopping" 
+ elif active_state == "inactive": + if srv_type == "oneshot" or service_name in spl_srv_list: + service_status = "OK" + service_up_status = "OK" + unit_status = "OK" + else: + unit_status = "NOT OK" + if fail_reason == "-": + fail_reason = "Inactive" + else: + unit_status = "NOT OK" + + self.post_unit_status(service_name, service_status, service_up_status, fail_reason, update_time) + + return unit_status + + except Exception as e: + logger.log_error("Get unit status {}-{}".format(service_name, str(e))) + + + #Gets status of all the services from service list + def get_all_system_status(self): + """ Shows the system ready status""" + #global dnsrvs_name + scan_srv_list = [] + + scan_srv_list = self.get_all_service_list() + for service in scan_srv_list: + ustate = self.get_unit_status(service) + if ustate == "NOT OK": + if service not in self.dnsrvs_name: + self.dnsrvs_name.add(service) + + if len(self.dnsrvs_name) == 0: + return "UP" + else: + return "DOWN" + + #Displays the system ready status message on console + def print_console_message(self, message): + with open('/dev/console', 'w') as console: + console.write("\n{} {}\n".format(datetime.now().strftime("%b %d %H:%M:%S.%f"), message)) + + #Publish the system ready status message on logger,console and state db + def publish_system_status(self, astate): + global system_allsrv_state + if system_allsrv_state != astate: + system_allsrv_state = astate + if astate == "DOWN": + msg = "System is not ready - one or more services are not up" + elif astate == "UP": + msg = "System is ready" + logger.log_notice(msg) + self.print_console_message(msg) + self.post_system_status(astate) + + #Checks all the services and updates the current system status + def update_system_status(self): + try: + astate = self.get_all_system_status() + self.publish_system_status(astate) + + except Exception as e: + logger.log_error("update system status exception:{}".format(str(e))) + + #Checks a service status and updates the system status + 
def check_unit_status(self, event): + #global dnsrvs_name + if not self.state_db: + self.state_db = swsscommon.SonicV2Connector(host='127.0.0.1') + self.state_db.connect(self.state_db.STATE_DB) + astate = "DOWN" + + full_srv_list = self.get_all_service_list() + if event in full_srv_list: + ustate = self.get_unit_status(event) + if ustate == "OK" and system_allsrv_state == "UP": + astate = "UP" + elif ustate == "OK" and system_allsrv_state == "DOWN": + if event in self.dnsrvs_name: + self.dnsrvs_name.remove(event) + if len(self.dnsrvs_name) == 0: + astate = "UP" + else: + astate = "DOWN" + else: + if event not in self.dnsrvs_name: + self.dnsrvs_name.add(event) + astate = "DOWN" + + self.publish_system_status(astate) + else: + #if received event is not in current full service list but exists in STATE_DB & set, + #then it should be removed from STATE_DB & set + if event in self.dnsrvs_name: + self.dnsrvs_name.remove(event) + + srv_name,last = event.split('.') + key = 'ALL_SERVICE_STATUS|{}'.format(srv_name) + key_exists = self.state_db.exists(self.state_db.STATE_DB, key) + if key_exists == 1: + self.state_db.delete(self.state_db.STATE_DB, key) + + return 0 + + def system_service(self): + if not self.state_db: + self.state_db = swsscommon.SonicV2Connector(host='127.0.0.1') + self.state_db.connect(self.state_db.STATE_DB) + + myQ = mpmgr.Queue() + try: + monitor_system_bus = MonitorSystemBusTask(myQ) + monitor_system_bus.task_run() + + monitor_statedb_table = MonitorStateDbTask(myQ) + monitor_statedb_table.task_run() + + except Exception as e: + logger.log_error("SubProcess-{}".format(str(e))) + sys.exit(1) + + + self.update_system_status() + + from queue import Empty + # Queue to receive the STATEDB and Systemd state change event + while not self.task_stopping_event.is_set(): + try: + msg = myQ.get(timeout=QUEUE_TIMEOUT) + event = msg["unit"] + event_src = msg["evt_src"] + event_time = msg["time"] + logger.log_debug("Main process- received event:{} from source:{} 
time:{}".format(event,event_src,event_time)) + logger.log_info("check_unit_status for [ "+event+" ] ") + self.check_unit_status(event) + except Empty: + pass + except Exception as e: + logger.log_error("system_service"+str(e)) + + #cleanup tables "'ALL_SERVICE_STATUS*', 'SYSTEM_READY*'" from statedb + self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, "ALL_SERVICE_STATUS|*") + self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, "SYSTEM_READY|*") + + monitor_system_bus.task_stop() + monitor_statedb_table.task_stop() + + def task_worker(self): + if self.task_stopping_event.is_set(): + return + self.system_service() + + + diff --git a/src/system-health/scripts/healthd b/src/system-health/scripts/healthd index dd276df2fafd..df52969d3aff 100644 --- a/src/system-health/scripts/healthd +++ b/src/system-health/scripts/healthd @@ -12,6 +12,8 @@ from sonic_py_common.daemon_base import DaemonBase from swsscommon.swsscommon import SonicV2Connector from health_checker.manager import HealthCheckerManager +from health_checker.sysmonitor import Sysmonitor + SYSLOG_IDENTIFIER = 'healthd' @@ -75,6 +77,8 @@ class HealthDaemon(DaemonBase): if not manager.config.config_file_exists(): self.log_warning("System health configuration file not found, exit...") return + sysmon = Sysmonitor() + sysmon.task_run() while 1: stat = manager.check(chassis) self._process_stat(chassis, manager.config, stat) @@ -85,6 +89,7 @@ class HealthDaemon(DaemonBase): self.log_warning("sonic_platform package not installed. Cannot start system-health daemon") self.deinit() + sysmon.task_stop() def _process_stat(self, chassis, config, stat): from health_checker.health_checker import HealthChecker diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 14d58c0f44b5..efadfc312f36 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -3,6 +3,7 @@ 1. 
test_user_defined_checker mocks the output of a user defined checker and verify class UserDefinedChecker 2. test_service_checker mocks the output of monit service and verify class ServiceChecker 3. test_hardware_checker mocks the hardware status data in db and verify class HardwareChecker + 4. Mocks and tests the system ready status and verify class Sysmonitor And there are class that are not covered by unit test. These class will be covered by sonic-mgmt regression test. 1. HealthDaemon 2. HealthCheckerManager @@ -30,6 +31,9 @@ from health_checker.manager import HealthCheckerManager from health_checker.service_checker import ServiceChecker from health_checker.user_defined_checker import UserDefinedChecker +from health_checker.sysmonitor import Sysmonitor +from health_checker.sysmonitor import MonitorStateDbTask +from health_checker.sysmonitor import MonitorSystemBusTask mock_supervisorctl_output = """ snmpd RUNNING pid 67, uptime 1:03:56 @@ -505,3 +509,210 @@ def test_utils(): output = utils.run_command('ls') assert output + + +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) +@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) +@patch('docker.DockerClient') +@patch('health_checker.utils.run_command') +@patch('swsscommon.swsscommon.ConfigDBConnector') +def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client): + mock_db_data = MagicMock() + mock_get_table = MagicMock() + mock_db_data.get_table = mock_get_table + mock_config_db.return_value = mock_db_data + mock_get_table.return_value = { + 'radv': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + }, + 'bgp': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + }, + 'pmon': { + 'state': 'disabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + } + } + sysmon = Sysmonitor() + print("mock get table:{}".format(mock_get_table.return_value)) + result = 
sysmon.get_all_service_list() + print("result get all service list:{}".format(result)) + assert 'radv.service' in result + assert 'pmon.service' not in result + + +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) +@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) +@patch('docker.DockerClient') +@patch('health_checker.utils.run_command') +@patch('swsscommon.swsscommon.ConfigDBConnector') +def test_get_app_ready_status(mock_config_db, mock_run, mock_docker_client): + mock_db_data = MagicMock() + mock_get_table = MagicMock() + mock_db_data.get_table = mock_get_table + mock_config_db.return_value = mock_db_data + mock_get_table.return_value = { + 'radv': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + 'check_up_status': 'True' + }, + 'bgp': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + 'check_up_status': 'True' + }, + 'snmp': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + 'check_up_status': 'False' + } + } + + MockConnector.data.update({ + 'FEATURE|radv': { + 'up_status': 'True', + 'fail_reason': '-', + 'update_time': '-' + }, + 'FEATURE|bgp': { + 'up_status': 'False', + 'fail_reason': 'some error', + 'update_time': '-' + }}) + + sysmon = Sysmonitor() + result = sysmon.get_app_ready_status('radv') + print(result) + assert 'Up' in result + result = sysmon.get_app_ready_status('bgp') + print(result) + assert 'Down' in result + result = sysmon.get_app_ready_status('snmp') + print(result) + assert 'Up' in result + + +mock_srv_props={ +'mock_radv.service':{'Type': 'simple', 'Result': 'success', 'Id': 'mock_radv.service', 'LoadState': 'loaded', 'ActiveState': 'active', 'SubState': 'running', 'UnitFileState': 'enabled'}, +'mock_bgp.service':{'Type': 'simple', 'Result': 'success', 'Id': 'mock_bgp.service', 'LoadState': 'loaded', 'ActiveState': 'inactive', 'SubState': 'dead', 'UnitFileState': 
'enabled'} +} + +@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service', 'mock_bgp.service', 'mock_ns.service'])) +@patch('health_checker.sysmonitor.Sysmonitor.run_systemctl_show', MagicMock(return_value=mock_srv_props['mock_bgp.service'])) +@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Down','-','-'))) +@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) +def test_check_unit_status(): + sysmon = Sysmonitor() + sysmon.check_unit_status('mock_bgp.service') + assert 'mock_bgp.service' in sysmon.dnsrvs_name + + + +@patch('health_checker.sysmonitor.Sysmonitor.run_systemctl_show', MagicMock(return_value=mock_srv_props['mock_radv.service'])) +@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Up','-','-'))) +@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) +def test_get_unit_status_ok(): + sysmon = Sysmonitor() + result = sysmon.get_unit_status('mock_radv.service') + print("get_unit_status:{}".format(result)) + assert result == 'OK' + + +@patch('health_checker.sysmonitor.Sysmonitor.run_systemctl_show', MagicMock(return_value=mock_srv_props['mock_bgp.service'])) +@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Up','-','-'))) +@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) +def test_get_unit_status_not_ok(): + sysmon = Sysmonitor() + result = sysmon.get_unit_status('mock_bgp.service') + print("get_unit_status:{}".format(result)) + assert result == 'NOT OK' + + +@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service', 'mock_ns.service'])) +@patch('health_checker.sysmonitor.Sysmonitor.get_unit_status', MagicMock(return_value= 'OK')) +@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock()) 
+@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value='Up')) +def test_get_all_system_status_ok(): + sysmon = Sysmonitor() + result = sysmon.get_all_system_status() + print("result:{}".format(result)) + assert result == 'UP' + + +@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service', 'mock_ns.service'])) +@patch('health_checker.sysmonitor.Sysmonitor.get_unit_status', MagicMock(return_value= 'NOT OK')) +@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value='Up')) +def test_get_all_system_status_not_ok(): + sysmon = Sysmonitor() + result = sysmon.get_all_system_status() + print("result:{}".format(result)) + assert result == 'DOWN' + +@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) +def test_post_unit_status(): + sysmon = sysmonitor() + sysmon.post_unit_status("mock_bgp.service", 'up', 'up', '-', '-') + +@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.post_system_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock()) +def test_publish_system_status(): + sysmon = Sysmonitor() + sysmon.publish_system_status('UP') + +@patch('health_checker.sysmonitor.Sysmonitor.post_system_status', MagicMock()) +def test_post_unit_status(): + sysmon = Sysmonitor() + sysmon.post_system_status("UP") + + +@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock()) +def test_print_console_message(): + sysmon = Sysmonitor() + sysmon.print_console_message("System is ready") + +@patch('health_checker.sysmonitor.Sysmonitor.get_all_system_status', 
MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock()) +def test_update_system_status(): + sysmon = Sysmonitor() + sysmon.update_system_status() + +from sonic_py_common.task_base import ProcessTaskBase +import multiprocessing +mpmgr = multiprocessing.Manager() + +myQ = mpmgr.Queue() +def test_monitor_statedb_task(): + sysmon = MonitorStateDbTask(myQ) + sysmon.SubscriberStateTable = MagicMock() + sysmon.task_run() + assert sysmon._task_process is not None + sysmon.task_stop() + +@patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock()) +def test_monitor_sysbus_task(): + sysmon = MonitorSystemBusTask(myQ) + sysmon.SubscriberStateTable = MagicMock() + sysmon.task_run() + assert sysmon._task_process is not None + sysmon.task_stop() + +@patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock()) +@patch('health_checker.sysmonitor.MonitorStateDbTask.subscribe_statedb', MagicMock()) +def test_system_service(): + sysmon = Sysmonitor() + sysmon.task_run() + assert sysmon._task_process is not None + sysmon.task_stop() From c424dcda3bcadca52daa379ddf50b7e8d3b2f893 Mon Sep 17 00:00:00 2001 From: sg893052 Date: Tue, 10 May 2022 07:40:17 -0700 Subject: [PATCH 2/2] Fix format issues and verify a few testcases --- .../health_checker/sysmonitor.py | 9 ++--- src/system-health/tests/mock_connector.py | 9 +++++ src/system-health/tests/test_system_health.py | 38 ++++++++++--------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py index 0d4479f1fc02..a4058f8c09d3 100755 --- a/src/system-health/health_checker/sysmonitor.py +++ b/src/system-health/health_checker/sysmonitor.py @@ -44,9 +44,9 @@ def subscribe_statedb(self): logger.log_warning("sel.select() did not return swsscommon.Select.OBJECT") continue (key, op, cfvs) = cst.pop() - key_ext = key+".service" + key_ext = key + 
".service" timestamp = "{}".format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")) - msg={"unit": key_ext,"evt_src":"feature","time":timestamp} + msg={"unit": key_ext, "evt_src":"feature", "time":timestamp} self.task_notify(msg) @@ -75,7 +75,7 @@ def __init__(self,myQ): def on_job_removed(self, id, job, unit, result): if result == "done": timestamp = "{}".format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")) - msg={"unit": unit,"evt_src":"sysbus","time":timestamp} + msg = {"unit": unit, "evt_src":"sysbus", "time":timestamp} self.task_notify(msg) return @@ -148,7 +148,7 @@ def get_all_service_list(self): feature_table = self.config_db.get_table("FEATURE") for srv in feature_table.keys(): if feature_table[srv]["state"] not in ["disabled", "always_disabled"]: - srvext=srv+".service" + srvext = srv + ".service" if srvext not in dir_list: dir_list.append(srvext) @@ -228,7 +228,6 @@ def post_unit_status(self, srv_name, srv_status, app_status, fail_reason, update statusvalue['fail_reason'] = fail_reason statusvalue['update_time'] = update_time self.state_db.hmset(self.state_db.STATE_DB, key, statusvalue) - statusvalue.clear() #Reads the current status of the service and posts it to state db def get_unit_status(self, event): diff --git a/src/system-health/tests/mock_connector.py b/src/system-health/tests/mock_connector.py index d32017ff8485..d602c8eaf165 100644 --- a/src/system-health/tests/mock_connector.py +++ b/src/system-health/tests/mock_connector.py @@ -22,3 +22,12 @@ def keys(self, db_id, pattern): def get_all(self, db_id, key): return MockConnector.data[key] + + def set(self, db_id, key, field, value): + self.data[key] = {} + self.data[key][field] = value + + def hmset(self, db_id, key, fieldsvalues): + self.data[key] = {} + for field,value in fieldsvalues.items(): + self.data[key][field] = value diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index efadfc312f36..76f3ceea5d3f 100644 --- 
a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -660,34 +660,38 @@ def test_get_all_system_status_not_ok(): print("result:{}".format(result)) assert result == 'DOWN' -@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) def test_post_unit_status(): - sysmon = sysmonitor() - sysmon.post_unit_status("mock_bgp.service", 'up', 'up', '-', '-') - -@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock()) -@patch('health_checker.sysmonitor.Sysmonitor.post_system_status', MagicMock()) -@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock()) -def test_publish_system_status(): sysmon = Sysmonitor() - sysmon.publish_system_status('UP') + sysmon.post_unit_status("mock_bgp", 'OK', 'Down', 'mock reason', '-') + result = swsscommon.SonicV2Connector.get_all(MockConnector, 0, 'ALL_SERVICE_STATUS|mock_bgp') + print(result) + assert result['service_status'] == 'OK' + assert result['app_ready_status'] == 'Down' + assert result['fail_reason'] == 'mock reason' -@patch('health_checker.sysmonitor.Sysmonitor.post_system_status', MagicMock()) -def test_post_unit_status(): +def test_post_system_status(): sysmon = Sysmonitor() sysmon.post_system_status("UP") + result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') + print("post system status result:{}".format(result)) + assert result == "UP" - +@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.post_system_status', test_post_system_status()) @patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock()) -def test_print_console_message(): +def test_publish_system_status(): sysmon = Sysmonitor() - sysmon.print_console_message("System is ready") - -@patch('health_checker.sysmonitor.Sysmonitor.get_all_system_status', MagicMock()) 
-@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock()) + sysmon.publish_system_status('UP') + result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') + assert result == "UP" + +@patch('health_checker.sysmonitor.Sysmonitor.get_all_system_status', test_get_all_system_status_ok()) +@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', test_publish_system_status()) def test_update_system_status(): sysmon = Sysmonitor() sysmon.update_system_status() + result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') + assert result == "UP" from sonic_py_common.task_base import ProcessTaskBase import multiprocessing