Skip to content

Commit

Permalink
Make system health service start early (#9792)
Browse files Browse the repository at this point in the history
- Why I did it
For SYSTEM READY feature. Currently, there is a booting stage in system health service to indicate that the system is loading SONiC component. This booting stage is no longer needed because SYSTEM READY feature will treat that stage as system "NOT READY".

- How I did it
1. Remove booting stage
2. Adjust unit test cases

- How to verify it
Manual test, Unit test, sonic-mgmt Regression
  • Loading branch information
Junchao-Mellanox authored and pull[bot] committed May 8, 2024
1 parent e4d25e1 commit 8eb3d3a
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 49 deletions.
24 changes: 2 additions & 22 deletions src/system-health/health_checker/manager.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from . import utils
from .config import Config
from .health_checker import HealthChecker
from .service_checker import ServiceChecker
Expand All @@ -10,14 +9,10 @@ class HealthCheckerManager(object):
"""
Manage all system health checkers and system health configuration.
"""
STATE_BOOTING = 'booting'
STATE_RUNNING = 'running'
boot_timeout = None

def __init__(self):
self._checkers = []
self._state = self.STATE_BOOTING

self.config = Config()
self.initialize()

Expand All @@ -33,17 +28,11 @@ def check(self, chassis):
"""
Load new configuration if any and perform the system health check for all existing checkers.
:param chassis: A chassis object.
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
contains the status for all objects that was checked.
:return: A dictionary that contains the status for all objects that was checked.
"""
HealthChecker.summary = HealthChecker.STATUS_OK
stats = {}
self.config.load_config()
# check state first to avoid user change boot timeout in configuration file
# after finishing system boot
if self._state == self.STATE_BOOTING and self._is_system_booting():
self._set_system_led(chassis, self.config, 'booting')
return self._state, stats

for checker in self._checkers:
self._do_check(checker, stats)
Expand All @@ -56,7 +45,7 @@ def check(self, chassis):
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
self._set_system_led(chassis, self.config, led_status)

return self._state, stats
return stats

def _do_check(self, checker, stats):
"""
Expand Down Expand Up @@ -86,15 +75,6 @@ def _do_check(self, checker, stats):
else:
stats['Internal'].update(entry)

def _is_system_booting(self):
uptime = utils.get_uptime()
if not self.boot_timeout:
self.boot_timeout = self.config.get_bootup_timeout()
booting = uptime < self.boot_timeout
if not booting:
self._state = self.STATE_RUNNING
return booting

def _set_system_led(self, chassis, config, status):
try:
chassis.set_status_led(config.get_led_color(status))
Expand Down
2 changes: 1 addition & 1 deletion src/system-health/health_checker/service_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def check_by_monit(self, config):
output = utils.run_command(ServiceChecker.CHECK_CMD)
lines = output.splitlines()
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
return

status_begin = lines[1].find('Status')
Expand Down
2 changes: 1 addition & 1 deletion src/system-health/health_checker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def run_command(command):
:return: Output of the shell command.
"""
try:
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return process.communicate()[0]
except Exception:
return None
Expand Down
11 changes: 5 additions & 6 deletions src/system-health/scripts/healthd
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ SYSLOG_IDENTIFIER = 'healthd'

class HealthDaemon(DaemonBase):
"""
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
according to the check result and store the check result to redis.
"""
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
Expand All @@ -35,7 +35,7 @@ class HealthDaemon(DaemonBase):
def deinit(self):
"""
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
:return:
:return:
"""
self._clear_system_health_table()

Expand Down Expand Up @@ -64,7 +64,7 @@ class HealthDaemon(DaemonBase):
def run(self):
"""
Check system health in an infinite loop.
:return:
:return:
"""
self.log_notice("Starting up...")

Expand All @@ -76,9 +76,8 @@ class HealthDaemon(DaemonBase):
self.log_warning("System health configuration file not found, exit...")
return
while 1:
state, stat = manager.check(chassis)
if state == HealthCheckerManager.STATE_RUNNING:
self._process_stat(chassis, manager.config, stat)
stat = manager.check(chassis)
self._process_stat(chassis, manager.config, stat)

if self.stop_event.wait(manager.config.interval):
break
Expand Down
22 changes: 3 additions & 19 deletions src/system-health/tests/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,26 +439,14 @@ def test_config():
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
@patch('health_checker.service_checker.ServiceChecker.get_info')
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
@patch('health_checker.utils.get_uptime')
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
def test_manager(mock_hw_info, mock_service_info, mock_udc_info):
chassis = MagicMock()
chassis.set_status_led = MagicMock()

manager = HealthCheckerManager()
manager.config.user_defined_checkers = ['some check']
assert manager._state == HealthCheckerManager.STATE_BOOTING
assert len(manager._checkers) == 2

mock_uptime.return_value = 200
assert manager._is_system_booting()
state, stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_BOOTING
assert len(stat) == 0
chassis.set_status_led.assert_called_with('orange_blink')

mock_uptime.return_value = 500
assert not manager._is_system_booting()
assert manager._state == HealthCheckerManager.STATE_RUNNING
mock_hw_info.return_value = {
'ASIC': {
'type': 'ASIC',
Expand All @@ -485,8 +473,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
'status': 'OK'
}
}
state, stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_RUNNING
stat = manager.check(chassis)
assert 'Services' in stat
assert stat['Services']['snmp:snmpd']['status'] == 'OK'

Expand All @@ -500,7 +487,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
mock_hw_info.side_effect = RuntimeError()
mock_service_info.side_effect = RuntimeError()
mock_udc_info.side_effect = RuntimeError()
state, stat = manager.check(chassis)
stat = manager.check(chassis)
assert 'Internal' in stat
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
Expand All @@ -518,6 +505,3 @@ def test_utils():

output = utils.run_command('ls')
assert output

uptime = utils.get_uptime()
assert uptime > 0

0 comments on commit 8eb3d3a

Please sign in to comment.