Skip to content

Commit c06cb21

Browse files
Make system health service start early (#9792)
- Why I did it For SYSTEM READY feature. Currently, there is a booting stage in system health service to indicate that the system is loading SONiC component. This booting stage is no longer needed because SYSTEM READY feature will treat that stage as system "NOT READY". - How I did it 1. Remove booting stage 2. Adjust unit test cases - How to verify it Manual test, Unit test, sonic-mgmt Regression
1 parent 43e967d commit c06cb21

File tree

5 files changed

+12
-49
lines changed

5 files changed

+12
-49
lines changed

src/system-health/health_checker/manager.py

+2-22
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from . import utils
21
from .config import Config
32
from .health_checker import HealthChecker
43
from .service_checker import ServiceChecker
@@ -10,14 +9,10 @@ class HealthCheckerManager(object):
109
"""
1110
Manage all system health checkers and system health configuration.
1211
"""
13-
STATE_BOOTING = 'booting'
14-
STATE_RUNNING = 'running'
1512
boot_timeout = None
1613

1714
def __init__(self):
1815
self._checkers = []
19-
self._state = self.STATE_BOOTING
20-
2116
self.config = Config()
2217
self.initialize()
2318

@@ -33,17 +28,11 @@ def check(self, chassis):
3328
"""
3429
Load new configuration if any and perform the system health check for all existing checkers.
3530
:param chassis: A chassis object.
36-
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
37-
contains the status for all objects that was checked.
31+
:return: A dictionary that contains the status for all objects that was checked.
3832
"""
3933
HealthChecker.summary = HealthChecker.STATUS_OK
4034
stats = {}
4135
self.config.load_config()
42-
# check state first to avoid user change boot timeout in configuration file
43-
# after finishing system boot
44-
if self._state == self.STATE_BOOTING and self._is_system_booting():
45-
self._set_system_led(chassis, self.config, 'booting')
46-
return self._state, stats
4736

4837
for checker in self._checkers:
4938
self._do_check(checker, stats)
@@ -56,7 +45,7 @@ def check(self, chassis):
5645
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
5746
self._set_system_led(chassis, self.config, led_status)
5847

59-
return self._state, stats
48+
return stats
6049

6150
def _do_check(self, checker, stats):
6251
"""
@@ -86,15 +75,6 @@ def _do_check(self, checker, stats):
8675
else:
8776
stats['Internal'].update(entry)
8877

89-
def _is_system_booting(self):
90-
uptime = utils.get_uptime()
91-
if not self.boot_timeout:
92-
self.boot_timeout = self.config.get_bootup_timeout()
93-
booting = uptime < self.boot_timeout
94-
if not booting:
95-
self._state = self.STATE_RUNNING
96-
return booting
97-
9878
def _set_system_led(self, chassis, config, status):
9979
try:
10080
chassis.set_status_led(config.get_led_color(status))

src/system-health/health_checker/service_checker.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def check_by_monit(self, config):
218218
output = utils.run_command(ServiceChecker.CHECK_CMD)
219219
lines = output.splitlines()
220220
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
221-
self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
221+
self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
222222
return
223223

224224
status_begin = lines[1].find('Status')

src/system-health/health_checker/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def run_command(command):
88
:return: Output of the shell command.
99
"""
1010
try:
11-
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
11+
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1212
return process.communicate()[0]
1313
except Exception:
1414
return None

src/system-health/scripts/healthd

+5-6
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ SYSLOG_IDENTIFIER = 'healthd'
1818

1919
class HealthDaemon(DaemonBase):
2020
"""
21-
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
21+
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
2222
according to the check result and store the check result to redis.
2323
"""
2424
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
@@ -35,7 +35,7 @@ class HealthDaemon(DaemonBase):
3535
def deinit(self):
3636
"""
3737
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
38-
:return:
38+
:return:
3939
"""
4040
self._clear_system_health_table()
4141

@@ -64,7 +64,7 @@ class HealthDaemon(DaemonBase):
6464
def run(self):
6565
"""
6666
Check system health in an infinite loop.
67-
:return:
67+
:return:
6868
"""
6969
self.log_notice("Starting up...")
7070

@@ -76,9 +76,8 @@ class HealthDaemon(DaemonBase):
7676
self.log_warning("System health configuration file not found, exit...")
7777
return
7878
while 1:
79-
state, stat = manager.check(chassis)
80-
if state == HealthCheckerManager.STATE_RUNNING:
81-
self._process_stat(chassis, manager.config, stat)
79+
stat = manager.check(chassis)
80+
self._process_stat(chassis, manager.config, stat)
8281

8382
if self.stop_event.wait(manager.config.interval):
8483
break

src/system-health/tests/test_system_health.py

+3-19
Original file line numberDiff line numberDiff line change
@@ -439,26 +439,14 @@ def test_config():
439439
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
440440
@patch('health_checker.service_checker.ServiceChecker.get_info')
441441
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
442-
@patch('health_checker.utils.get_uptime')
443-
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
442+
def test_manager(mock_hw_info, mock_service_info, mock_udc_info):
444443
chassis = MagicMock()
445444
chassis.set_status_led = MagicMock()
446445

447446
manager = HealthCheckerManager()
448447
manager.config.user_defined_checkers = ['some check']
449-
assert manager._state == HealthCheckerManager.STATE_BOOTING
450448
assert len(manager._checkers) == 2
451449

452-
mock_uptime.return_value = 200
453-
assert manager._is_system_booting()
454-
state, stat = manager.check(chassis)
455-
assert state == HealthCheckerManager.STATE_BOOTING
456-
assert len(stat) == 0
457-
chassis.set_status_led.assert_called_with('orange_blink')
458-
459-
mock_uptime.return_value = 500
460-
assert not manager._is_system_booting()
461-
assert manager._state == HealthCheckerManager.STATE_RUNNING
462450
mock_hw_info.return_value = {
463451
'ASIC': {
464452
'type': 'ASIC',
@@ -485,8 +473,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
485473
'status': 'OK'
486474
}
487475
}
488-
state, stat = manager.check(chassis)
489-
assert state == HealthCheckerManager.STATE_RUNNING
476+
stat = manager.check(chassis)
490477
assert 'Services' in stat
491478
assert stat['Services']['snmp:snmpd']['status'] == 'OK'
492479

@@ -500,7 +487,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
500487
mock_hw_info.side_effect = RuntimeError()
501488
mock_service_info.side_effect = RuntimeError()
502489
mock_udc_info.side_effect = RuntimeError()
503-
state, stat = manager.check(chassis)
490+
stat = manager.check(chassis)
504491
assert 'Internal' in stat
505492
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
506493
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
@@ -518,6 +505,3 @@ def test_utils():
518505

519506
output = utils.run_command('ls')
520507
assert output
521-
522-
uptime = utils.get_uptime()
523-
assert uptime > 0

0 commit comments

Comments
 (0)