Skip to content

Commit

Permalink
Add asic presence filtering for container checking in system-health (s…
Browse files Browse the repository at this point in the history
…onic-net#13497)

Why I did it
On a supervisor card in a chassis, dockers such as syncd, teamd, swss, and lldp are created for each Switch Fabric card. However, not every chassis has all of its Switch Fabric cards present; in that case, dockers are created only for the Switch Fabric cards that are present.

system-health indicates errors in this scenario as it is expecting dockers for all Switch Fabrics (based on NUM_ASIC defined in asic.conf file).

system-health process error messages were also altered to indicate which container had the issue; multiple containers may run processes with the same name, which can result in identical system-health error messages, causing ambiguity.

How I did it
Port container_checker logic from sonic-net#11442 into service_checker for system-health.

How to verify it
Bring up a Supervisor card with one or more fabric cards missing. Execute 'show system-health summary'. The command should not report failures caused by missing dockers for the ASICs on fabric cards that are not present.
  • Loading branch information
spilkey-cisco authored and mssonicbld committed Feb 17, 2023
1 parent 1d155b8 commit fbef246
Showing 1 changed file with 18 additions and 4 deletions.
22 changes: 18 additions & 4 deletions src/system-health/health_checker/service_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,19 @@ def get_expected_running_containers(self, feature_table):
"""
expected_running_containers = set()
container_feature_dict = {}

# Get current asic presence list. For multi_asic system, multi instance containers
# should be checked only for asics present.
asics_id_presence = multi_asic.get_asic_presence_list()

# Some services may run all the instances irrespective of asic presence.
# Add those to exception list.
# database service: Currently services have dependency on all database services to
# be up irrespective of asic presence.
# bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
# it will be removed from exception list.
run_all_instance_list = ['database', 'bgp']

for feature_name, feature_entry in feature_table.items():
if feature_entry["state"] not in ["disabled", "always_disabled"]:
if multi_asic.is_multi_asic():
Expand All @@ -80,8 +93,9 @@ def get_expected_running_containers(self, feature_table):
if feature_entry["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
expected_running_containers.add(feature_name + str(asic_id))
container_feature_dict[feature_name + str(asic_id)] = feature_name
if asic_id in asics_id_presence or feature_name in run_all_instance_list:
expected_running_containers.add(feature_name + str(asic_id))
container_feature_dict[feature_name + str(asic_id)] = feature_name
else:
expected_running_containers.add(feature_name)
container_feature_dict[feature_name] = feature_name
Expand Down Expand Up @@ -343,7 +357,7 @@ def check_process_existence(self, container_name, critical_process_list, config,
process_status = utils.run_command(cmd)
if process_status is None:
for process_name in critical_process_list:
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
self.publish_events(container_name, critical_process_list)
return

Expand All @@ -356,6 +370,6 @@ def check_process_existence(self, container_name, critical_process_list, config,
# and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
if process_name in process_status:
if process_status[process_name] != 'RUNNING':
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
else:
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))

0 comments on commit fbef246

Please sign in to comment.