From fbef246cc3604c158d329a49e926ac802152a97c Mon Sep 17 00:00:00 2001 From: spilkey-cisco <110940806+spilkey-cisco@users.noreply.github.com> Date: Fri, 10 Feb 2023 21:34:10 -0800 Subject: [PATCH] Add asic presence filtering for container checking in system-health (#13497) Why I did it On a supervisor card in a chassis, syncd/teamd/swss/lldp etc dockers are created for each Switch Fabric card. However, not all chassis would have all the switch fabric cards present. In this case, only dockers for Switch Fabrics present would be created. system-health indicates errors in this scenario as it is expecting dockers for all Switch Fabrics (based on NUM_ASIC defined in asic.conf file). system-health process error messages were also altered to indicate which container had the issue; multiple containers may run processes with the same name, which can result in identical system-health error messages, causing ambiguity. How I did it Port container_checker logic from #11442 into service_checker for system-health. How to verify it Bringup Supervisor card with one or more missing fabric cards. Execute 'show system-health summary'. The command should not report failure due to missing dockers for the asics on the fabric cards which are not present. --- .../health_checker/service_checker.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index ed6c7296fde3..31c7f7b9717a 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -71,6 +71,19 @@ def get_expected_running_containers(self, feature_table): """ expected_running_containers = set() container_feature_dict = {} + + # Get current asic presence list. For multi_asic system, multi instance containers + # should be checked only for asics present. + asics_id_presence = multi_asic.get_asic_presence_list() + + # Some services may run all the instances irrespective of asic presence. + # Add those to exception list. + # database service: Currently services have dependency on all database services to + # be up irrespective of asic presence. + # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven, + # it will be removed from exception list. + run_all_instance_list = ['database', 'bgp'] + for feature_name, feature_entry in feature_table.items(): if feature_entry["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): @@ -80,8 +93,9 @@ def get_expected_running_containers(self, feature_table): if feature_entry["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - expected_running_containers.add(feature_name + str(asic_id)) - container_feature_dict[feature_name + str(asic_id)] = feature_name + if asic_id in asics_id_presence or feature_name in run_all_instance_list: + expected_running_containers.add(feature_name + str(asic_id)) + container_feature_dict[feature_name + str(asic_id)] = feature_name else: expected_running_containers.add(feature_name) container_feature_dict[feature_name] = feature_name @@ -343,7 +357,7 @@ def check_process_existence(self, container_name, critical_process_list, config, process_status = utils.run_command(cmd) if process_status is None: for process_name in critical_process_list: - self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) + self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name)) self.publish_events(container_name, critical_process_list) return @@ -356,6 +370,6 @@ def check_process_existence(self, container_name, critical_process_list, config, # and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status. if process_name in process_status: if process_status[process_name] != 'RUNNING': - self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) + self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name)) else: self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))