From 865be7ba8538ae332cd2233851f36925bf1b6c61 Mon Sep 17 00:00:00 2001
From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com>
Date: Tue, 28 Jun 2022 23:48:10 +0800
Subject: [PATCH] =?UTF-8?q?[system-health]=20Fix=20error=20log=20system=5F?=
 =?UTF-8?q?service'state'=20while=20doing=20confi=E2=80=A6=20(#11225)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Why I did it
While doing config reload, FEATURE table may be removed and re-add. During this process, updating FEATURE table is not atomic. It could be that the FEATURE table has entry, but each entry has no field. This PR introduces a retry mechanism to avoid this.

- How I did it
Introduces a retry mechanism to avoid this.

- How to verify it
New unit test added to verify the flow as well as running some manual test.
---
 .../health_checker/sysmonitor.py              | 43 ++++++++++++++++---
 src/system-health/tests/test_system_health.py | 20 +++++++++
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py
index a4058f8c09d3..e69d289fc537 100755
--- a/src/system-health/health_checker/sysmonitor.py
+++ b/src/system-health/health_checker/sysmonitor.py
@@ -2,6 +2,7 @@
 
 import os
 import sys
+import time
 import glob
 import multiprocessing
 from datetime import datetime
@@ -145,12 +146,7 @@ def get_all_service_list(self):
             dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))] 
 
         #add the enabled docker services from config db feature table
-        feature_table = self.config_db.get_table("FEATURE")
-        for srv in feature_table.keys():
-            if feature_table[srv]["state"] not in ["disabled", "always_disabled"]:
-                srvext = srv + ".service"
-                if srvext not in dir_list:
-                    dir_list.append(srvext)
+        self.get_service_from_feature_table(dir_list)
         
         self.config.load_config()
         if self.config and self.config.ignore_services:
@@ -161,6 +157,41 @@ def get_all_service_list(self):
         dir_list.sort()
         return dir_list
 
+    def get_service_from_feature_table(self, dir_list):
+        """Get service from CONFIG DB FEATURE table. During "config reload" command, filling FEATURE table
+           is not an atomic operation, sonic-cfggen do it with two steps:
+               1. Add an empty table entry to CONFIG DB
+               2. Add all fields to the table
+            
+            So, if system health read db on middle of step 1 and step 2, it might read invalid data. A retry
+            mechanism is here to avoid such issue.
+
+        Args:
+            dir_list (list): service list
+        """
+        max_retry = 3
+        retry_delay = 1
+        success = True
+
+        while max_retry > 0:
+            success = True
+            feature_table = self.config_db.get_table("FEATURE")
+            for srv, fields in feature_table.items():
+                if 'state' not in fields:
+                    success = False
+                    logger.log_warning("FEATURE table is not fully ready: {}, retrying".format(feature_table))
+                    break
+                if fields["state"] not in ["disabled", "always_disabled"]:
+                    srvext = srv + ".service"
+                    if srvext not in dir_list:
+                        dir_list.append(srvext)
+            if not success:
+                max_retry -= 1
+                time.sleep(retry_delay)
+            else:
+                break
+        if not success:
+            logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table))
 
     #Checks FEATURE table from config db for the service' check_up_status flag
     #if marked to true, then read the service up_status from FEATURE table of state db.
diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py
index 76f3ceea5d3f..d58c69bececa 100644
--- a/src/system-health/tests/test_system_health.py
+++ b/src/system-health/tests/test_system_health.py
@@ -720,3 +720,23 @@ def test_system_service():
     sysmon.task_run()
     assert sysmon._task_process is not None
     sysmon.task_stop()
+
+
+def test_get_service_from_feature_table():
+    sysmon = Sysmonitor()
+    sysmon.config_db = MagicMock()
+    sysmon.config_db.get_table = MagicMock()
+    sysmon.config_db.get_table.side_effect = [
+        {
+            'bgp': {},
+            'swss': {}
+        },
+        {
+            'bgp': {'state': 'enabled'},
+            'swss': {'state': 'disabled'}
+        }
+    ]
+    dir_list = []
+    sysmon.get_service_from_feature_table(dir_list)
+    assert 'bgp.service' in dir_list
+    assert 'swss.service' not in dir_list