From 89f091eded92dbe100d2ec8cebd1e01ee1a17f49 Mon Sep 17 00:00:00 2001 From: mssonicbld <79238446+mssonicbld@users.noreply.github.com> Date: Wed, 6 Sep 2023 12:53:08 +0800 Subject: [PATCH] [Mellanox] set select timeout to no more than 1 sec to make sure fast shutdown (#13611) (#16449) - Why I did it Commit sonic-net/sonic-platform-daemons@153ea47 changed SfpStateUpdateTask from a Process to a Thread. That commit raises an exception in SfpStateUpdateTask to make the shutdown flow fast, but this does not work on the Nvidia platform because the Nvidia platform passes the timeout parameter of get_change_event to select, and the Linux select function cannot be interrupted by a Python exception. There was no such issue on the Nvidia platform before that commit. However, in order to comply with that commit and keep the shutdown flow fast, we decided to change the Nvidia platform API implementation. This fixes issue #13591. - How I did it The select call in get_change_event now uses a timeout of no more than 1 second. A while loop around the select call makes sure the timeout parameter of get_change_event still works as expected. - How to verify it Manual test Co-authored-by: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> --- .../sonic_platform/chassis.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index e13b90d6c0ca..17d66b2e8f64 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -31,10 +31,10 @@ from . 
import utils from .device_data import DeviceDataManager import re + import time except ImportError as e: raise ImportError (str(e) + "- required module not found") -MAX_SELECT_DELAY = 3600 RJ45_TYPE = "RJ45" @@ -387,26 +387,30 @@ def get_change_event(self, timeout=0): self.sfp_event.initialize() wait_for_ever = (timeout == 0) + # select timeout should be no more than 1000ms to ensure fast shutdown flow + select_timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} - if wait_for_ever: - timeout = MAX_SELECT_DELAY - while True: - status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout) - if bool(port_dict): + begin = time.time() + while True: + status = self.sfp_event.check_sfp_status(port_dict, error_dict, select_timeout) + if bool(port_dict): + break + + if not wait_for_ever: + elapse = time.time() - begin + if elapse * 1000 > timeout: break - else: - status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout) if status: if port_dict: self.reinit_sfps(port_dict) - result_dict = {'sfp':port_dict} + result_dict = {'sfp': port_dict} if error_dict: result_dict['sfp_error'] = error_dict return True, result_dict else: - return True, {'sfp':{}} + return True, {'sfp': {}} def reinit_sfps(self, port_dict): """