From d680ce2bc1c80195dfe4ef4bd202382c984e42e5 Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Thu, 7 Feb 2019 14:41:21 -0800 Subject: [PATCH] [neighsyncd] increase neighbor syncd restore timeout to 110 seconds (#745) * [neighsyncd] increase neighbor syncd restore timeout to 120 seconds Neighbor syncd is restoring important information for teamd and BGP. Our timeout should not be shorter than that of the downstream service. Signed-off-by: Ying Xie * [restore_neighbor] improve restore neighbor timeouts Try to get the bgp timeout and use it for restoring neighbor timeout. When unavailable, use the default of 110 seconds. Signed-off-by: Ying Xie * Set default values according to the group discussion result - restore_neighbors.py timeout at 110 seconds due to observed requirement of greater than 70 seconds. - neighbor syncd timeout at 120 seconds (longer than 110 seconds). Signed-off-by: Ying Xie --- neighsyncd/neighsync.h | 4 ++-- neighsyncd/restore_neighbors.py | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/neighsyncd/neighsync.h b/neighsyncd/neighsync.h index 9360a18713e0..66fd1c2645b1 100644 --- a/neighsyncd/neighsync.h +++ b/neighsyncd/neighsync.h @@ -11,10 +11,10 @@ /* * This is the timer value (in seconds) that the neighsyncd waits for restore_neighbors - * service to finish, should be longer than the restore_neighbors timeout value (60) + * service to finish, should be longer than the restore_neighbors timeout value (110) * This should not happen, if happens, system is in a unknown state, we should exit. 
*/ -#define RESTORE_NEIGH_WAIT_TIME_OUT 70 +#define RESTORE_NEIGH_WAIT_TIME_OUT 120 namespace swss { diff --git a/neighsyncd/restore_neighbors.py b/neighsyncd/restore_neighbors.py index 387723dfe9e5..e0a0eea9434e 100755 --- a/neighsyncd/restore_neighbors.py +++ b/neighsyncd/restore_neighbors.py @@ -30,11 +30,12 @@ logger.setLevel(logging.WARNING) logger.addHandler(logging.NullHandler()) -# timeout the restore process in 1 min if not finished +# timeout the restore process in 110 seconds if not finished # This is mostly to wait for interfaces to be created and up after system warm-reboot # and this process is started by supervisord in swss docker. -# It would be good to keep that time below routing reconciliation time-out. -TIME_OUT = 60 +# There had been devices taking close to 70 seconds to complete restoration, setting +# default timeout to 110 seconds. +DEF_TIME_OUT = 110 # every 5 seconds to check interfaces states CHECK_INTERVAL = 5 @@ -189,13 +190,13 @@ def set_statedb_neigh_restore_done(): # Once all the entries are restored, this function is returned. # The interfaces' states were checked in a loop with an interval (CHECK_INTERVAL) # The function will timeout in case interfaces' states never meet the condition -# after some time (TIME_OUT). -def restore_update_kernel_neighbors(intf_neigh_map): +# after some time (DEF_TIME_OUT). +def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT): # create object for netlink calls to kernel ipclass = IPRoute() mtime = monotonic.time.time start_time = mtime() - while (mtime() - start_time) < TIME_OUT: + while (mtime() - start_time) < timeout: for intf, family_neigh_map in intf_neigh_map.items(): # only try to restore to kernel when link is up if is_intf_oper_state_up(intf):