From feb03c0597b6bdb261dc7a8a60de07bf7de67d61 Mon Sep 17 00:00:00 2001 From: Evan Baker Date: Fri, 31 Mar 2023 11:11:26 -0700 Subject: [PATCH] timeout after 15 minutes waiting for the NNC reconciler to start (#1861) * timeout after 15 minutes waiting for the NNC reconciler to start Signed-off-by: Evan Baker * log error and retry instead of crashing out Signed-off-by: Evan Baker * add metric for nnc reconciler failed to start Signed-off-by: Evan Baker --------- Signed-off-by: Evan Baker --- .../nodenetworkconfig/reconciler.go | 8 ++--- cns/service/main.go | 36 +++++++++++-------- cns/service/metrics.go | 14 +++++++- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/cns/kubecontroller/nodenetworkconfig/reconciler.go b/cns/kubecontroller/nodenetworkconfig/reconciler.go index 546ca2eef1..ae83aa0218 100644 --- a/cns/kubecontroller/nodenetworkconfig/reconciler.go +++ b/cns/kubecontroller/nodenetworkconfig/reconciler.go @@ -134,13 +134,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco // then, and any time that it is called after that, it immediately returns true. // It accepts a cancellable Context and if the context is closed // before Start it will return false. Passing a closed Context after the -// Reconciler is started is indeterminate and the response is psuedorandom. -func (r *Reconciler) Started(ctx context.Context) bool { +// Reconciler is started is indeterminate. 
+func (r *Reconciler) Started(ctx context.Context) (bool, error) { select { case <-r.started: - return true + return true, nil case <-ctx.Done(): - return false + return false, errors.Wrap(ctx.Err(), "context closed") } } diff --git a/cns/service/main.go b/cns/service/main.go index 3ec59008c2..7ba10e4ce2 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -1235,31 +1235,38 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn // The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the // Monitor's internal loop. go func() { - logger.Printf("Starting NodeNetworkConfig reconciler.") + logger.Printf("Starting controller-manager.") for { if err := manager.Start(ctx); err != nil { - logger.Errorf("[Azure CNS] Failed to start request controller: %v", err) + logger.Errorf("Failed to start controller-manager: %v", err) // retry to start the request controller // inc the managerStartFailures metric for failure tracking managerStartFailures.Inc() } else { - logger.Printf("exiting NodeNetworkConfig reconciler") + logger.Printf("Stopped controller-manager.") return } - - // Retry after 1sec - time.Sleep(time.Second) + time.Sleep(time.Second) // TODO(rbtr): make this exponential backoff } }() - logger.Printf("initialized NodeNetworkConfig reconciler") - // wait for the Reconciler to run once on a NNC that was made for this Node - if started := nncReconciler.Started(ctx); !started { - return errors.Errorf("context cancelled while waiting for reconciler start") + logger.Printf("Initialized controller-manager.") + for { + logger.Printf("Waiting for NodeNetworkConfig reconciler to start.") + // wait for the Reconciler to run once on a NNC that was made for this Node. + // the nncReadyCtx has a timeout of 15 minutes, after which we will consider + // this false and the NNC Reconciler stuck/failed, log and retry. 
+ nncReadyCtx, _ := context.WithTimeout(ctx, 15*time.Minute) //nolint // it will time out and not leak + if started, err := nncReconciler.Started(nncReadyCtx); !started { + logger.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err) + nncReconcilerStartFailures.Inc() + continue + } + logger.Printf("NodeNetworkConfig reconciler has started.") + break } - logger.Printf("started NodeNetworkConfig reconciler") go func() { - logger.Printf("starting SyncHostNCVersion loop") + logger.Printf("Starting SyncHostNCVersion loop.") // Periodically poll vfp programmed NC version from NMAgent tickerChannel := time.Tick(time.Duration(cnsconfig.SyncHostNCVersionIntervalMs) * time.Millisecond) for { @@ -1269,12 +1276,11 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn httpRestServiceImplementation.SyncHostNCVersion(timedCtx, cnsconfig.ChannelMode) cancel() case <-ctx.Done(): - logger.Printf("exiting SyncHostNCVersion") + logger.Printf("Stopping SyncHostNCVersion loop.") return } } }() - logger.Printf("initialized and started SyncHostNCVersion loop") - + logger.Printf("Initialized SyncHostNCVersion loop.") return nil } diff --git a/cns/service/metrics.go b/cns/service/metrics.go index 2416a858bd..ca4da8a7f4 100644 --- a/cns/service/metrics.go +++ b/cns/service/metrics.go @@ -11,13 +11,25 @@ import ( // failing and retrying. var managerStartFailures = prometheus.NewCounter( prometheus.CounterOpts{ - Name: "manager_start_failures_total", + Name: "cns_ctrlmanager_start_failures_total", Help: "Number of times the controller-runtime manager failed to start.", }, ) +// nncReconcilerStartFailures is a monotonic counter which tracks the number of times the NNC reconciler +// has failed to start within the timeout period. To drive alerting based on this metric, it is +// recommended to use the rate of increase over a period of time. A positive rate of change indicates +// that the CNS is actively failing and retrying. 
+var nncReconcilerStartFailures = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "cns_nnc_reconciler_start_failures_total", + Help: "Number of times the NNC reconciler has failed to start within the timeout period.", + }, +) + func init() { metrics.Registry.MustRegister( managerStartFailures, + nncReconcilerStartFailures, ) }