From feb03c0597b6bdb261dc7a8a60de07bf7de67d61 Mon Sep 17 00:00:00 2001 From: Evan Baker Date: Fri, 31 Mar 2023 11:11:26 -0700 Subject: [PATCH] timeout after 15 minutes waiting for the NNC reconciler to start (#1861) * timeout after 15 minutes waiting for the NNC reconciler to start Signed-off-by: Evan Baker * log error and retry instead of crashing out Signed-off-by: Evan Baker * add metric for nnc reconciler failed to start Signed-off-by: Evan Baker --------- Signed-off-by: Evan Baker --- .../nodenetworkconfig/reconciler.go | 8 ++--- cns/service/main.go | 36 +++++++++++-------- cns/service/metrics.go | 14 +++++++- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/cns/kubecontroller/nodenetworkconfig/reconciler.go b/cns/kubecontroller/nodenetworkconfig/reconciler.go index 546ca2eef1..ae83aa0218 100644 --- a/cns/kubecontroller/nodenetworkconfig/reconciler.go +++ b/cns/kubecontroller/nodenetworkconfig/reconciler.go @@ -134,13 +134,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco // then, and any time that it is called after that, it immediately returns true. // It accepts a cancellable Context and if the context is closed // before Start it will return false. Passing a closed Context after the -// Reconciler is started is indeterminate and the response is psuedorandom. -func (r *Reconciler) Started(ctx context.Context) bool { +// Reconciler is started is indeterminate. 
+func (r *Reconciler) Started(ctx context.Context) (bool, error) { select { case <-r.started: - return true + return true, nil case <-ctx.Done(): - return false + return false, errors.Wrap(ctx.Err(), "context closed") } } diff --git a/cns/service/main.go b/cns/service/main.go index 3ec59008c2..7ba10e4ce2 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -1235,31 +1235,38 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn // The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the // Monitor's internal loop. go func() { - logger.Printf("Starting NodeNetworkConfig reconciler.") + logger.Printf("Starting controller-manager.") for { if err := manager.Start(ctx); err != nil { - logger.Errorf("[Azure CNS] Failed to start request controller: %v", err) + logger.Errorf("Failed to start controller-manager: %v", err) // retry to start the request controller // inc the managerStartFailures metric for failure tracking managerStartFailures.Inc() } else { - logger.Printf("exiting NodeNetworkConfig reconciler") + logger.Printf("Stopped controller-manager.") return } - - // Retry after 1sec - time.Sleep(time.Second) + time.Sleep(time.Second) // TODO(rbtr): make this exponential backoff } }() - logger.Printf("initialized NodeNetworkConfig reconciler") - // wait for the Reconciler to run once on a NNC that was made for this Node - if started := nncReconciler.Started(ctx); !started { - return errors.Errorf("context cancelled while waiting for reconciler start") + logger.Printf("Initialized controller-manager.") + for { + logger.Printf("Waiting for NodeNetworkConfig reconciler to start.") + // wait for the Reconciler to run once on a NNC that was made for this Node. + // the nncReadyCtx has a timeout of 15 minutes, after which we will consider + // this false and the NNC Reconciler stuck/failed, log and retry. 
+ nncReadyCtx, _ := context.WithTimeout(ctx, 15*time.Minute) //nolint // it will time out and not leak + if started, err := nncReconciler.Started(nncReadyCtx); !started { + logger.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err) + nncReconcilerStartFailures.Inc() + continue + } + logger.Printf("NodeNetworkConfig reconciler has started.") + break } - logger.Printf("started NodeNetworkConfig reconciler") go func() { - logger.Printf("starting SyncHostNCVersion loop") + logger.Printf("Starting SyncHostNCVersion loop.") // Periodically poll vfp programmed NC version from NMAgent tickerChannel := time.Tick(time.Duration(cnsconfig.SyncHostNCVersionIntervalMs) * time.Millisecond) for { @@ -1269,12 +1276,11 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn httpRestServiceImplementation.SyncHostNCVersion(timedCtx, cnsconfig.ChannelMode) cancel() case <-ctx.Done(): - logger.Printf("exiting SyncHostNCVersion") + logger.Printf("Stopping SyncHostNCVersion loop.") return } } }() - logger.Printf("initialized and started SyncHostNCVersion loop") - + logger.Printf("Initialized SyncHostNCVersion loop.") return nil } diff --git a/cns/service/metrics.go b/cns/service/metrics.go index 2416a858bd..ca4da8a7f4 100644 --- a/cns/service/metrics.go +++ b/cns/service/metrics.go @@ -11,13 +11,25 @@ import ( // failing and retrying. var managerStartFailures = prometheus.NewCounter( prometheus.CounterOpts{ - Name: "manager_start_failures_total", + Name: "cns_ctrlmanager_start_failures_total", Help: "Number of times the controller-runtime manager failed to start.", }, ) +// nncReconcilerStartFailures is a monotonic counter which tracks the number of times the NNC reconciler +// has failed to start within the timeout period. To drive alerting based on this metric, it is +// recommended to use the rate of increase over a period of time. A positive rate of change indicates +// that the CNS is actively failing and retrying. 
+var nncReconcilerStartFailures = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "cns_nnc_reconciler_start_failures_total", + Help: "Number of times the NNC reconciler has failed to start within the timeout period.", + }, +) + func init() { metrics.Registry.MustRegister( managerStartFailures, + nncReconcilerStartFailures, ) }