Skip to content

Commit

Permalink
timeout after 15 minutes waiting for the NNC reconciler to start (#1861)
Browse files Browse the repository at this point in the history
* timeout after 15 minutes waiting for the NNC reconciler to start

Signed-off-by: Evan Baker <[email protected]>

* log error and retry instead of crashing out

Signed-off-by: Evan Baker <[email protected]>

* add metric for nnc reconciler failed to start

Signed-off-by: Evan Baker <[email protected]>

---------

Signed-off-by: Evan Baker <[email protected]>
  • Loading branch information
rbtr authored Sep 8, 2023
1 parent 5749688 commit feb03c0
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 20 deletions.
8 changes: 4 additions & 4 deletions cns/kubecontroller/nodenetworkconfig/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
// then, and any time that it is called after that, it immediately returns true.
// It accepts a cancellable Context and if the context is closed
// before Start it will return false. Passing a closed Context after the
// Reconciler is started is indeterminate and the response is pseudorandom.
func (r *Reconciler) Started(ctx context.Context) bool {
// Reconciler is started is indeterminate.
func (r *Reconciler) Started(ctx context.Context) (bool, error) {
select {
case <-r.started:
return true
return true, nil
case <-ctx.Done():
return false
return false, errors.Wrap(ctx.Err(), "context closed")
}
}

Expand Down
36 changes: 21 additions & 15 deletions cns/service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1235,31 +1235,38 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
// The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
// Monitor's internal loop.
go func() {
logger.Printf("Starting NodeNetworkConfig reconciler.")
logger.Printf("Starting controller-manager.")
for {
if err := manager.Start(ctx); err != nil {
logger.Errorf("[Azure CNS] Failed to start request controller: %v", err)
logger.Errorf("Failed to start controller-manager: %v", err)
// retry to start the request controller
// inc the managerStartFailures metric for failure tracking
managerStartFailures.Inc()
} else {
logger.Printf("exiting NodeNetworkConfig reconciler")
logger.Printf("Stopped controller-manager.")
return
}

// Retry after 1sec
time.Sleep(time.Second)
time.Sleep(time.Second) // TODO(rbtr): make this exponential backoff
}
}()
logger.Printf("initialized NodeNetworkConfig reconciler")
// wait for the Reconciler to run once on a NNC that was made for this Node
if started := nncReconciler.Started(ctx); !started {
return errors.Errorf("context cancelled while waiting for reconciler start")
logger.Printf("Initialized controller-manager.")
for {
logger.Printf("Waiting for NodeNetworkConfig reconciler to start.")
// wait for the Reconciler to run once on a NNC that was made for this Node.
// the nncReadyCtx has a timeout of 15 minutes, after which we will consider
// this false and the NNC Reconciler stuck/failed, log and retry.
nncReadyCtx, _ := context.WithTimeout(ctx, 15*time.Minute) //nolint // it will time out and not leak
if started, err := nncReconciler.Started(nncReadyCtx); !started {
log.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err)
nncReconcilerStartFailures.Inc()
continue
}
logger.Printf("NodeNetworkConfig reconciler has started.")
break
}
logger.Printf("started NodeNetworkConfig reconciler")

go func() {
logger.Printf("starting SyncHostNCVersion loop")
logger.Printf("Starting SyncHostNCVersion loop.")
// Periodically poll vfp programmed NC version from NMAgent
tickerChannel := time.Tick(time.Duration(cnsconfig.SyncHostNCVersionIntervalMs) * time.Millisecond)
for {
Expand All @@ -1269,12 +1276,11 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
httpRestServiceImplementation.SyncHostNCVersion(timedCtx, cnsconfig.ChannelMode)
cancel()
case <-ctx.Done():
logger.Printf("exiting SyncHostNCVersion")
logger.Printf("Stopping SyncHostNCVersion loop.")
return
}
}
}()
logger.Printf("initialized and started SyncHostNCVersion loop")

logger.Printf("Initialized SyncHostNCVersion loop.")
return nil
}
14 changes: 13 additions & 1 deletion cns/service/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,25 @@ import (
// failing and retrying.
var managerStartFailures = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "manager_start_failures_total",
Name: "cns_ctrlmanager_start_failures_total",
Help: "Number of times the controller-runtime manager failed to start.",
},
)

// nncReconcilerStartFailures is a monotonic counter of the number of times the
// NNC reconciler has failed to start within the timeout period. When alerting
// on this metric, prefer its rate of increase over a time window: a positive
// rate of change means that CNS is actively failing and retrying.
var nncReconcilerStartFailures = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "cns_nnc_reconciler_start_failures_total",
	Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
})

// init registers the start-failure counters declared in this file with
// metrics.Registry at package initialization.
func init() {
	metrics.Registry.MustRegister(managerStartFailures, nncReconcilerStartFailures)
}

0 comments on commit feb03c0

Please sign in to comment.