From dc680b7db5d8b026650c4607659820778fedcc85 Mon Sep 17 00:00:00 2001 From: Stephen Greene Date: Thu, 6 Aug 2020 10:15:04 -0400 Subject: [PATCH 1/2] Add new router failed reload counter metric If a router reload fails after a router pod becomes ready, we need a way to alert cluster admins that newly created route resources are not being applied to the cluster. --- pkg/router/template/router.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pkg/router/template/router.go b/pkg/router/template/router.go index 0ccd22fd1..85f9539ab 100644 --- a/pkg/router/template/router.go +++ b/pkg/router/template/router.go @@ -98,6 +98,8 @@ type templateRouter struct { stateChanged bool // metricReload tracks reloads metricReload prometheus.Summary + // metricReloadFails tracks reload failures + metricReloadFails prometheus.Counter // metricWriteConfig tracks writing config metricWriteConfig prometheus.Summary // dynamicConfigManager configures route changes dynamically on the @@ -178,6 +180,12 @@ func newTemplateRouter(cfg templateRouterCfg) (*templateRouter, error) { Help: "Measures the time spent reloading the router in seconds.", }) prometheus.MustRegister(metricsReload) + metricReloadFails := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "template_router", + Name: "reload_fails", + Help: "Tracks the number of failed router reloads", + }) + prometheus.MustRegister(metricReloadFails) metricWriteConfig := prometheus.NewSummary(prometheus.SummaryOpts{ Namespace: "template_router", Name: "write_config_seconds", @@ -207,6 +215,7 @@ func newTemplateRouter(cfg templateRouterCfg) (*templateRouter, error) { dynamicConfigManager: cfg.dynamicConfigManager, metricReload: metricsReload, + metricReloadFails: metricReloadFails, metricWriteConfig: metricWriteConfig, rateLimitedCommitFunction: nil, @@ -434,6 +443,8 @@ func (r *templateRouter) commitAndReload() error { if r.dynamicConfigManager != nil { r.dynamicConfigManager.Notify(RouterEventReloadError) } + // 
Increment the failed reload counter when a reload fails + r.metricReloadFails.Inc() return err } From ad0336b1d1c300aabd2b6941b7f39998a4147a24 Mon Sep 17 00:00:00 2001 From: Stephen Greene Date: Thu, 6 Aug 2020 10:15:24 -0400 Subject: [PATCH 2/2] Remove premature router reload call at startup The first call to `commitAndReload` bypasses the rate-limited reload logic that also takes into account route sync status. Removing this initial router reload call will prevent the router from starting in a "routeless" state: that is, a state where the router is running before it has begun watching route resources. --- pkg/router/template/router.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/router/template/router.go b/pkg/router/template/router.go index 85f9539ab..a65a57f21 100644 --- a/pkg/router/template/router.go +++ b/pkg/router/template/router.go @@ -230,10 +230,7 @@ func newTemplateRouter(cfg templateRouterCfg) (*templateRouter, error) { log.V(0).Info("initializing dynamic config manager ... ") router.dynamicConfigManager.Initialize(router, router.defaultCertificatePath) } - log.V(4).Info("committing state") - // Bypass the rate limiter to ensure the first sync will be - // committed without delay. - router.commitAndReload() + return router, nil }