From c559ee736c7b6e090dd856874c05241deb93df7c Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 27 Jul 2022 16:58:51 -0700 Subject: [PATCH] pkg/clusterconditions/promql: Cap PromQL queries at 5 minutes In some clusters, these PromQL queries can hang for hours, possibly forever [1]. I think we have a 30s default KeepAlive timeout [2], but apparently there's enough socket traffic to keep that timeout from tripping. This adds a 5m cap to the PromQL calls, although I'm not attached to that specific number. We can always raise it if we start seeing timeouts in Insights for queries where taking that long seems reasonable. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=2109374#c12 [2]: https://pkg.go.dev/github.com/prometheus/client_golang/api#pkg-variables --- pkg/clusterconditions/promql/promql.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/clusterconditions/promql/promql.go b/pkg/clusterconditions/promql/promql.go index b4c949d7f7..5b84069995 100644 --- a/pkg/clusterconditions/promql/promql.go +++ b/pkg/clusterconditions/promql/promql.go @@ -27,6 +27,9 @@ type PromQL struct { // HTTPClientConfig holds the client configuration for connecting to the Prometheus service. HTTPClientConfig config.HTTPClientConfig + + // QueryTimeout limits the amount of time we wait before giving up on the Prometheus query. 
+ QueryTimeout time.Duration } var promql = &cache.Cache{ @@ -41,6 +44,7 @@ var promql = &cache.Cache{ CAFile: "/etc/tls/service-ca/service-ca.crt", }, }, + QueryTimeout: 5 * time.Minute, }, MinBetweenMatches: 10 * time.Minute, MinForCondition: time.Hour, @@ -79,8 +83,16 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition } v1api := prometheusv1.NewAPI(client) + + queryContext := ctx + if p.QueryTimeout > 0 { + var cancel context.CancelFunc + queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout) + defer cancel() + } + klog.V(2).Infof("evaluate %s cluster condition: %q", condition.Type, condition.PromQL.PromQL) - result, warnings, err := v1api.Query(ctx, condition.PromQL.PromQL, time.Now()) + result, warnings, err := v1api.Query(queryContext, condition.PromQL.PromQL, time.Now()) if err != nil { return false, fmt.Errorf("executing PromQL query: %w", err) }