diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bbdac43d65..d8a0a9c04b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ * [CHANGE] Histogram `cortex_memcache_request_duration_seconds` `method` label value changes from `Memcached.Get` to `Memcached.GetBatched` for batched lookups, and is not reported for non-batched lookups (label value `Memcached.GetMulti` remains, and had exactly the same value as `Get` in nonbatched lookups). The same change applies to tracing spans. #3046 * [CHANGE] TLS server validation is now enabled by default, a new parameter `tls_insecure_skip_verify` can be set to true to skip validation optionally. #3030 * [CHANGE] `cortex_ruler_config_update_failures_total` has been removed in favor of `cortex_ruler_config_last_reload_successful`. #3056 +* [CHANGE] `ruler.evaluation_delay_duration` field in YAML config has been moved and renamed to `limits.ruler_evaluation_delay_duration`. #3098 * [CHANGE] Removed obsolete `results_cache.max_freshness` from YAML config (deprecated since Cortex 1.2). #3145 * [CHANGE] Removed obsolete `-promql.lookback-delta` option (deprecated since Cortex 1.2, replaced with `-querier.lookback-delta`). #3144 * [FEATURE] Logging of the source IP passed along by a reverse proxy is now supported by setting the `-server.log-source-ips-enabled`. For non standard headers the settings `-server.log-source-ips-header` and `-server.log-source-ips-regex` can be used. #2985 @@ -31,6 +32,7 @@ * [ENHANCEMENT] Add "integration" as a label for `cortex_alertmanager_notifications_total` and `cortex_alertmanager_notifications_failed_total` metrics. #3056 * [ENHANCEMENT] Add `cortex_ruler_config_last_reload_successful` and `cortex_ruler_config_last_reload_successful_seconds` to check status of users rule manager. 
#3056 * [ENHANCEMENT] Memcached dial() calls now have an optional circuit-breaker to avoid hammering a broken cache #3051 +* [ENHANCEMENT] `-ruler.evaluation-delay-duration` is now overridable as a per-tenant limit, `ruler_evaluation_delay_duration`. #3098 * [ENHANCEMENT] Add TLS support to etcd client. #3102 * [ENHANCEMENT] When a tenant accesses the Alertmanager UI or its API, if we have valid `-alertmanager.configs.fallback` we'll use that to start the manager and avoid failing the request. #3073 * [ENHANCEMENT] Add `DELETE api/v1/rules/{namespace}` to the Ruler. It allows all the rule groups of a namespace to be deleted. #3120 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 4a46d535319..4007c184472 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -846,9 +846,8 @@ ruler_client: # CLI flag: -ruler.evaluation-interval [evaluation_interval: <duration> | default = 1m] -# Duration to delay the evaluation of rules to ensure they underlying metrics -# have been pushed to cortex. -# CLI flag: -ruler.evaluation-delay-duration +# Deprecated. Please use -ruler.evaluation-delay-duration instead. +# CLI flag: -ruler.evaluation-delay-duration-deprecated [evaluation_delay_duration: <duration> | default = 0s] # How frequently to poll for rule changes @@ -2822,6 +2821,11 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s # CLI flag: -frontend.max-cache-freshness [max_cache_freshness: <duration> | default = 1m] +# Duration to delay the evaluation of rules to ensure the underlying metrics +# have been pushed to Cortex. +# CLI flag: -ruler.evaluation-delay-duration +[ruler_evaluation_delay_duration: <duration> | default = 0s] + # The default tenant's shard size when the shuffle-sharding strategy is used. # Must be set when the store-gateway sharding is enabled with the # shuffle-sharding strategy. 
When this setting is specified in the per-tenant diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 29bc5a70ced..604660ad6ea 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -35,6 +35,7 @@ import ( "github.com/cortexproject/cortex/pkg/ruler" "github.com/cortexproject/cortex/pkg/storegateway" "github.com/cortexproject/cortex/pkg/util" + "github.com/cortexproject/cortex/pkg/util/flagext" "github.com/cortexproject/cortex/pkg/util/modules" "github.com/cortexproject/cortex/pkg/util/runtimeconfig" "github.com/cortexproject/cortex/pkg/util/services" @@ -126,6 +127,19 @@ func (t *Cortex) initRing() (serv services.Service, err error) { } func (t *Cortex) initRuntimeConfig() (services.Service, error) { + // We need to modify LimitsConfig before calling SetDefaultLimitsForYAMLUnmarshalling later in this method + // but also if runtime-config is not used, for setting limits used by initOverrides. + // TODO: Remove this in Cortex 1.6. + if t.Cfg.Ruler.EvaluationDelay != 0 && t.Cfg.LimitsConfig.RulerEvaluationDelay == 0 { + t.Cfg.LimitsConfig.RulerEvaluationDelay = t.Cfg.Ruler.EvaluationDelay + + // No need to report if this field isn't going to be used. 
+ if t.Cfg.Target == All || t.Cfg.Target == Ruler { + flagext.DeprecatedFlagsUsed.Inc() + level.Warn(util.Logger).Log("msg", "Using DEPRECATED YAML config field ruler.evaluation_delay_duration, please use limits.ruler_evaluation_delay_duration instead.") + } + } + if t.Cfg.RuntimeConfig.LoadPath == "" { t.Cfg.RuntimeConfig.LoadPath = t.Cfg.LimitsConfig.PerTenantOverrideConfig t.Cfg.RuntimeConfig.ReloadPeriod = t.Cfg.LimitsConfig.PerTenantOverridePeriod @@ -514,7 +528,7 @@ func (t *Cortex) initRuler() (serv services.Service, err error) { rulerRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "ruler"}, prometheus.DefaultRegisterer) queryable, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, t.TombstonesLoader, rulerRegisterer) - managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine) + managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides) manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, prometheus.DefaultRegisterer, util.Logger) if err != nil { return nil, err diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index ff0c424fdd0..494e81b5de0 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -72,12 +72,21 @@ func (t *PusherAppendable) Appender(ctx context.Context) storage.Appender { } } +// RulesLimits is the one method we need from validation.Overrides, and +// is here to limit coupling. +type RulesLimits interface { + EvaluationDelay(userID string) time.Duration +} + // engineQueryFunc returns a new query function using the rules.EngineQueryFunc function // and passing an altered timestamp. 
-func engineQueryFunc(engine *promql.Engine, q storage.Queryable, delay time.Duration) rules.QueryFunc { +func engineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides RulesLimits, userID string) rules.QueryFunc { orig := rules.EngineQueryFunc(engine, q) return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { - return orig(ctx, qs, t.Add(-delay)) + // Delay the evaluation of all rules by a set interval to give a buffer + // to metrics that haven't been forwarded to Cortex yet. + evaluationDelay := overrides.EvaluationDelay(userID) + return orig(ctx, qs, t.Add(-evaluationDelay)) } } @@ -94,6 +103,7 @@ func DefaultTenantManagerFactory( p Pusher, q storage.Queryable, engine *promql.Engine, + overrides RulesLimits, ) ManagerFactory { return func( ctx context.Context, @@ -105,7 +115,7 @@ func DefaultTenantManagerFactory( return rules.NewManager(&rules.ManagerOptions{ Appendable: &PusherAppendable{pusher: p, userID: userID}, Queryable: q, - QueryFunc: engineQueryFunc(engine, q, cfg.EvaluationDelay), + QueryFunc: engineQueryFunc(engine, q, overrides, userID), Context: user.InjectOrgID(ctx, userID), ExternalURL: cfg.ExternalURL.URL, NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 2e68b722897..7adce6bf2d1 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -49,8 +49,8 @@ type Config struct { ClientTLSConfig tls.ClientConfig `yaml:"ruler_client"` // How frequently to evaluate rules by default. EvaluationInterval time.Duration `yaml:"evaluation_interval"` - // Delay the evaluation of all rules by a set interval to give a buffer - // to metric that haven't been forwarded to cortex yet. + // Deprecated. Replaced with pkg/util/validation/Limits.RulerEvaluationDelay field. + // TODO: To be removed in Cortex 1.6. 
EvaluationDelay time.Duration `yaml:"evaluation_delay_duration"` // How frequently to poll for updated rules. PollInterval time.Duration `yaml:"poll_interval"` @@ -110,7 +110,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { cfg.ExternalURL.URL, _ = url.Parse("") // Must be non-nil f.Var(&cfg.ExternalURL, "ruler.external.url", "URL of alerts return path.") f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules") - f.DurationVar(&cfg.EvaluationDelay, "ruler.evaluation-delay-duration", 0, "Duration to delay the evaluation of rules to ensure they underlying metrics have been pushed to cortex.") + f.DurationVar(&cfg.EvaluationDelay, "ruler.evaluation-delay-duration-deprecated", 0, "Deprecated. Please use -ruler.evaluation-delay-duration instead.") f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes") f.StringVar(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "", "Comma-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. 
Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.") diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 046d6c5d5f0..c2abb714c08 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -57,7 +57,13 @@ func defaultRulerConfig(store rules.RuleStore) (Config, func()) { return cfg, cleanup } -func testSetup(t *testing.T, cfg Config) (*promql.Engine, storage.QueryableFunc, Pusher, log.Logger, func()) { +type ruleLimits time.Duration + +func (r ruleLimits) EvaluationDelay(_ string) time.Duration { + return time.Duration(r) +} + +func testSetup(t *testing.T, cfg Config) (*promql.Engine, storage.QueryableFunc, Pusher, log.Logger, RulesLimits, func()) { dir, err := ioutil.TempDir("", t.Name()) testutil.Ok(t, err) cleanup := func() { @@ -83,24 +89,24 @@ func testSetup(t *testing.T, cfg Config) (*promql.Engine, storage.QueryableFunc, l := log.NewLogfmtLogger(os.Stdout) l = level.NewFilter(l, level.AllowInfo()) - return engine, noopQueryable, pusher, l, cleanup + return engine, noopQueryable, pusher, l, ruleLimits(0), cleanup } func newManager(t *testing.T, cfg Config) (*DefaultMultiTenantManager, func()) { - engine, noopQueryable, pusher, logger, cleanup := testSetup(t, cfg) - manager, err := NewDefaultMultiTenantManager(cfg, DefaultTenantManagerFactory(cfg, pusher, noopQueryable, engine), prometheus.NewRegistry(), logger) + engine, noopQueryable, pusher, logger, overrides, cleanup := testSetup(t, cfg) + manager, err := NewDefaultMultiTenantManager(cfg, DefaultTenantManagerFactory(cfg, pusher, noopQueryable, engine, overrides), prometheus.NewRegistry(), logger) require.NoError(t, err) return manager, cleanup } func newRuler(t *testing.T, cfg Config) (*Ruler, func()) { - engine, noopQueryable, pusher, logger, cleanup := testSetup(t, cfg) + engine, noopQueryable, pusher, logger, overrides, cleanup := testSetup(t, cfg) storage, err := NewRuleStorage(cfg.StoreConfig) require.NoError(t, 
err) reg := prometheus.NewRegistry() - managerFactory := DefaultTenantManagerFactory(cfg, pusher, noopQueryable, engine) + managerFactory := DefaultTenantManagerFactory(cfg, pusher, noopQueryable, engine, overrides) manager, err := NewDefaultMultiTenantManager(cfg, managerFactory, reg, util.Logger) require.NoError(t, err) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index e88bb47b2b8..3d4a7ba80fb 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -62,6 +62,9 @@ type Limits struct { CardinalityLimit int `yaml:"cardinality_limit"` MaxCacheFreshness time.Duration `yaml:"max_cache_freshness"` + // Ruler defaults and limits. + RulerEvaluationDelay time.Duration `yaml:"ruler_evaluation_delay_duration"` + // Store-gateway. StoreGatewayTenantShardSize int `yaml:"store_gateway_tenant_shard_size"` @@ -109,6 +112,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.CardinalityLimit, "store.cardinality-limit", 1e5, "Cardinality limit for index queries. This limit is ignored when running the Cortex blocks storage. 0 to disable.") f.DurationVar(&l.MaxCacheFreshness, "frontend.max-cache-freshness", 1*time.Minute, "Most recent allowed cacheable result per-tenant, to prevent caching very recent results that might still be in flux.") + f.DurationVar(&l.RulerEvaluationDelay, "ruler.evaluation-delay-duration", 0, "Duration to delay the evaluation of rules to ensure the underlying metrics have been pushed to Cortex.") + f.StringVar(&l.PerTenantOverrideConfig, "limits.per-user-override-config", "", "File name of per-user overrides. [deprecated, use -runtime-config.file instead]") f.DurationVar(&l.PerTenantOverridePeriod, "limits.per-user-override-period", 10*time.Second, "Period with which to reload the overrides. 
[deprecated, use -runtime-config.reload-period instead]") @@ -346,6 +351,11 @@ func (o *Overrides) SubringSize(userID string) int { return o.getOverridesForUser(userID).SubringSize } +// EvaluationDelay returns the rules evaluation delay for a given user. +func (o *Overrides) EvaluationDelay(userID string) time.Duration { + return o.getOverridesForUser(userID).RulerEvaluationDelay +} + // StoreGatewayTenantShardSize returns the size of the store-gateway shard size for a given user. func (o *Overrides) StoreGatewayTenantShardSize(userID string) int { return o.getOverridesForUser(userID).StoreGatewayTenantShardSize