Skip to content

Commit 80add1d

Browse files
disable rule groups
Signed-off-by: Anand Rajagopal <[email protected]>
1 parent 29c40f7 commit 80add1d

File tree

7 files changed

+347
-15
lines changed

7 files changed

+347
-15
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Changelog
22

33
## master / unreleased
4+
* [FEATURE] Ruler: Add support for disabling rule groups. #5521
45
* [FEATURE] Ruler: Add support for Limit field on RuleGroup. #5528
56
* [FEATURE] AlertManager: Add support for Webex, Discord and Telegram Receiver. #5493
67
* [FEATURE] Ingester: added `-admin-limit-message` to customize the message contained in limit errors. #5460

docs/configuration/config-file-reference.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3093,6 +3093,9 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
30933093
# alerts will fail with a log message and metric increment. 0 = no limit.
30943094
# CLI flag: -alertmanager.max-alerts-size-bytes
30953095
[alertmanager_max_alerts_size_bytes: <int> | default = 0]
3096+
3097+
# List of rule groups to disable.
3098+
[disabled_rule_groups: <list of rule groups to disable> | default = []]
30963099
```
30973100
30983101
### `memberlist_config`

pkg/ruler/compat.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"errors"
66
"time"
77

8+
"github.com/cortexproject/cortex/pkg/util/validation"
9+
810
"github.com/go-kit/log"
911
"github.com/go-kit/log/level"
1012
"github.com/prometheus/client_golang/prometheus"
@@ -142,6 +144,7 @@ type RulesLimits interface {
142144
RulerTenantShardSize(userID string) int
143145
RulerMaxRuleGroupsPerTenant(userID string) int
144146
RulerMaxRulesPerRuleGroup(userID string) int
147+
DisabledRuleGroups(userID string) validation.DisabledRuleGroups
145148
}
146149

147150
// EngineQueryFunc returns a new engine query function by passing an altered timestamp.

pkg/ruler/ruler.go

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ const (
7171
recordingRuleFilter string = "record"
7272
)
7373

74+
// DisabledRuleGroupErr is the error type used to signal that a rule group
// has been disabled. It carries nothing but a human-readable message.
type DisabledRuleGroupErr struct {
	// Message is the text returned verbatim by Error.
	Message string
}

// Compile-time check that *DisabledRuleGroupErr satisfies the error interface.
var _ error = (*DisabledRuleGroupErr)(nil)

// Error implements the error interface and returns the stored Message.
func (e *DisabledRuleGroupErr) Error() string {
	return e.Message
}
81+
7482
// Config is the configuration for the recording rules server.
7583
type Config struct {
7684
// This is used for template expansion in alerts; must be a valid URL.
@@ -400,6 +408,17 @@ func SendAlerts(n sender, externalURL string) promRules.NotifyFunc {
400408
}
401409
}
402410

411+
func ruleGroupDisabled(ruleGroup *rulespb.RuleGroupDesc, disabledRuleGroupsForUser validation.DisabledRuleGroups) bool {
412+
for _, disabledRuleGroupForUser := range disabledRuleGroupsForUser {
413+
if ruleGroup.Namespace == disabledRuleGroupForUser.Namespace &&
414+
ruleGroup.Name == disabledRuleGroupForUser.Name &&
415+
ruleGroup.User == disabledRuleGroupForUser.User {
416+
return true
417+
}
418+
}
419+
return false
420+
}
421+
403422
var sep = []byte("/")
404423

405424
func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 {
@@ -415,7 +434,10 @@ func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 {
415434
return ringHasher.Sum32()
416435
}
417436

418-
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, instanceAddr string) (bool, error) {
437+
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, instanceAddr string) (bool, error) {
438+
if ruleGroupDisabled(g, disabledRuleGroups) {
439+
return false, &DisabledRuleGroupErr{Message: fmt.Sprintf("rule group %s, namespace %s, user %s is disabled", g.Name, g.Namespace, g.User)}
440+
}
419441
hash := tokenForGroup(g)
420442

421443
rlrs, err := r.Get(hash, RingOp, nil, nil, nil)
@@ -533,7 +555,26 @@ func (r *Ruler) listRules(ctx context.Context) (result map[string]rulespb.RuleGr
533555
}
534556

535557
func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
536-
return r.store.ListAllRuleGroups(ctx)
558+
allRuleGroups, err := r.store.ListAllRuleGroups(ctx)
559+
if err != nil {
560+
return nil, err
561+
}
562+
for userID, groups := range allRuleGroups {
563+
disabledRuleGroupsForUser := r.limits.DisabledRuleGroups(userID)
564+
if len(disabledRuleGroupsForUser) == 0 {
565+
continue
566+
}
567+
filteredGroupsForUser := rulespb.RuleGroupList{}
568+
for _, group := range groups {
569+
if !ruleGroupDisabled(group, disabledRuleGroupsForUser) {
570+
filteredGroupsForUser = append(filteredGroupsForUser, group)
571+
} else {
572+
level.Info(r.logger).Log("msg", "rule group disabled", "rule group name", group.Name, "namespace", group.Namespace, "user", group.User)
573+
}
574+
}
575+
allRuleGroups[userID] = filteredGroupsForUser
576+
}
577+
return allRuleGroups, nil
537578
}
538579

539580
func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
@@ -544,7 +585,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
544585

545586
filteredConfigs := make(map[string]rulespb.RuleGroupList)
546587
for userID, groups := range configs {
547-
filtered := filterRuleGroups(userID, groups, r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
588+
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
548589
if len(filtered) > 0 {
549590
filteredConfigs[userID] = filtered
550591
}
@@ -602,7 +643,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
602643
return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID)
603644
}
604645

605-
filtered := filterRuleGroups(userID, groups, userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
646+
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
606647
if len(filtered) == 0 {
607648
continue
608649
}
@@ -624,15 +665,21 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
624665
//
625666
// Reason why this function is not a method on Ruler is to make sure we don't accidentally use r.ring,
626667
// but only ring passed as parameter.
627-
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc {
668+
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc {
628669
// Prune the rule group to only contain rules that this ruler is responsible for, based on ring.
629670
var result []*rulespb.RuleGroupDesc
630671
for _, g := range ruleGroups {
631-
owned, err := instanceOwnsRuleGroup(ring, g, instanceAddr)
672+
owned, err := instanceOwnsRuleGroup(ring, g, disabledRuleGroups, instanceAddr)
632673
if err != nil {
633-
ringCheckErrors.Inc()
634-
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err)
635-
continue
674+
switch e := err.(type) {
675+
case *DisabledRuleGroupErr:
676+
level.Info(log).Log("msg", e.Message)
677+
continue
678+
default:
679+
ringCheckErrors.Inc()
680+
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err)
681+
continue
682+
}
636683
}
637684

638685
if owned {

0 commit comments

Comments
 (0)