From 181eea30fe89ff3aa75d7c033062c25fe26cbdd6 Mon Sep 17 00:00:00 2001 From: Daniel Deluiggi Date: Tue, 4 Mar 2025 10:52:15 -0800 Subject: [PATCH 1/4] Add tokenFile and persist on ring features for AM ring Signed-off-by: Daniel Deluiggi --- pkg/alertmanager/alertmanager_ring.go | 23 ++++++++++++++--------- pkg/alertmanager/multitenant.go | 1 + 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go index cdb52b5ae80..90430137b03 100644 --- a/pkg/alertmanager/alertmanager_ring.go +++ b/pkg/alertmanager/alertmanager_ring.go @@ -48,9 +48,11 @@ type RingConfig struct { HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout"` ReplicationFactor int `yaml:"replication_factor"` ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"` + TokensFilePath string `yaml:"tokens_file_path"` - FinalSleep time.Duration `yaml:"final_sleep"` - WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"` + FinalSleep time.Duration `yaml:"final_sleep"` + WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"` + KeepInstanceInTheRingOnShutdown bool `yaml:"keep_instance_in_the_ring_on_shutdown"` // Instance details InstanceID string `yaml:"instance_id" doc:"hidden"` @@ -85,6 +87,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.FinalSleep, rfprefix+"final-sleep", 0*time.Second, "The sleep seconds when alertmanager is shutting down. Need to be close to or larger than KV Store information propagation delay") f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.") f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.") + f.StringVar(&cfg.TokensFilePath, rfprefix+"tokens-file-path", "", "File path where tokens are stored. 
If empty, tokens are not stored at shutdown and restored at startup.") // Instance flags cfg.InstanceInterfaceNames = []string{"eth0", "en0"} @@ -93,6 +96,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { f.IntVar(&cfg.InstancePort, rfprefix+"instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).") f.StringVar(&cfg.InstanceID, rfprefix+"instance-id", hostname, "Instance ID to register in the ring.") f.StringVar(&cfg.InstanceZone, rfprefix+"instance-availability-zone", "", "The availability zone where this instance is running. Required if zone-awareness is enabled.") + f.BoolVar(&cfg.KeepInstanceInTheRingOnShutdown, rfprefix+"keep-instance-in-the-ring-on-shutdown", false, "Keep instance in the ring on shut down.") cfg.RingCheckPeriod = 5 * time.Second @@ -111,13 +115,14 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl instancePort := ring.GetInstancePort(cfg.InstancePort, cfg.ListenPort) return ring.BasicLifecyclerConfig{ - ID: cfg.InstanceID, - Addr: fmt.Sprintf("%s:%d", instanceAddr, instancePort), - HeartbeatPeriod: cfg.HeartbeatPeriod, - TokensObservePeriod: 0, - Zone: cfg.InstanceZone, - NumTokens: RingNumTokens, - FinalSleep: cfg.FinalSleep, + ID: cfg.InstanceID, + Addr: fmt.Sprintf("%s:%d", instanceAddr, instancePort), + HeartbeatPeriod: cfg.HeartbeatPeriod, + TokensObservePeriod: 0, + Zone: cfg.InstanceZone, + NumTokens: RingNumTokens, + FinalSleep: cfg.FinalSleep, + KeepInstanceInTheRingOnShutdown: cfg.KeepInstanceInTheRingOnShutdown, }, nil } diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 0081e9ab78d..47b02d36d1d 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -417,6 +417,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC delegate := ring.BasicLifecyclerDelegate(am) delegate = ring.NewLeaveOnStoppingDelegate(delegate, am.logger) delegate = 
ring.NewAutoForgetDelegate(am.cfg.ShardingRing.HeartbeatTimeout*ringAutoForgetUnhealthyPeriods, delegate, am.logger) + delegate = ring.NewTokensPersistencyDelegate(am.cfg.ShardingRing.TokensFilePath, ring.JOINING, delegate, am.logger) am.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, RingNameForServer, RingKey, ringStore, delegate, am.logger, prometheus.WrapRegistererWithPrefix("cortex_", am.registry)) if err != nil { From 8e8057b04f969a231e1ede805eeae8c66b386339 Mon Sep 17 00:00:00 2001 From: Daniel Deluiggi Date: Tue, 4 Mar 2025 11:05:17 -0800 Subject: [PATCH 2/4] changelog Signed-off-by: Daniel Deluiggi --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae3a06fbd98..a6229de92bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458 * [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 * [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590 +* [FEATURE] AlertManager: Add `keep_instance_in_the_ring_on_shutdown` and `tokens_file_path` configs for alertmanager ring. #6628 * [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605 * [ENHANCEMENT] Update prometheus version to v3.1.0. #6583 * [ENHANCEMENT] Add `compactor.auto-forget-delay` for compactor to auto forget compactors after X minutes without heartbeat. 
#6533 From 8821594ace376cef4356f9d0b8d898ca8b1b4b47 Mon Sep 17 00:00:00 2001 From: Daniel Deluiggi Date: Tue, 4 Mar 2025 11:19:34 -0800 Subject: [PATCH 3/4] docs Signed-off-by: Daniel Deluiggi --- docs/configuration/config-file-reference.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index f517ede562c..0fd956f0288 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -388,6 +388,11 @@ sharding_ring: # CLI flag: -alertmanager.sharding-ring.zone-awareness-enabled [zone_awareness_enabled: | default = false] + # File path where tokens are stored. If empty, tokens are not stored at + # shutdown and restored at startup. + # CLI flag: -alertmanager.sharding-ring.tokens-file-path + [tokens_file_path: | default = ""] + # The sleep seconds when alertmanager is shutting down. Need to be close to or # larger than KV Store information propagation delay # CLI flag: -alertmanager.sharding-ring.final-sleep @@ -397,6 +402,10 @@ sharding_ring: # CLI flag: -alertmanager.sharding-ring.wait-instance-state-timeout [wait_instance_state_timeout: | default = 10m] + # Keep instance in the ring on shut down. + # CLI flag: -alertmanager.sharding-ring.keep-instance-in-the-ring-on-shutdown + [keep_instance_in_the_ring_on_shutdown: | default = false] + # Name of network interface to read address from. 
# CLI flag: -alertmanager.sharding-ring.instance-interface-names [instance_interface_names: | default = [eth0 en0]] From 280700503f74ad09284495dfa4c11188c43b4e75 Mon Sep 17 00:00:00 2001 From: Daniel Deluiggi Date: Wed, 5 Mar 2025 09:29:09 -0800 Subject: [PATCH 4/4] changelog Signed-off-by: Daniel Deluiggi --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6229de92bc..bca910fe024 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,6 @@ * [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458 * [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 * [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590 -* [FEATURE] AlertManager: Add `keep_instance_in_the_ring_on_shutdown` and `tokens_file_path` configs for alertmanager ring. #6628 * [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605 * [ENHANCEMENT] Update prometheus version to v3.1.0. #6583 * [ENHANCEMENT] Add `compactor.auto-forget-delay` for compactor to auto forget compactors after X minutes without heartbeat. #6533 @@ -15,6 +14,7 @@ * [ENHANCEMENT] Alertmanager: Add receiver validations for msteamsv2 and rocketchat. #6606 * [ENHANCEMENT] Query Frontend: Add a `-frontend.enabled-ruler-query-stats` flag to configure whether to report the query stats log for queries coming from the Ruler. 
#6504 * [ENHANCEMENT] OTLP: Support otlp metadata ingestion. #6617 +* [ENHANCEMENT] AlertManager: Add `keep_instance_in_the_ring_on_shutdown` and `tokens_file_path` configs for alertmanager ring. #6628 * [BUGFIX] Ingester: Avoid error or early throttling when READONLY ingesters are present in the ring #6517 * [BUGFIX] Ingester: Fix labelset data race condition. #6573 * [BUGFIX] Compactor: Cleaner should not put deletion marker for blocks with no-compact marker. #6576