diff --git a/CHANGELOG.md b/CHANGELOG.md index 63939d5e85c..8436f70ff8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ * `cortex_alertmanager_state_persist_failed_total` * [ENHANCEMENT] Blocks storage: support ingesting exemplars. Enabled by setting new CLI flag `-blocks-storage.tsdb.max-exemplars=` or config option `blocks_storage.tsdb.max_exemplars` to positive value. #4124 * [ENHANCEMENT] Distributor: Added distributors ring status section in the admin page. #4151 +* [ENHANCEMENT] Added zone-awareness support to alertmanager for use when sharding is enabled. When zone-awareness is enabled, alerts will be replicated across availability zones. #4204 * [ENHANCEMENT] Added `tenant_ids` tag to tracing spans #4147 * [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128 * [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 28b2222b419..f45f8c722f7 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -1903,10 +1903,20 @@ sharding_ring: # CLI flag: -alertmanager.sharding-ring.replication-factor [replication_factor: <int> | default = 3] + # True to enable zone-awareness and replicate alerts across different + # availability zones. + # CLI flag: -alertmanager.sharding-ring.zone-awareness-enabled + [zone_awareness_enabled: <boolean> | default = false] + # Name of network interface to read address from. # CLI flag: -alertmanager.sharding-ring.instance-interface-names [instance_interface_names: <list of string> | default = [eth0 en0]] + # The availability zone where this instance is running. Required if + # zone-awareness is enabled.
+ # CLI flag: -alertmanager.sharding-ring.instance-availability-zone + [instance_availability_zone: <string> | default = ""] + # Filename of fallback config to use if none specified for instance. # CLI flag: -alertmanager.configs.fallback [fallback_config_file: <string> | default = ""] diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go index 5cdef80fdde..04d08d2d656 100644 --- a/pkg/alertmanager/alertmanager_ring.go +++ b/pkg/alertmanager/alertmanager_ring.go @@ -42,16 +42,18 @@ var SyncRingOp = ring.NewOp([]ring.InstanceState{ring.ACTIVE, ring.JOINING}, fun // is used to strip down the config to the minimum, and avoid confusion // to the user. type RingConfig struct { - KVStore kv.Config `yaml:"kvstore" doc:"description=The key-value store used to share the hash ring across multiple instances."` - HeartbeatPeriod time.Duration `yaml:"heartbeat_period"` - HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout"` - ReplicationFactor int `yaml:"replication_factor"` + KVStore kv.Config `yaml:"kvstore" doc:"description=The key-value store used to share the hash ring across multiple instances."` + HeartbeatPeriod time.Duration `yaml:"heartbeat_period"` + HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout"` + ReplicationFactor int `yaml:"replication_factor"` + ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"` // Instance details InstanceID string `yaml:"instance_id" doc:"hidden"` InstanceInterfaceNames []string `yaml:"instance_interface_names"` InstancePort int `yaml:"instance_port" doc:"hidden"` InstanceAddr string `yaml:"instance_addr" doc:"hidden"` + InstanceZone string `yaml:"instance_availability_zone"` // Injected internally ListenPort int `yaml:"-"` @@ -77,6 +79,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.") f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The
heartbeat timeout after which alertmanagers are considered unhealthy within the ring.") f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.") + f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.") // Instance flags cfg.InstanceInterfaceNames = []string{"eth0", "en0"} @@ -84,6 +87,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { f.StringVar(&cfg.InstanceAddr, rfprefix+"instance-addr", "", "IP address to advertise in the ring.") f.IntVar(&cfg.InstancePort, rfprefix+"instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).") f.StringVar(&cfg.InstanceID, rfprefix+"instance-id", hostname, "Instance ID to register in the ring.") + f.StringVar(&cfg.InstanceZone, rfprefix+"instance-availability-zone", "", "The availability zone where this instance is running. Required if zone-awareness is enabled.") cfg.RingCheckPeriod = 5 * time.Second } @@ -103,6 +107,7 @@ func (cfg *RingConfig) ToLifecyclerConfig() (ring.BasicLifecyclerConfig, error) Addr: fmt.Sprintf("%s:%d", instanceAddr, instancePort), HeartbeatPeriod: cfg.HeartbeatPeriod, TokensObservePeriod: 0, + Zone: cfg.InstanceZone, NumTokens: RingNumTokens, }, nil } @@ -114,6 +119,7 @@ func (cfg *RingConfig) ToRingConfig() ring.Config { rc.KVStore = cfg.KVStore rc.HeartbeatTimeout = cfg.HeartbeatTimeout rc.ReplicationFactor = cfg.ReplicationFactor + rc.ZoneAwarenessEnabled = cfg.ZoneAwarenessEnabled return rc } diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index ef4632b18f0..c0bf32ba54d 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -86,9 +86,10 @@ const ( var ( statusTemplate *template.Template - errInvalidExternalURL = errors.New("the configured external URL is invalid: should not end with /") - errShardingLegacyStorage 
= errors.New("deprecated -alertmanager.storage.* not supported with -alertmanager.sharding-enabled, use -alertmanager-storage.*") - errShardingUnsupportedStorage = errors.New("the configured alertmanager storage backend is not supported when sharding is enabled") + errInvalidExternalURL = errors.New("the configured external URL is invalid: should not end with /") + errShardingLegacyStorage = errors.New("deprecated -alertmanager.storage.* not supported with -alertmanager.sharding-enabled, use -alertmanager-storage.*") + errShardingUnsupportedStorage = errors.New("the configured alertmanager storage backend is not supported when sharding is enabled") + errZoneAwarenessEnabledWithoutZoneInfo = errors.New("the configured alertmanager has zone awareness enabled but zone is not set") ) func init() { @@ -197,6 +198,9 @@ func (cfg *MultitenantAlertmanagerConfig) Validate(storageCfg alertstore.Config) if !storageCfg.IsFullStateSupported() { return errShardingUnsupportedStorage } + if cfg.ShardingRing.ZoneAwarenessEnabled && cfg.ShardingRing.InstanceZone == "" { + return errZoneAwarenessEnabledWithoutZoneInfo + } } return nil diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 48c311a29dc..bfb326749b4 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -153,6 +153,13 @@ func TestMultitenantAlertmanagerConfig_Validate(t *testing.T) { }, expected: errShardingLegacyStorage, }, + "should fail if zone aware is enabled but zone is not set": { + setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { + cfg.ShardingEnabled = true + cfg.ShardingRing.ZoneAwarenessEnabled = true + }, + expected: errZoneAwarenessEnabledWithoutZoneInfo, + }, } for testName, testData := range tests { @@ -601,6 +608,78 @@ func TestMultitenantAlertmanager_deleteUnusedLocalUserState(t *testing.T) { require.NotZero(t, dirs[user2]) // has config, files survived } +func 
TestMultitenantAlertmanager_zoneAwareSharding(t *testing.T) { + ctx := context.Background() + alertStore := prepareInMemoryAlertStore() + ringStore := consul.NewInMemoryClient(ring.GetCodec()) + const ( + user1 = "user1" + user2 = "user2" + user3 = "user3" + ) + + createInstance := func(i int, zone string, registries *util.UserRegistries) *MultitenantAlertmanager { + reg := prometheus.NewPedanticRegistry() + cfg := mockAlertmanagerConfig(t) + instanceID := fmt.Sprintf("instance-%d", i) + registries.AddUserRegistry(instanceID, reg) + + cfg.ShardingRing.ReplicationFactor = 2 + cfg.ShardingRing.InstanceID = instanceID + cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.1-%d", i) + cfg.ShardingEnabled = true + cfg.ShardingRing.ZoneAwarenessEnabled = true + cfg.ShardingRing.InstanceZone = zone + + am, err := createMultitenantAlertmanager(cfg, nil, nil, alertStore, ringStore, nil, log.NewLogfmtLogger(os.Stdout), reg) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, services.StopAndAwaitTerminated(ctx, am)) + }) + require.NoError(t, services.StartAndAwaitRunning(ctx, am)) + + return am + } + + registriesZoneA := util.NewUserRegistries() + registriesZoneB := util.NewUserRegistries() + + am1ZoneA := createInstance(1, "zoneA", registriesZoneA) + am2ZoneA := createInstance(2, "zoneA", registriesZoneA) + am1ZoneB := createInstance(3, "zoneB", registriesZoneB) + + { + require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ + User: user1, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + })) + require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ + User: user2, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + })) + require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ + User: user3, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + })) + + err := am1ZoneA.loadAndSyncConfigs(context.Background(), reasonPeriodic) + 
require.NoError(t, err) + err = am2ZoneA.loadAndSyncConfigs(context.Background(), reasonPeriodic) + require.NoError(t, err) + err = am1ZoneB.loadAndSyncConfigs(context.Background(), reasonPeriodic) + require.NoError(t, err) + } + + metricsZoneA := registriesZoneA.BuildMetricFamiliesPerUser() + metricsZoneB := registriesZoneB.BuildMetricFamiliesPerUser() + + assert.Equal(t, float64(3), metricsZoneA.GetSumOfGauges("cortex_alertmanager_tenants_owned")) + assert.Equal(t, float64(3), metricsZoneB.GetSumOfGauges("cortex_alertmanager_tenants_owned")) +} + func TestMultitenantAlertmanager_deleteUnusedRemoteUserState(t *testing.T) { ctx := context.Background()