From 6455487a2e4440c7bdc011671118822e4f4ed8f1 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 24 Jul 2020 15:49:43 +0100 Subject: [PATCH 1/6] Support Alertmanager HA With this, we can now increase the number of replicas of the Cortex Alertmanager, thus enabling HA. Please note that alerts themselves are not gossiped between Alertmanagers. Each Ruler needs to send every alert to every available Alertmanager, which is why a headless service is created when the number of replicas is greater than 1. --- cortex/alertmanager.libsonnet | 36 ++++++++++++++++++++++++++++++----- cortex/config.libsonnet | 4 ++++ cortex/images.libsonnet | 3 +-- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index e5bb15d8..d19c5436 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -4,15 +4,23 @@ local container = $.core.v1.container, local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, + local isGossiping = $._config.alertmanager.replicas > 1, + local peers = if isGossiping then + [ + 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager_gossip_port] + for i in std.range(0, $._config.alertmanager.replicas - 1) + ] + else [], alertmanager_args:: { target: 'alertmanager', 'log.level': 'debug', + 'experimental.alertmanager.enable-api': 'true', 'alertmanager.storage.type': 'gcs', 'alertmanager.storage.path': '/data', - 'alertmanager.gcs.bucketname': '%(cluster)s-cortex-configdb-%(namespace)s' % $._config, + 'alertmanager.storage.gcs.bucketname': '%(cluster)s-cortex-%(namespace)s' % $._config, 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, }, @@ -27,8 +35,22 @@ alertmanager_container:: if $._config.alertmanager_enabled then container.new('alertmanager', $._images.alertmanager) + - container.withPorts($.util.defaultPorts) + - 
container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + + container.withPorts( + $.util.defaultPorts + + if isGossiping then [ + $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager_gossip_port), + $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager_gossip_port), + ] + else [], + ) + + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + + container.withArgsMixin( + $.util.mapToFlags($.alertmanager_args) + + if isGossiping then + ['--cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager_gossip_port] + + ['--cluster.peer=%s' % peer for peer in peers] + else [], + ) + container.withVolumeMountsMixin([volumeMount.new('alertmanager-data', '/data')]) + $.util.resourcesRequests('100m', '1Gi') + $.util.readinessProbe + @@ -37,7 +59,7 @@ alertmanager_statefulset: if $._config.alertmanager_enabled then - statefulSet.new('alertmanager', 1, [$.alertmanager_container], $.alertmanager_pvc) + + statefulSet.new('alertmanager', $._config.alertmanager.replicas, [$.alertmanager_container], $.alertmanager_pvc) + statefulSet.mixin.spec.withServiceName('alertmanager') + statefulSet.mixin.metadata.withNamespace($._config.namespace) + statefulSet.mixin.metadata.withLabels({ name: 'alertmanager' }) + @@ -50,6 +72,10 @@ alertmanager_service: if $._config.alertmanager_enabled then - $.util.serviceFor($.alertmanager_statefulset) + if $._config.alertmanager.replicas > 1 then + $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.spec.withClusterIp('None') + else + $.util.serviceFor($.alertmanager_statefulset) else {}, } diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index d359f22f..87d38bb9 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -243,6 +243,10 @@ }, }[$._config.ruler_client_type], + alertmanager: { + replicas: 1, + }, + overrides: { // === Per-tenant usage limits. 
=== // diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index a4be104a..91466f8f 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -18,8 +18,7 @@ store_gateway: self.cortex, query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', - // TODO(gouthamve/jtlisi): Upstream the ruler and AM configs. - alertmanager: 'jtlisi/cortex:20190819_alertmanager_update-faa66aa43', + alertmanager: 'quay.io/cortexproject/cortex:master-2b41aa38d', testExporter: 'cortexproject/test-exporter:master-be013707', }, } From d2cb71fd3e6f476d88159010f32608c88c114f43 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 24 Jul 2020 16:04:56 +0100 Subject: [PATCH 2/6] Setup the gossip port --- cortex/alertmanager.libsonnet | 6 +++--- cortex/config.libsonnet | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index d19c5436..23b95de3 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -7,7 +7,7 @@ local isGossiping = $._config.alertmanager.replicas > 1, local peers = if isGossiping then [ - 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager_gossip_port] + 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] for i in std.range(0, $._config.alertmanager.replicas - 1) ] else [], @@ -38,8 +38,8 @@ container.withPorts( $.util.defaultPorts + if isGossiping then [ - $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager_gossip_port), - $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager_gossip_port), + $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager.gossip_port), + $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager.gossip_port), ] else [], ) + diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 87d38bb9..5e720455 100644 --- 
a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -245,6 +245,7 @@ alertmanager: { replicas: 1, + gossip_port: 9094, }, overrides: { From 8c44b356490a998b43dc6cb5bf88dd13052b4573 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 29 Jul 2020 17:59:08 +0100 Subject: [PATCH 3/6] s/isGossiping/isHA --- cortex/alertmanager.libsonnet | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index 23b95de3..61d04f5b 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -4,8 +4,8 @@ local container = $.core.v1.container, local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, - local isGossiping = $._config.alertmanager.replicas > 1, - local peers = if isGossiping then + local isHA = $._config.alertmanager.replicas > 1, + local peers = if isHA then [ 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] for i in std.range(0, $._config.alertmanager.replicas - 1) @@ -37,7 +37,7 @@ container.new('alertmanager', $._images.alertmanager) + container.withPorts( $.util.defaultPorts + - if isGossiping then [ + if isHA then [ $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager.gossip_port), $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager.gossip_port), ] @@ -46,7 +46,7 @@ container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + - if isGossiping then + if isHA then ['--cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager_gossip_port] + ['--cluster.peer=%s' % peer for peer in peers] else [], @@ -72,7 +72,7 @@ alertmanager_service: if $._config.alertmanager_enabled then - if $._config.alertmanager.replicas > 1 then + if isHA then $.util.serviceFor($.alertmanager_statefulset) + service.mixin.spec.withClusterIp('None') else From 
2e9cc55e6fd32001ea820a6ce17b269cf130f0df Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 12:08:48 +0100 Subject: [PATCH 4/6] Bump to 3 replicas by default --- cortex/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 5e720455..35cc73d4 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -244,7 +244,7 @@ }[$._config.ruler_client_type], alertmanager: { - replicas: 1, + replicas: 3, gossip_port: 9094, }, From 69af294c2f58f7ddc75696814924b53ea1c684ba Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 12:10:38 +0100 Subject: [PATCH 5/6] Update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ecdc3fc..1e47ebeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,12 @@ ## master / unreleased +* [BUGFIX] Add support the `local` ruler client type * [CHANGE] The project is now licensed with Apache-2.0 license. #169 * [CHANGE] Add overrides config to tsdb store-gateway. #167 * [CHANGE] Ingesters now default to running as `StatefulSet` with WAL enabled. It is controlled by the config `$._config.ingester_deployment_without_wal` which is `false` by default. Setting the config to `true` will yeild the old behaviour (stateless `Deployment` without WAL enabled). #72 * [CHANGE] We now allow queries that are 32 days long. For example, rate(metric[32d]). Before it was 31d. 
#173 -* [BUGFIX] Add support the `local` ruler client type +* [ENHANCEMENT] Enable support for HA in the Cortex Alertmanager #147 ## 1.3.0 / 2020-08-21 From eeb2c8029ea47d0e80a70f9f5fcffee74d1b7235 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 12:18:42 +0100 Subject: [PATCH 6/6] Bump the cortex image, the latest stable is 1.3 --- cortex/images.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index 91466f8f..14bc4781 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,8 +5,9 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.2.0', + cortex: 'cortexproject/cortex:v1.3.0', + alertmanager: self.cortex, distributor: self.cortex, ingester: self.cortex, querier: self.cortex, @@ -18,7 +19,6 @@ store_gateway: self.cortex, query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', - alertmanager: 'quay.io/cortexproject/cortex:master-2b41aa38d', testExporter: 'cortexproject/test-exporter:master-be013707', }, }