From 1c30bb5267719a22e234a30ead05e04b3bd38965 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Sat, 16 Nov 2019 13:25:58 +0000 Subject: [PATCH 001/192] Initial commit just move everything over Signed-off-by: Goutham Veeramachaneni --- operations/mimir/alertmanager.libsonnet | 32 +++ operations/mimir/common.libsonnet | 14 + operations/mimir/config.libsonnet | 251 ++++++++++++++++++ operations/mimir/consul.libsonnet | 59 ++++ .../mimir/cortex-manifests.jsonnet.example | 26 ++ operations/mimir/cortex.libsonnet | 18 ++ operations/mimir/distributor.libsonnet | 52 ++++ operations/mimir/etcd.libsonnet | 9 + operations/mimir/images.libsonnet | 21 ++ operations/mimir/ingester.libsonnet | 69 +++++ operations/mimir/jsonnetfile.json | 44 +++ operations/mimir/jsonnetfile.lock.json | 48 ++++ operations/mimir/memcached.libsonnet | 63 +++++ operations/mimir/postgresql.libsonnet | 29 ++ operations/mimir/querier.libsonnet | 52 ++++ operations/mimir/query-frontend.libsonnet | 60 +++++ operations/mimir/ruler.libsonnet | 53 ++++ operations/mimir/table-manager.libsonnet | 53 ++++ operations/mimir/test-exporter.libsonnet | 40 +++ 19 files changed, 993 insertions(+) create mode 100644 operations/mimir/alertmanager.libsonnet create mode 100644 operations/mimir/common.libsonnet create mode 100644 operations/mimir/config.libsonnet create mode 100644 operations/mimir/consul.libsonnet create mode 100644 operations/mimir/cortex-manifests.jsonnet.example create mode 100644 operations/mimir/cortex.libsonnet create mode 100644 operations/mimir/distributor.libsonnet create mode 100644 operations/mimir/etcd.libsonnet create mode 100644 operations/mimir/images.libsonnet create mode 100644 operations/mimir/ingester.libsonnet create mode 100644 operations/mimir/jsonnetfile.json create mode 100644 operations/mimir/jsonnetfile.lock.json create mode 100644 operations/mimir/memcached.libsonnet create mode 100644 operations/mimir/postgresql.libsonnet create mode 100644 
operations/mimir/querier.libsonnet create mode 100644 operations/mimir/query-frontend.libsonnet create mode 100644 operations/mimir/ruler.libsonnet create mode 100644 operations/mimir/table-manager.libsonnet create mode 100644 operations/mimir/test-exporter.libsonnet diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet new file mode 100644 index 00000000000..168f6be28ac --- /dev/null +++ b/operations/mimir/alertmanager.libsonnet @@ -0,0 +1,32 @@ +{ + local container = $.core.v1.container, + + alertmanager_args:: + { + target: 'alertmanager', + 'log.level': 'debug', + + 'alertmanager.storage.type': 'gcs', + 'alertmanager.gcs.bucketname': '%(cluster)s-cortex-configdb-%(namespace)s' % $._config, + 'alertmanager.web.external-url': 'http://alertmanager.%s.svc.cluster.local/alertmanager' % $._config.namespace, + }, + + alertmanager_container:: + container.new('alertmanager', $._images.alertmanager) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + + $.util.resourcesRequests('100m', '1Gi') + + $.jaeger_mixin, + + local deployment = $.apps.v1beta1.deployment, + + alertmanager_deployment: + deployment.new('alertmanager', 1, [$.alertmanager_container]) + + deployment.mixin.spec.template.spec.withRestartPolicy('Always') + + $.util.antiAffinity, + + local service = $.core.v1.service, + + alertmanager_server: + $.util.serviceFor($.alertmanager_deployment), +} diff --git a/operations/mimir/common.libsonnet b/operations/mimir/common.libsonnet new file mode 100644 index 00000000000..62a5a338c49 --- /dev/null +++ b/operations/mimir/common.libsonnet @@ -0,0 +1,14 @@ +{ + namespace: + $.core.v1.namespace.new($._config.namespace), + + util+:: { + local containerPort = $.core.v1.containerPort, + + defaultPorts:: + [ + containerPort.newNamed('http-metrics', 80), + containerPort.newNamed('grpc', 9095), + ], + }, +} diff --git a/operations/mimir/config.libsonnet 
b/operations/mimir/config.libsonnet new file mode 100644 index 00000000000..17350551a30 --- /dev/null +++ b/operations/mimir/config.libsonnet @@ -0,0 +1,251 @@ +{ + _config+: { + namespace: error 'must define namespace', + cluster: error 'must define cluster', + replication_factor: 3, + + storage_backend: error 'must specify storage backend (cassandra, gcp)', + table_prefix: $._config.namespace, + cassandra_addresses: error 'must specify cassandra addresses', + bigtable_instance: error 'must specify bigtable instance', + bigtable_project: error 'must specify bigtable project', + aws_region: error 'must specify AWS region', + s3_bucket_name: error 'must specify S3 bucket name', + + // schema is used to generate the storage schema yaml file used by + // the Cortex chunks storage: + // - More information: https://github.com/cortexproject/cortex/pull/1072 + // - TSDB integration doesn't rely on the Cortex chunks store, so doesn't + // support the schema config. + schema: if $._config.storage_engine != 'tsdb' then + error 'must specify a schema config' + else + [], + + max_series_per_user: 250000, + max_series_per_metric: 10000, + max_chunk_idle: '15m', + + test_exporter_enabled: false, + test_exporter_start_time: error 'must specify test exporter start time', + test_exporter_user_id: error 'must specify test exporter used id', + + querierConcurrency: 8, + querier_ingester_streaming_enabled: $._config.storage_engine != 'tsdb', + + jaeger_agent_host: null, + + // Use the Cortex chunks storage engine by default, while giving the ability + // to switch to tsdb storage. + storage_engine: 'chunks', + storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', + + // TSDB storage engine doesn't require the table manager. + table_manager_enabled: $._config.storage_engine != 'tsdb', + + // TSDB storage engine doesn't require memcached for chunks or chunk indexes. 
+ memcached_index_queries_enabled: $._config.storage_engine != 'tsdb', + memcached_index_writes_enabled: $._config.storage_engine != 'tsdb', + memcached_chunks_enabled: $._config.storage_engine != 'tsdb', + + enabledBackends: [ + backend + for backend in std.split($._config.storage_backend, ',') + ], + + client_configs: { + aws: + if std.count($._config.enabledBackends, 'aws') > 0 then { + 'dynamodb.api-limit': 10, + 'dynamodb.url': 'https://%s' % $._config.aws_region, + 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.s3_bucket_name], + } else {}, + cassandra: + if std.count($._config.enabledBackends, 'cassandra') > 0 then { + 'cassandra.keyspace': $._config.namespace, + 'cassandra.addresses': $._config.cassandra_addresses, + 'cassandra.replication-factor': $._config.replication_factor, + } else {}, + gcp: + if std.count($._config.enabledBackends, 'gcp') > 0 then { + 'bigtable.project': $._config.bigtable_project, + 'bigtable.instance': $._config.bigtable_instance, + } else {}, + }, + + storeConfig: self.storeMemcachedChunksConfig, + + storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled then + { + 'memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, + 'memcached.service': 'memcached-client', + 'memcached.timeout': '3s', + 'memcached.batchsize': 1024, + 'memcached.consistent-hash': true, + } + else {}, + + storageConfig: + $._config.client_configs.aws + + $._config.client_configs.cassandra + + $._config.client_configs.gcp + + $._config.storageTSDBConfig + + { 'config-yaml': '/etc/cortex/schema/config.yaml' }, + + // TSDB blocks storage configuration, used only when 'tsdb' storage + // engine is explicitly enabled. 
+ storageTSDBConfig: if $._config.storage_engine == 'tsdb' then { + 'store.engine': 'tsdb', + 'experimental.tsdb.dir': '/tmp/tsdb', + 'experimental.tsdb.sync-dir': '/tmp/tsdb', + 'experimental.tsdb.block-ranges-period': '2h', + 'experimental.tsdb.retention-period': '1h', + 'experimental.tsdb.ship-interval': '1m', + 'experimental.tsdb.backend': 'gcs', + 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, + } else {}, + + // Shared between the Ruler and Querier + queryConfig: { + // Use iterators to merge chunks, to reduce memory usage. + 'querier.ingester-streaming': $._config.querier_ingester_streaming_enabled, + 'querier.batch-iterators': true, + + // Don't query the chunk store for data younger than max_chunk_idle. + 'store.min-chunk-age': $._config.max_chunk_idle, + + // Don't query ingesters for older queries. + // Chunks are 6hrs right now. Add some slack for safety. + 'querier.query-ingesters-within': '12h', + + 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + + // Limit the size of the rows we read from the index. + 'store.cardinality-limit': 1e6, + + // Don't allow individual queries of longer than 31days. Due to day query + // splitting in the frontend, the reality is this only limits rate(foo[31d]) + // type queries. + 'store.max-query-length': '744h', + } + ( + if $._config.memcached_index_queries_enabled then + { + // Setting for index cache. + 'store.index-cache-validity': '14m', // ingester.retain-period=15m, 1m less for safety. 
+ 'store.index-cache-read.cache.enable-fifocache': true, + 'store.index-cache-read.fifocache.size': 102400, + 'store.index-cache-read.memcached.hostname': 'memcached-index-queries.%(namespace)s.svc.cluster.local' % $._config, + 'store.index-cache-read.memcached.service': 'memcached-client', + 'store.index-cache-read.memcached.timeout': '500ms', + 'store.index-cache-read.memcached.consistent-hash': true, + 'store.cache-lookups-older-than': '36h', + } + else {} + ), + + ringConfig: { + 'consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'consul.consistent-reads': false, + 'ring.prefix': '', + }, + + // Some distributor config is shared with the querier. + distributorConfig: { + 'distributor.replication-factor': $._config.replication_factor, + 'distributor.shard-by-all-labels': true, + 'distributor.health-check-ingesters': true, + 'ring.heartbeat-timeout': '10m', + 'consul.consistent-reads': false, + }, + + overrides: { + // === Per-tenant usage limits. === + // These are the defaults. These are not global limits but per instance limits. 
+ // + // small_user: { + // ingestion_rate: 10,000 + // ingestion_burst_size: 20,000 + // + // max_series_per_user: 250,000 + // max_series_per_metric: 10,000 + // + // max_series_per_query: 10,000 + // max_samples_per_query: 100,000 + // }, + + medium_user:: { + ingestion_rate: 25000, + ingestion_burst_size: 50000, + + max_series_per_metric: 100000, + max_series_per_user: 500000, + + max_series_per_query: 100000, + max_samples_per_query: 1000000, + }, + + big_user:: { + ingestion_rate: 50000, + ingestion_burst_size: 70000, + + max_series_per_metric: 100000, + max_series_per_user: 1000000, + + max_series_per_query: 100000, + max_samples_per_query: 1000000, + }, + + super_user:: { + ingestion_rate: 200000, + ingestion_burst_size: 240000, + + max_series_per_metric: 200000, + max_series_per_user: 2000000, + + max_series_per_query: 100000, + max_samples_per_query: 1000000, + }, + }, + + schemaID: std.md5(std.toString($._config.schema)), + + enable_pod_priorities: true, + }, + + local configMap = $.core.v1.configMap, + + overrides_config: + configMap.new('overrides') + + configMap.withData({ + 'overrides.yaml': $.util.manifestYaml({ + overrides: $._config.overrides, + }), + }), + + storage_config: + configMap.new('schema-' + $._config.schemaID) + + configMap.withData({ + 'config.yaml': $.util.manifestYaml({ + configs: $._config.schema, + }), + }), + + local deployment = $.apps.v1beta1.deployment, + storage_config_mixin:: + deployment.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + + $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), + + // This removed the CPU limit from the config. NB won't show up in subset + // diffs, but ks apply will do the right thing. + removeCPULimitsMixin:: { + resources+: { + // Can't use super.memory in limits, as we want to + // override the whole limits struct. 
+ local memoryLimit = super.limits.memory, + + limits: { + memory: memoryLimit, + }, + }, + }, +} diff --git a/operations/mimir/consul.libsonnet b/operations/mimir/consul.libsonnet new file mode 100644 index 00000000000..9ece317d279 --- /dev/null +++ b/operations/mimir/consul.libsonnet @@ -0,0 +1,59 @@ +local consul = import 'consul/consul.libsonnet'; + +{ + _config+:: { + consul_replicas: 1, + other_namespaces+: [], + }, + + consul: consul { + _config+:: { + consul_replicas: $._config.consul_replicas, + namespace: $._config.namespace, + }, + + // Snapshot the raft.db very frequently, to stop it getting to big. + consul_config+:: { + raft_snapshot_threshold: 128, + raft_trailing_logs: 10e3, + }, + + local container = $.core.v1.container, + + consul_container+:: + container.withArgsMixin([ + '-ui-content-path=/%s/consul/' % $._config.namespace, + ]) + + $.util.resourcesRequests('4', '4Gi'), + + local deployment = $.apps.v1beta1.deployment, + local podAntiAffinity = deployment.mixin.spec.template.spec.affinity.podAntiAffinity, + local volume = $.core.v1.volume, + consul_deployment+: + + // Keep the consul state on a ramdisk, as they are ephemeral to us. + $.util.emptyVolumeMount( + 'data', + '/consul/data/', + volumeMixin=volume.mixin.emptyDir.withMedium('Memory'), + ) + + + // Ensure Consul is not scheduled on the same host as an ingester + // (in any namespace - hence other_namespaces). 
+ podAntiAffinity.withRequiredDuringSchedulingIgnoredDuringExecutionMixin([ + podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.new() + + podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.mixin.labelSelector.withMatchLabels({ name: 'ingester' }) + + podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.withNamespaces([$._config.namespace] + $._config.other_namespaces) + + podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.withTopologyKey('kubernetes.io/hostname'), + ]) + + + $.util.podPriority('high'), + + // Don't healthcheck services, adds load to consul. + consul_exporter+:: + container.withArgsMixin([ + '--no-consul.health-summary', + '--consul.allow_stale', + ]), + }, +} diff --git a/operations/mimir/cortex-manifests.jsonnet.example b/operations/mimir/cortex-manifests.jsonnet.example new file mode 100644 index 00000000000..7edc14cd644 --- /dev/null +++ b/operations/mimir/cortex-manifests.jsonnet.example @@ -0,0 +1,26 @@ +local cortex = import "cortex/cortex.libsonnet"; + +cortex { + _config+:: { + namespace: "default", + schema: [{ + from: '2019-11-15', + store: 'bigtable-hashed', + object_store: 'gcs', + schema: 'v10', + index: { + prefix: 'dev_index_', + period: '168h', + }, + chunks: { + prefix: 'dev_chunks_', + period: '168h', + }, + }], + + storage_backend: 'gcp', + bigtable_instance: 'example-instance-prod', + bigtable_project: 'example-project1-cortex', + }, +} + diff --git a/operations/mimir/cortex.libsonnet b/operations/mimir/cortex.libsonnet new file mode 100644 index 00000000000..430c1d4374d --- /dev/null +++ b/operations/mimir/cortex.libsonnet @@ -0,0 +1,18 @@ +(import 'ksonnet-util/kausal.libsonnet') + +(import 'ksonnet-util/jaeger.libsonnet') + +(import 'images.libsonnet') + +(import 'common.libsonnet') + +(import 'config.libsonnet') + +(import 'consul.libsonnet') + + +// Cortex services +(import 'distributor.libsonnet') + +(import 'ingester.libsonnet') + +(import 'querier.libsonnet') 
+ +(import 'query-frontend.libsonnet') + +(import 'table-manager.libsonnet') + + +// Supporting services +(import 'etcd.libsonnet') + +(import 'memcached.libsonnet') + +(import 'test-exporter.libsonnet') diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet new file mode 100644 index 00000000000..d5b147d845f --- /dev/null +++ b/operations/mimir/distributor.libsonnet @@ -0,0 +1,52 @@ +{ + local container = $.core.v1.container, + local containerPort = $.core.v1.containerPort, + + distributor_args:: + $._config.ringConfig + + $._config.distributorConfig + + { + target: 'distributor', + + 'distributor.ingestion-rate-limit': 10000, + 'distributor.ingestion-burst-size': 20000, + 'validation.reject-old-samples': true, + 'validation.reject-old-samples.max-age': '12h', + 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + 'distributor.remote-timeout': '20s', + + 'distributor.ha-tracker.enable': true, + 'distributor.ha-tracker.enable-for-all-users': true, + 'distributor.ha-tracker.store': 'etcd', + 'distributor.ha-tracker.etcd.endpoints': 'etcd-client.%s.svc.cluster.local.:2379' % $._config.namespace, + 'distributor.ha-tracker.prefix': 'prom_ha/', + + // The memory requests are 2G, and we barely use 100M. + // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at + // around 1.25G, reducing the 99%ile. 
+ 'mem-ballast-size-bytes': 1 << 30, // 1GB + }, + + distributor_container:: + container.new('distributor', $._images.distributor) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + + $.util.resourcesRequests('2', '2Gi') + + $.util.resourcesLimits('6', '4Gi') + + $.jaeger_mixin, + + local deployment = $.apps.v1beta1.deployment, + + distributor_deployment: + deployment.new('distributor', 3, [ + $.distributor_container, + ]) + + $.util.antiAffinity + + $.util.configVolumeMount('overrides', '/etc/cortex'), + + local service = $.core.v1.service, + + distributor_service: + $.util.serviceFor($.distributor_deployment) + + service.mixin.spec.withClusterIp('None'), +} diff --git a/operations/mimir/etcd.libsonnet b/operations/mimir/etcd.libsonnet new file mode 100644 index 00000000000..41981db9ffb --- /dev/null +++ b/operations/mimir/etcd.libsonnet @@ -0,0 +1,9 @@ +local etcd_cluster = import 'etcd-operator/etcd-cluster.libsonnet'; + +etcd_cluster { + etcd: + $.etcd_cluster('etcd', env=[{ + name: 'ETCD_AUTO_COMPACTION_RETENTION', + value: '1h', + }]), +} diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet new file mode 100644 index 00000000000..8d5ccce41af --- /dev/null +++ b/operations/mimir/images.libsonnet @@ -0,0 +1,21 @@ +{ + _images+:: { + // Various third-party images. + memcached: 'memcached:1.5.17-alpine', + memcachedExporter: 'prom/memcached-exporter:v0.6.0', + postgresql: 'postgres:9.6.11-alpine', + + // Our services. + cortex: 'cortexproject/cortex:master-37c1f178', + + distributor: self.cortex, + ingester: self.cortex, + querier: self.cortex, + query_frontend: self.cortex, + tableManager: self.cortex, + // TODO(gouthamve/jtlisi): Upstream the ruler and AM configs. 
+ ruler: 'jtlisi/cortex:20190806_prommanager_ruler_with_api-50343f8d', + alertmanager: 'jtlisi/cortex:20190819_alertmanager_update-165b393a', + testExporter: 'cortexproject/test-exporter:master-ef99cdaf', + }, +} diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet new file mode 100644 index 00000000000..0e08ba15f99 --- /dev/null +++ b/operations/mimir/ingester.libsonnet @@ -0,0 +1,69 @@ +{ + local container = $.core.v1.container, + + ingester_args:: + $._config.ringConfig + + $._config.storeConfig + + $._config.storageConfig + + { + target: 'ingester', + + // Ring config. + 'ingester.num-tokens': 512, + 'ingester.join-after': '30s', + 'ingester.max-transfer-retries': 60, // Each retry is backed off by 5s, so 5mins for new ingester to come up. + 'ingester.claim-on-rollout': true, + 'ingester.heartbeat-period': '15s', + + // Chunk building/flushing config. + 'ingester.chunk-encoding': 3, // Bigchunk encoding + 'ingester.retain-period': '15m', + 'ingester.max-chunk-age': '6h', + 'ingester.spread-flushes': true, + + // Limits config. + 'ingester.max-chunk-idle': $._config.max_chunk_idle, + 'ingester.max-series-per-user': $._config.max_series_per_user, + 'ingester.max-series-per-metric': $._config.max_series_per_metric, + 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + 'server.grpc-max-concurrent-streams': 100000, + } + ( + if $._config.memcached_index_writes_enabled then + { + // Setup index write deduping. 
+ 'store.index-cache-write.memcached.hostname': 'memcached-index-writes.%(namespace)s.svc.cluster.local' % $._config, + 'store.index-cache-write.memcached.service': 'memcached-client', + 'store.index-cache-write.memcached.consistent-hash': true, + } + else {} + ), + + ingester_container:: + container.new('ingester', $._images.ingester) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + + container.mixin.readinessProbe.httpGet.withPath('/ready') + + container.mixin.readinessProbe.httpGet.withPort(80) + + container.mixin.readinessProbe.withInitialDelaySeconds(15) + + container.mixin.readinessProbe.withTimeoutSeconds(1) + + + $.util.resourcesRequests('4', '15Gi') + + $.util.resourcesLimits(null, '25Gi') + + $.jaeger_mixin, + + local deployment = $.apps.v1beta1.deployment, + + ingester_deployment: + deployment.new('ingester', 3, [$.ingester_container]) + + $.util.antiAffinity + + $.util.configVolumeMount('overrides', '/etc/cortex') + + deployment.mixin.spec.withMinReadySeconds(60) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + $.storage_config_mixin + + $.util.podPriority('high'), + + ingester_service: + $.util.serviceFor($.ingester_deployment), +} diff --git a/operations/mimir/jsonnetfile.json b/operations/mimir/jsonnetfile.json new file mode 100644 index 00000000000..375b98130fb --- /dev/null +++ b/operations/mimir/jsonnetfile.json @@ -0,0 +1,44 @@ +{ + "dependencies": [ + { + "name": "consul", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "consul" + } + }, + "version": "master" + }, + { + "name": "etcd-operator", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "etcd-operator" + } + }, + "version": "master" + }, + { + "name": "ksonnet-util", + 
"source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "ksonnet-util" + } + }, + "version": "master" + }, + { + "name": "memcached", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "memcached" + } + }, + "version": "master" + } + ] +} diff --git a/operations/mimir/jsonnetfile.lock.json b/operations/mimir/jsonnetfile.lock.json new file mode 100644 index 00000000000..e4f26b0b08c --- /dev/null +++ b/operations/mimir/jsonnetfile.lock.json @@ -0,0 +1,48 @@ +{ + "dependencies": [ + { + "name": "consul", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "consul" + } + }, + "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", + "sum": "S3cLCI5OLpSdwqsAWkNtdGXTlFTpuVGB29m6CXw8xHI=" + }, + { + "name": "etcd-operator", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "etcd-operator" + } + }, + "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", + "sum": "KUklp389C8zcSrYjRkIy00w81gP1HGU3eDmxghqtmBs=" + }, + { + "name": "ksonnet-util", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "ksonnet-util" + } + }, + "version": "250bf5499d81e5e77e1e5ed2242c89ad27485aec", + "sum": "8gmmSMANOAs4dfP5a09Y+nE9pd8E4TMpk3YPKxT4ys0=" + }, + { + "name": "memcached", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "memcached" + } + }, + "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", + "sum": "hroD9u119YWI5g2SnspmSgMDJUMyXFZDnjymhUS6Pjs=" + } + ] +} diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet new file mode 100644 index 00000000000..bd00189f031 --- /dev/null +++ b/operations/mimir/memcached.libsonnet @@ -0,0 +1,63 @@ +local memcached = import 'memcached/memcached.libsonnet'; + +memcached { + memcached+:: { + cpu_limits:: null, + + deployment: {}, + + local statefulSet = 
$.apps.v1beta1.statefulSet, + + statefulSet: + statefulSet.new(self.name, 3, [ + self.memcached_container, + self.memcached_exporter, + ], []) + + statefulSet.mixin.spec.withServiceName(self.name) + + $.util.antiAffinity, + + local service = $.core.v1.service, + + service: + $.util.serviceFor(self.statefulSet) + + service.mixin.spec.withClusterIp('None'), + }, + + // Dedicated memcached instance used to cache query results. + memcached_frontend: $.memcached { + name: 'memcached-frontend', + max_item_size: '5m', + }, + + // Dedicated memcached instance used to temporarily cache index lookups. + memcached_index_queries: if $._config.memcached_index_queries_enabled then + $.memcached { + name: 'memcached-index-queries', + max_item_size: '5m', + } + else {}, + + // Dedicated memcached instance used to dedupe writes to the index. + memcached_index_writes: if $._config.memcached_index_writes_enabled then + $.memcached { + name: 'memcached-index-writes', + } + else {}, + + // Memcached instance used to cache chunks. + memcached_chunks: if $._config.memcached_chunks_enabled then + $.memcached { + name: 'memcached', + + // Save memory by more tightly provisioning memcached chunks. + memory_limit_mb: 6 * 1024, + overprovision_factor: 1.05, + + local container = $.core.v1.container, + + // Raise connection limits now our clusters are bigger. 
+ memcached_container+:: + container.withArgsMixin(['-c 4096']), + } + else {}, +} diff --git a/operations/mimir/postgresql.libsonnet b/operations/mimir/postgresql.libsonnet new file mode 100644 index 00000000000..d63eb8c66da --- /dev/null +++ b/operations/mimir/postgresql.libsonnet @@ -0,0 +1,29 @@ +{ + local container = $.core.v1.container, + local containerPort = $.core.v1.containerPort, + + _config+: { + pgUser: 'cortex', + pgPassword: '1234', + }, + + postgresql_container:: + container.new('postgres', $._images.postgresql) + + container.withPorts([ + containerPort.newNamed('postgresql', 5432), + ]) + + container.withEnvMap({ + POSTGRES_USER: $._config.pgUser, + POSTGRES_DB: 'configs', + }) + + $.util.resourcesRequests('2', '1Gi') + + $.util.resourcesLimits('4', '2Gi'), + + local deployment = $.apps.v1beta1.deployment, + postgresql_deployment: + deployment.new('postgresql', 1, [$.postgresql_container]), + + local service = $.core.v1.service, + postgresql_service: + $.util.serviceFor($.postgresql_deployment), +} diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet new file mode 100644 index 00000000000..fda924c71fc --- /dev/null +++ b/operations/mimir/querier.libsonnet @@ -0,0 +1,52 @@ +{ + local container = $.core.v1.container, + + querier_args:: + $._config.ringConfig + + $._config.storeConfig + + $._config.storageConfig + + $._config.queryConfig + + $._config.distributorConfig + + { + target: 'querier', + + // Increase HTTP server response write timeout, as we were seeing some + // queries that return a lot of data timeing out. + 'server.http-write-timeout': '1m', + + // Limit query concurrency to prevent multi large queries causing an OOM. + 'querier.max-concurrent': $._config.querierConcurrency, + + // Limit to N/2 worker threads per frontend, as we have two frontends. 
+ 'querier.worker-parallelism': $._config.querierConcurrency / 2, + 'querier.frontend-address': 'query-frontend.%(namespace)s.svc.cluster.local:9095' % $._config, + 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, + + 'log.level': 'debug', + }, + + querier_container:: + container.new('querier', $._images.querier) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.querier_args)) + + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '24Gi') + + $.jaeger_mixin + + container.withEnvMap({ + JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024', // Default is 100. + }), + + local deployment = $.apps.v1beta1.deployment, + + querier_deployment: + deployment.new('querier', 3, [$.querier_container]) + + $.util.antiAffinity + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.storage_config_mixin, + + local service = $.core.v1.service, + + querier_service: + $.util.serviceFor($.querier_deployment) + + service.mixin.spec.withSelector({ name: 'query-frontend' }), +} diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet new file mode 100644 index 00000000000..a9e64abe1fc --- /dev/null +++ b/operations/mimir/query-frontend.libsonnet @@ -0,0 +1,60 @@ +{ + local container = $.core.v1.container, + + query_frontend_args:: { + target: 'query-frontend', + + // Need log.level=debug so all queries are logged, needed for analyse.py. + 'log.level': 'debug', + + // Increase HTTP server response write timeout, as we were seeing some + // queries that return a lot of data timeing out. + 'server.http-write-timeout': '1m', + + // Split long queries up into multiple day-long queries. + 'querier.split-queries-by-day': true, + + // Cache query results. 
+ 'querier.align-querier-with-step': true, + 'querier.cache-results': true, + 'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace, + 'frontend.memcached.service': 'memcached-client', + 'frontend.memcached.timeout': '500ms', + 'frontend.memcached.consistent-hash': true, + + // So that exporters like cloudwatch can still send in data and be un-cached. + 'frontend.max-cache-freshness': '10m', + + // Compress HTTP responses; improves latency for very big results and slow + // connections. + 'querier.compress-http-responses': true, + + // So it can recieve big responses from the querier. + 'server.grpc-max-recv-msg-size-bytes': 100 << 20, + + // Limit queries to 500 days, allow this to be override per-user. + 'store.max-query-length': '12000h', // 500 Days + 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + }, + + query_frontend_container:: + container.new('query-frontend', $._images.query_frontend) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + + $.util.resourcesRequests('2', '600Mi') + + $.util.resourcesLimits(null, '1200Mi') + + $.jaeger_mixin, + + local deployment = $.apps.v1beta1.deployment, + + query_frontend_deployment: + deployment.new('query-frontend', 2, [$.query_frontend_container]) + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.antiAffinity, + + local service = $.core.v1.service, + + query_frontend_service: + $.util.serviceFor($.query_frontend_deployment) + + service.mixin.spec.withClusterIp('None'), +} diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet new file mode 100644 index 00000000000..749c8f8403a --- /dev/null +++ b/operations/mimir/ruler.libsonnet @@ -0,0 +1,53 @@ +{ + local container = $.core.v1.container, + + ruler_args:: + $._config.ringConfig + + $._config.storeConfig + + $._config.storageConfig + + $._config.queryConfig + + $._config.distributorConfig + + { + target: 
'ruler', + // Alertmanager configs + 'ruler.alertmanager-url': 'http://alertmanager.%s.svc.cluster.local/alertmanager' % $._config.namespace, + + // Ring Configs + 'ruler.enable-sharding': true, + 'ruler.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'ruler.consul.consistent-reads': false, + 'ruler.prefix': 'rulers/', + 'ruler.distributor.replication-factor': 1, + 'ruler.claim-on-rollout': true, + 'ruler.join-after': '15s', + 'ruler.ring.heartbeat-timeout': '10m', + 'ruler.heartbeat-period': '1m', + 'ruler.search-pending-for': '1m', + + // Rule Storage Configs + 'ruler.storage.type': 'gcs', + 'rules.gcs.bucketname': '%(cluster)s-cortex-configdb-%(namespace)s' % $._config, + }, + + ruler_container:: + container.new('ruler', $._images.ruler) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ruler_args)) + + $.util.resourcesRequests('1', '6Gi') + + $.util.resourcesLimits('16', '16Gi') + + $.jaeger_mixin, + + local deployment = $.apps.v1beta1.deployment, + + ruler_deployment: + deployment.new('ruler', 2, [$.ruler_container]) + + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + + $.util.antiAffinity + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.storage_config_mixin, + + local service = $.core.v1.service, + + ruler_service: + $.util.serviceFor($.ruler_deployment), +} diff --git a/operations/mimir/table-manager.libsonnet b/operations/mimir/table-manager.libsonnet new file mode 100644 index 00000000000..b392cd93d29 --- /dev/null +++ b/operations/mimir/table-manager.libsonnet @@ -0,0 +1,53 @@ +{ + local container = $.core.v1.container, + + table_manager_args:: + $._config.storageConfig + { + target: 'table-manager', + + // Cassandra / BigTable doesn't use these fields, so set them to zero + 'dynamodb.chunk-table.inactive-read-throughput': 0, + 'dynamodb.chunk-table.inactive-write-throughput': 0, + 'dynamodb.chunk-table.read-throughput': 0, + 
'dynamodb.chunk-table.write-throughput': 0, + 'dynamodb.periodic-table.inactive-read-throughput': 0, + 'dynamodb.periodic-table.inactive-write-throughput': 0, + 'dynamodb.periodic-table.read-throughput': 0, + 'dynamodb.periodic-table.write-throughput': 0, + + // Rate limit Bigtable Admin calls. Google seem to limit to ~100QPS, + // and given 2yrs worth of tables (~100) a sync will table 20s. This + // allows you to run upto 20 independant Cortex clusters on the same + // Google project before running into issues. + 'dynamodb.poll-interval': '10m', + 'dynamodb.periodic-table.grace-period': '3h', + 'bigtable.grpc-client-rate-limit': 5.0, + 'bigtable.grpc-client-rate-limit-burst': 5, + 'bigtable.backoff-on-ratelimits': true, + 'bigtable.table-cache.enabled': true, + }, + + table_manager_container:: + if $._config.table_manager_enabled then + container.new('table-manager', $._images.tableManager) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.table_manager_args)) + + $.util.resourcesRequests('100m', '100Mi') + + $.util.resourcesLimits('200m', '200Mi') + + $.jaeger_mixin + else {}, + + local deployment = $.apps.v1beta1.deployment, + + table_manager_deployment: + if $._config.table_manager_enabled then + deployment.new('table-manager', 1, [$.table_manager_container]) + + $.storage_config_mixin + else {}, + + table_manager_service: + if $._config.table_manager_enabled then + $.util.serviceFor($.table_manager_deployment) + else {}, +} diff --git a/operations/mimir/test-exporter.libsonnet b/operations/mimir/test-exporter.libsonnet new file mode 100644 index 00000000000..31d7a2c96d8 --- /dev/null +++ b/operations/mimir/test-exporter.libsonnet @@ -0,0 +1,40 @@ +{ + local container = $.core.v1.container, + local containerPort = $.core.v1.containerPort, + + test_exporter_args:: { + 'user-id': $._config.test_exporter_user_id, + 'prometheus-address': 'http://query-frontend.%(namespace)s.svc.cluster.local/api/prom' % $._config, + 
'test-query-start': $._config.test_exporter_start_time, + 'extra-selectors': 'job="%(namespace)s/test-exporter"' % $._config, + 'test-query-min-size': '1m', + 'test-epsilion': '0.05', // There is enough jitter in our system for scrapes to be off by 5%. + }, + + test_exporter_container:: + if !($._config.test_exporter_enabled) + then {} + else + container.new('test-exporter', $._images.testExporter) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.test_exporter_args)) + + $.util.resourcesRequests('100m', '100Mi') + + $.util.resourcesLimits('100m', '100Mi') + + $.jaeger_mixin, + + local deployment = $.apps.v1beta1.deployment, + + test_exporter_deployment: + if !($._config.test_exporter_enabled) + then {} + else + deployment.new('test-exporter', 1, [ + $.test_exporter_container, + ]), + + test_exporter_service: + if !($._config.test_exporter_enabled) + then {} + else + $.util.serviceFor($.test_exporter_deployment), +} From d1d7d8cf30b338de5b03d0fdac5c5dbe44dfda7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 5 Mar 2020 11:14:05 +0100 Subject: [PATCH 002/192] Copy jsonnet files for Cortex. Main Cortex image uses latest master. 
--- operations/mimir/alertmanager.libsonnet | 36 ++++-- operations/mimir/common.libsonnet | 4 +- operations/mimir/config.libsonnet | 82 +++++++++--- operations/mimir/distributor.libsonnet | 32 ++++- operations/mimir/flusher-job.libsonnet | 54 ++++++++ operations/mimir/gossip.libsonnet | 76 +++++++++++ operations/mimir/images.libsonnet | 11 +- operations/mimir/ingester.libsonnet | 22 +++- operations/mimir/postgresql.libsonnet | 2 +- operations/mimir/querier.libsonnet | 20 ++- operations/mimir/query-frontend.libsonnet | 6 +- operations/mimir/query-tee.libsonnet | 33 +++++ operations/mimir/ruler.libsonnet | 11 +- operations/mimir/tsdb.libsonnet | 147 ++++++++++++++++++++++ 14 files changed, 472 insertions(+), 64 deletions(-) create mode 100644 operations/mimir/flusher-job.libsonnet create mode 100644 operations/mimir/gossip.libsonnet create mode 100644 operations/mimir/query-tee.libsonnet create mode 100644 operations/mimir/tsdb.libsonnet diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 168f6be28ac..fbcc6e4ef4e 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -1,5 +1,10 @@ { + local pvc = $.core.v1.persistentVolumeClaim, + local volumeMount = $.core.v1.volumeMount, local container = $.core.v1.container, + local statefulSet = $.apps.v1beta1.statefulSet, + local service = $.core.v1.service, + alertmanager_args:: { @@ -7,26 +12,37 @@ 'log.level': 'debug', 'alertmanager.storage.type': 'gcs', + 'alertmanager.storage.path': '/data', 'alertmanager.gcs.bucketname': '%(cluster)s-cortex-configdb-%(namespace)s' % $._config, - 'alertmanager.web.external-url': 'http://alertmanager.%s.svc.cluster.local/alertmanager' % $._config.namespace, + 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, }, + alertmanager_pvc:: + pvc.new() + + pvc.mixin.metadata.withName('alertmanager-data') + + pvc.mixin.spec.withAccessModes('ReadWriteOnce') + + 
pvc.mixin.spec.resources.withRequests({ storage: '100Gi' }), + alertmanager_container:: container.new('alertmanager', $._images.alertmanager) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + + container.withVolumeMountsMixin([volumeMount.new('alertmanager-data', '/data')]) + $.util.resourcesRequests('100m', '1Gi') + $.jaeger_mixin, - local deployment = $.apps.v1beta1.deployment, - alertmanager_deployment: - deployment.new('alertmanager', 1, [$.alertmanager_container]) + - deployment.mixin.spec.template.spec.withRestartPolicy('Always') + - $.util.antiAffinity, - - local service = $.core.v1.service, + alertmanager_statefulset: + statefulSet.new('alertmanager', 1, [$.alertmanager_container], $.alertmanager_pvc) + .withServiceName('alertmanager') + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900), - alertmanager_server: - $.util.serviceFor($.alertmanager_deployment), + alertmanager_service: + $.util.serviceFor($.alertmanager_statefulset), } diff --git a/operations/mimir/common.libsonnet b/operations/mimir/common.libsonnet index 62a5a338c49..2da29bacde6 100644 --- a/operations/mimir/common.libsonnet +++ b/operations/mimir/common.libsonnet @@ -7,8 +7,8 @@ defaultPorts:: [ - containerPort.newNamed('http-metrics', 80), - containerPort.newNamed('grpc', 9095), + containerPort.newNamed(name='http-metrics', containerPort=80), + containerPort.newNamed(name='grpc', containerPort=9095), ], }, } diff --git a/operations/mimir/config.libsonnet 
b/operations/mimir/config.libsonnet index 17350551a30..b120ebb07aa 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -3,6 +3,7 @@ namespace: error 'must define namespace', cluster: error 'must define cluster', replication_factor: 3, + external_url: error 'must define external url for cluster', storage_backend: error 'must specify storage backend (cassandra, gcp)', table_prefix: $._config.namespace, @@ -22,11 +23,9 @@ else [], - max_series_per_user: 250000, - max_series_per_metric: 10000, max_chunk_idle: '15m', - test_exporter_enabled: false, + test_exporter_enabled: true, test_exporter_start_time: error 'must specify test exporter start time', test_exporter_user_id: error 'must specify test exporter used id', @@ -40,6 +39,8 @@ storage_engine: 'chunks', storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', + distributor_short_grpc_keepalive_enabled: false, + // TSDB storage engine doesn't require the table manager. table_manager_enabled: $._config.storage_engine != 'tsdb', @@ -48,6 +49,15 @@ memcached_index_writes_enabled: $._config.storage_engine != 'tsdb', memcached_chunks_enabled: $._config.storage_engine != 'tsdb', + ingestion_rate_global_limit_enabled: false, + + // The query-tee is an optional service which can be used to send + // the same input query to multiple backends and make them compete + // (comparing performances). + query_tee_enabled: false, + query_tee_backend_endpoints: [], + query_tee_backend_preferred: '', + enabledBackends: [ backend for backend in std.split($._config.storage_backend, ',') @@ -96,8 +106,8 @@ // engine is explicitly enabled. 
storageTSDBConfig: if $._config.storage_engine == 'tsdb' then { 'store.engine': 'tsdb', - 'experimental.tsdb.dir': '/tmp/tsdb', - 'experimental.tsdb.sync-dir': '/tmp/tsdb', + 'experimental.tsdb.dir': '/data/tsdb', + 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', 'experimental.tsdb.block-ranges-period': '2h', 'experimental.tsdb.retention-period': '1h', 'experimental.tsdb.ship-interval': '1m', @@ -111,9 +121,6 @@ 'querier.ingester-streaming': $._config.querier_ingester_streaming_enabled, 'querier.batch-iterators': true, - // Don't query the chunk store for data younger than max_chunk_idle. - 'store.min-chunk-age': $._config.max_chunk_idle, - // Don't query ingesters for older queries. // Chunks are 6hrs right now. Add some slack for safety. 'querier.query-ingesters-within': '12h', @@ -127,6 +134,9 @@ // splitting in the frontend, the reality is this only limits rate(foo[31d]) // type queries. 'store.max-query-length': '744h', + + // Don't query the chunk store for data younger than max_chunk_idle. + 'querier.query-store-after': $._config.max_chunk_idle, } + ( if $._config.memcached_index_queries_enabled then { @@ -147,6 +157,8 @@ 'consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'consul.consistent-reads': false, 'ring.prefix': '', + 'consul.watch-rate-limit': 1, + 'consul.watch-burst-size': 5, }, // Some distributor config is shared with the querier. @@ -160,14 +172,19 @@ overrides: { // === Per-tenant usage limits. === - // These are the defaults. These are not global limits but per instance limits. + // + // These are the defaults. Distributor limits will be 5x (#replicas) higher, + // ingester limits are 6s (#replicas) / 3x (#replication factor) higher. 
// // small_user: { // ingestion_rate: 10,000 // ingestion_burst_size: 20,000 // - // max_series_per_user: 250,000 - // max_series_per_metric: 10,000 + // max_series_per_user: 0 (disabled) + // max_series_per_metric: 0 (disabled) + // + // max_global_series_per_user: 1,000,000 + // max_global_series_per_metric: 100,000 // // max_series_per_query: 10,000 // max_samples_per_query: 100,000 @@ -177,36 +194,57 @@ ingestion_rate: 25000, ingestion_burst_size: 50000, - max_series_per_metric: 100000, - max_series_per_user: 500000, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 3000000, // 3M + max_global_series_per_metric: 300000, // 300K max_series_per_query: 100000, max_samples_per_query: 1000000, + } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + ingestion_rate: 350000, // 350K + ingestion_burst_size: 3500000, // 3.5M }, big_user:: { ingestion_rate: 50000, ingestion_burst_size: 70000, - max_series_per_metric: 100000, - max_series_per_user: 1000000, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit max_series_per_query: 100000, max_samples_per_query: 1000000, + + max_global_series_per_user: 6000000, // 6M + max_global_series_per_metric: 600000, // 600K + } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + ingestion_rate: 700000, // 700K + ingestion_burst_size: 7000000, // 7M }, super_user:: { ingestion_rate: 200000, ingestion_burst_size: 240000, - max_series_per_metric: 200000, - max_series_per_user: 2000000, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 12000000, // 12M + max_global_series_per_metric: 1200000, // 1.2M max_series_per_query: 100000, max_samples_per_query: 1000000, 
+ } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + ingestion_rate: 1500000, // 1.5M + ingestion_burst_size: 15000000, // 15M }, }, + // if not empty, passed to overrides.yaml as another top-level field + multi_kv_config: {}, + schemaID: std.md5(std.toString($._config.schema)), enable_pod_priorities: true, @@ -217,9 +255,13 @@ overrides_config: configMap.new('overrides') + configMap.withData({ - 'overrides.yaml': $.util.manifestYaml({ - overrides: $._config.overrides, - }), + 'overrides.yaml': $.util.manifestYaml( + { + overrides: $._config.overrides, + } + if std.length($._config.multi_kv_config) > 0 then { + multi_kv_config: $._config.multi_kv_config, + } else {} + ), }), storage_config: diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index d5b147d845f..f27bb87e3e2 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -25,11 +25,31 @@ // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at // around 1.25G, reducing the 99%ile. 'mem-ballast-size-bytes': 1 << 30, // 1GB + } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + 'distributor.ingestion-rate-limit-strategy': 'global', + 'distributor.ingestion-rate-limit': 100000, // 100K + 'distributor.ingestion-burst-size': 1000000, // 1M + + // The ingestion rate global limit requires the distributors to form a ring. + 'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'distributor.ring.consul.consistent-reads': false, + 'distributor.ring.consul.watch-rate-limit': 1, + 'distributor.ring.consul.watch-burst-size': 1, + 'distributor.ring.prefix': '', + } + if !$._config.distributor_short_grpc_keepalive_enabled then {} else { + // The cortex-gateway should frequently reopen the connections towards the + // distributors in order to guarantee that new distributors receive traffic + // as soon as they're ready. 
+ 'server.grpc.keepalive.max-connection-age': '2m', + 'server.grpc.keepalive.max-connection-age-grace': '5m', + 'server.grpc.keepalive.max-connection-idle': '1m', }, + distributor_ports:: $.util.defaultPorts, + distributor_container:: container.new('distributor', $._images.distributor) + - container.withPorts($.util.defaultPorts) + + container.withPorts($.distributor_ports) + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits('6', '4Gi') + @@ -37,16 +57,18 @@ local deployment = $.apps.v1beta1.deployment, + distributor_deployment_labels:: {}, + distributor_deployment: - deployment.new('distributor', 3, [ - $.distributor_container, - ]) + + deployment.new('distributor', 3, [$.distributor_container], $.distributor_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex'), local service = $.core.v1.service, + distributor_service_ignored_labels:: [], + distributor_service: - $.util.serviceFor($.distributor_deployment) + + $.util.serviceFor($.distributor_deployment, $.distributor_service_ignored_labels) + service.mixin.spec.withClusterIp('None'), } diff --git a/operations/mimir/flusher-job.libsonnet b/operations/mimir/flusher-job.libsonnet new file mode 100644 index 00000000000..514a867de37 --- /dev/null +++ b/operations/mimir/flusher-job.libsonnet @@ -0,0 +1,54 @@ +{ + // Usage example: + // local flusher_job = import 'cortex/flusher-job.libsonnet'; + // flusher_job + { + // flusher_job: + // $.flusher_job_func('pvc-af8947e6-182e-11ea-82e4-42010a9a0137', 'ingester-pvc-ingester-5'), + // } + + local container = $.core.v1.container, + local job = $.batch.v1.job, + local volumeMount = $.core.v1.volumeMount, + local volume = $.core.v1.volume, + + flusher_container:: + container.new('flusher', $._images.flusher) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ingester_args { + target: 'flusher', + 'flusher.wal-dir': 
$._config.wal_dir, + })) + + container.mixin.readinessProbe.httpGet.withPath('/ready') + + container.mixin.readinessProbe.httpGet.withPort(80) + + container.mixin.readinessProbe.withInitialDelaySeconds(15) + + container.mixin.readinessProbe.withTimeoutSeconds(1) + + $.util.resourcesRequests('4', '15Gi') + + $.util.resourcesLimits(null, '25Gi') + + $.jaeger_mixin, + + flusher_job_storage_config_mixin:: + job.mixin.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + + $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), + + flusher_job_func(volumeName, pvcName):: + job.new() + + job.mixin.spec.template.spec.withContainers([ + $.flusher_container + + container.withVolumeMountsMixin([ + volumeMount.new(volumeName, $._config.wal_dir), + ]), + ]) + + job.mixin.spec.template.spec.withRestartPolicy('Never') + + job.mixin.spec.template.spec.withVolumes([ + volume.fromPersistentVolumeClaim(volumeName, pvcName), + ]) + + $.flusher_job_storage_config_mixin + + job.mixin.metadata.withName('flusher') + + job.mixin.metadata.withNamespace($._config.namespace) + + job.mixin.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.podPriority('high'), +} diff --git a/operations/mimir/gossip.libsonnet b/operations/mimir/gossip.libsonnet new file mode 100644 index 00000000000..c8238951ac7 --- /dev/null +++ b/operations/mimir/gossip.libsonnet @@ -0,0 +1,76 @@ +{ + _config+:: { + // Use memberlist only. This works fine on already-migrated clusters. + // To do a migration from Consul to memberlist, multi kv storage needs to be used (See below). 
+ ringConfig+: { + 'ring.store': 'memberlist', + 'memberlist.abort-if-join-fails': false, + 'memberlist.bind-port': gossipRingPort, + 'memberlist.join': 'gossip-ring.%s.svc.cluster.local:%d' % [$._config.namespace, gossipRingPort], + }, + + // This can be used to enable multi KV store, with consul and memberlist. + ringConfigMulti: { + 'ring.store': 'multi', + 'multi.primary': 'consul', + 'multi.secondary': 'memberlist', + }, + + // When doing migration via multi KV store, this section can be used + // to configure runtime parameters of multi KV store + /* + multi_kv_config: { + primary: 'memberlist', + // 'mirror-enabled': false, // renamed to 'mirror_enabled' on after r67 + }, + */ + }, + + ingester_args+: { + // wait longer to see LEAVING ingester in the gossiped ring, to avoid + // auto-join without transfer from LEAVING ingester. + 'ingester.join-after': '60s', + + // Updating heartbeat is low-cost operation when using gossiped ring, we can + // do it more often (gossiping will happen no matter what, we may as well send + // recent timestamps). + // It also helps other components to see more recent update in the ring. + 'ingester.heartbeat-period': '5s', + }, + + local gossipRingPort = 7946, + + local containerPort = $.core.v1.containerPort, + local gossipPort = containerPort.newNamed(name='gossip-ring', containerPort=gossipRingPort), + + distributor_ports+:: [gossipPort], + querier_ports+:: [gossipPort], + ingester_ports+:: [gossipPort], + + local gossip_member_label = 'gossip_ring_member', + + distributor_deployment_labels+:: { [gossip_member_label]: 'true' }, + ingester_deployment_labels+:: { [gossip_member_label]: 'true' }, + querier_deployment_labels+:: { [gossip_member_label]: 'true' }, + + // Don't use gossip ring member label in service definition. 
+ distributor_service_ignored_labels+:: [gossip_member_label], + ingester_service_ignored_labels+:: [gossip_member_label], + querier_service_ignored_labels+:: [gossip_member_label], + + // Headless service (= no assigned IP, DNS returns all targets instead) pointing to some + // users of gossiped-ring. We use ingesters as seed nodes for joining gossip cluster. + // During migration to gossip, it may be useful to use distributors instead, since they are restarted faster. + gossip_ring_service: + local service = $.core.v1.service; + local servicePort = service.mixin.spec.portsType; + local ports = [ + servicePort.newNamed('gossip-ring', gossipRingPort, gossipRingPort) + + servicePort.withProtocol('TCP'), + ]; + service.new( + 'gossip-ring', // name + { [gossip_member_label]: 'true' }, // point to all gossip members + ports, + ) + service.mixin.spec.withClusterIp('None'), // headless service +} diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 8d5ccce41af..d53daccb2df 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -6,16 +6,19 @@ postgresql: 'postgres:9.6.11-alpine', // Our services. - cortex: 'cortexproject/cortex:master-37c1f178', + cortex: 'cortexproject/cortex:master-fdcd992f', distributor: self.cortex, ingester: self.cortex, querier: self.cortex, query_frontend: self.cortex, tableManager: self.cortex, + compactor: self.cortex, + flusher: 'ganeshve/cortex:flusher-target-5aac2d73', + query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', // TODO(gouthamve/jtlisi): Upstream the ruler and AM configs. 
- ruler: 'jtlisi/cortex:20190806_prommanager_ruler_with_api-50343f8d', - alertmanager: 'jtlisi/cortex:20190819_alertmanager_update-165b393a', - testExporter: 'cortexproject/test-exporter:master-ef99cdaf', + ruler: 'jtlisi/cortex:20191122_ruler_with_api-4059a06d3', + alertmanager: 'jtlisi/cortex:20190819_alertmanager_update-faa66aa43', + testExporter: 'cortexproject/test-exporter:master-be013707', }, } diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 0e08ba15f99..a585806157d 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -5,6 +5,7 @@ $._config.ringConfig + $._config.storeConfig + $._config.storageConfig + + $._config.distributorConfig + // This adds the distributor ring flags to the ingester. { target: 'ingester', @@ -14,6 +15,8 @@ 'ingester.max-transfer-retries': 60, // Each retry is backed off by 5s, so 5mins for new ingester to come up. 'ingester.claim-on-rollout': true, 'ingester.heartbeat-period': '15s', + 'ingester.max-stale-chunk-idle': '5m', + 'ingester.normalise-tokens': true, // Chunk building/flushing config. 'ingester.chunk-encoding': 3, // Bigchunk encoding @@ -23,8 +26,10 @@ // Limits config. 
'ingester.max-chunk-idle': $._config.max_chunk_idle, - 'ingester.max-series-per-user': $._config.max_series_per_user, - 'ingester.max-series-per-metric': $._config.max_series_per_metric, + 'ingester.max-global-series-per-user': 1000000, // 1M + 'ingester.max-global-series-per-metric': 100000, // 100K + 'ingester.max-series-per-user': 0, // Disabled in favour of the max global limit + 'ingester.max-series-per-metric': 0, // Disabled in favour of the max global limit 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', 'server.grpc-max-concurrent-streams': 100000, } + ( @@ -38,23 +43,26 @@ else {} ), + ingester_ports:: $.util.defaultPorts, + ingester_container:: container.new('ingester', $._images.ingester) + - container.withPorts($.util.defaultPorts) + + container.withPorts($.ingester_ports) + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + container.mixin.readinessProbe.httpGet.withPath('/ready') + container.mixin.readinessProbe.httpGet.withPort(80) + container.mixin.readinessProbe.withInitialDelaySeconds(15) + container.mixin.readinessProbe.withTimeoutSeconds(1) + - $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.jaeger_mixin, local deployment = $.apps.v1beta1.deployment, + ingester_deployment_labels:: {}, + ingester_deployment: - deployment.new('ingester', 3, [$.ingester_container]) + + deployment.new('ingester', 3, [$.ingester_container], $.ingester_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex') + deployment.mixin.spec.withMinReadySeconds(60) + @@ -64,6 +72,8 @@ $.storage_config_mixin + $.util.podPriority('high'), + ingester_service_ignored_labels:: [], + ingester_service: - $.util.serviceFor($.ingester_deployment), + $.util.serviceFor($.ingester_deployment, $.ingester_service_ignored_labels), } diff --git a/operations/mimir/postgresql.libsonnet b/operations/mimir/postgresql.libsonnet index d63eb8c66da..bd69b496785 100644 --- 
a/operations/mimir/postgresql.libsonnet +++ b/operations/mimir/postgresql.libsonnet @@ -10,7 +10,7 @@ postgresql_container:: container.new('postgres', $._images.postgresql) + container.withPorts([ - containerPort.newNamed('postgresql', 5432), + containerPort.newNamed(name='postgresql', containerPort=5432), ]) + container.withEnvMap({ POSTGRES_USER: $._config.pgUser, diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index fda924c71fc..20ea8a06826 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -25,28 +25,36 @@ 'log.level': 'debug', }, + querier_ports:: $.util.defaultPorts, + + querier_env_map:: { + JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024', // Default is 100. + }, + querier_container:: container.new('querier', $._images.querier) + - container.withPorts($.util.defaultPorts) + + container.withPorts($.querier_ports) + container.withArgsMixin($.util.mapToFlags($.querier_args)) + $.util.resourcesRequests('1', '12Gi') + $.util.resourcesLimits(null, '24Gi') + $.jaeger_mixin + - container.withEnvMap({ - JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024', // Default is 100. 
- }), + container.withEnvMap($.querier_env_map), local deployment = $.apps.v1beta1.deployment, + querier_deployment_labels: {}, + querier_deployment: - deployment.new('querier', 3, [$.querier_container]) + + deployment.new('querier', 3, [$.querier_container], $.querier_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex') + $.storage_config_mixin, local service = $.core.v1.service, + querier_service_ignored_labels:: [], + querier_service: - $.util.serviceFor($.querier_deployment) + + $.util.serviceFor($.querier_deployment, $.querier_service_ignored_labels) + service.mixin.spec.withSelector({ name: 'query-frontend' }), } diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index a9e64abe1fc..08bc282301d 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -12,7 +12,7 @@ 'server.http-write-timeout': '1m', // Split long queries up into multiple day-long queries. - 'querier.split-queries-by-day': true, + 'querier.split-queries-by-interval': '24h', // Cache query results. 'querier.align-querier-with-step': true, @@ -56,5 +56,9 @@ query_frontend_service: $.util.serviceFor($.query_frontend_deployment) + + // Make sure that query frontend worker, running in the querier, do resolve + // each query-frontend pod IP and NOT the service IP. To make it, we do NOT + // use the service cluster IP so that when the service DNS is resolved it + // returns the set of query-frontend IPs. 
service.mixin.spec.withClusterIp('None'), } diff --git a/operations/mimir/query-tee.libsonnet b/operations/mimir/query-tee.libsonnet new file mode 100644 index 00000000000..f0eab8aef76 --- /dev/null +++ b/operations/mimir/query-tee.libsonnet @@ -0,0 +1,33 @@ +{ + local container = $.core.v1.container, + local containerPort = $.core.v1.containerPort, + local deployment = $.apps.v1beta1.deployment, + local service = $.core.v1.service, + local servicePort = $.core.v1.servicePort, + + query_tee_args:: { + 'log.level': 'debug', + 'backend.endpoints': std.join(',', $._config.query_tee_backend_endpoints), + 'backend.preferred': $._config.query_tee_backend_preferred, + }, + + query_tee_container:: if !($._config.query_tee_enabled) then {} else + container.new('query-tee', $._images.query_tee) + + container.withPorts([ + containerPort.newNamed(name='http', containerPort=80), + containerPort.newNamed(name='http-metrics', containerPort=9900), + ]) + + container.withArgsMixin($.util.mapToFlags($.query_tee_args)) + + $.util.resourcesRequests('1', '512Mi') + + $.jaeger_mixin, + + query_tee_deployment: if !($._config.query_tee_enabled) then {} else + deployment.new('query-tee', 2, [$.query_tee_container]), + + query_tee_service: if !($._config.query_tee_enabled) then {} else + service.new('query-tee', { name: 'query-tee' }, [ + servicePort.newNamed('http', 80, 80) + + servicePort.withNodePort($._config.query_tee_node_port), + ]) + + service.mixin.spec.withType('NodePort'), +} diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 749c8f8403a..81a55e91b48 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -11,18 +11,11 @@ target: 'ruler', // Alertmanager configs 'ruler.alertmanager-url': 'http://alertmanager.%s.svc.cluster.local/alertmanager' % $._config.namespace, + 'experimental.ruler.enable-api': true, // Ring Configs 'ruler.enable-sharding': true, - 'ruler.consul.hostname': 'consul.%s.svc.cluster.local:8500' 
% $._config.namespace, - 'ruler.consul.consistent-reads': false, - 'ruler.prefix': 'rulers/', - 'ruler.distributor.replication-factor': 1, - 'ruler.claim-on-rollout': true, - 'ruler.join-after': '15s', - 'ruler.ring.heartbeat-timeout': '10m', - 'ruler.heartbeat-period': '1m', - 'ruler.search-pending-for': '1m', + 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, // Rule Storage Configs 'ruler.storage.type': 'gcs', diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet new file mode 100644 index 00000000000..458a99f93a2 --- /dev/null +++ b/operations/mimir/tsdb.libsonnet @@ -0,0 +1,147 @@ +{ + local pvc = $.core.v1.persistentVolumeClaim, + local volumeMount = $.core.v1.volumeMount, + local container = $.core.v1.container, + local statefulSet = $.apps.v1beta1.statefulSet, + local service = $.core.v1.service, + + _config+:: { + // Enforce TSDB storage + storage_backend: 'none', + storage_engine: 'tsdb', + }, + + // The querier should run on a dedicated volume used to sync TSDB + // indexes, in order to not negatively affect the node performances + // in case of sustained I/O or utilization. For this reason we: + // 1. Remove default querier deployment + // 2. Run querier as statefulset with PVC + // 3. 
Replace the service switching it to the statefulset + local querier_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: '10Gi' }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName('standard') + + pvc.mixin.metadata.withName('querier-data'), + + querier_args+:: { + // Reduce the number of blocks synched simultaneously, in order to + // keep the memory utilization under control when the index header + // is generated + 'experimental.tsdb.bucket-store.tenant-sync-concurrency': 2, + 'experimental.tsdb.bucket-store.block-sync-concurrency': 5, + }, + + querier_container+:: + container.mixin.readinessProbe.httpGet.withPath('/ready') + + container.mixin.readinessProbe.httpGet.withPort(80) + + container.mixin.readinessProbe.withInitialDelaySeconds(5) + + container.mixin.readinessProbe.withTimeoutSeconds(1) + + container.withVolumeMountsMixin([ + volumeMount.new('querier-data', '/data'), + ]), + + querier_deployment: {}, + + querier_statefulset: + statefulSet.new('querier', 3, [$.querier_container], querier_data_pvc) + .withServiceName('querier') + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'querier' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'querier' } + $.querier_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'querier' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(60) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.antiAffinity, + + querier_service: + $.util.serviceFor($.querier_statefulset, $.querier_service_ignored_labels) + + service.mixin.spec.withSelector({ name: 'query-frontend' }), + + // The ingesters should persist TSDB blocks and WAL on a persistent + // volume in order to be 
crash resilient. + local ingester_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: '100Gi' }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName('fast') + + pvc.mixin.metadata.withName('ingester-data'), + + ingester_deployment: {}, + + ingester_args+:: { + // Disable TSDB blocks transfer because of persistent volumes + 'ingester.max-transfer-retries': 0, + + // Persist ring tokens so that when the ingester will be restarted + // it will pick the same tokens + 'ingester.tokens-file-path': '/data/tokens', + }, + + ingester_container+:: + container.withVolumeMountsMixin([ + volumeMount.new('ingester-data', '/data'), + ]), + + ingester_statefulset: + statefulSet.new('ingester', 3, [$.ingester_container], ingester_data_pvc) + .withServiceName('ingester') + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'ingester' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'ingester' } + $.ingester_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'ingester' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.podPriority('high') + + $.util.antiAffinity, + + ingester_service: + $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), + + // The compactor runs a statefulset with a single replica, because + // it does not support horizontal scalability yet. 
+ local compactor_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: '250Gi' }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName('standard') + + pvc.mixin.metadata.withName('compactor-data'), + + compactor_args:: + $._config.storageConfig + { + target: 'compactor', + + // Compactor config. + 'compactor.block-ranges': '2h,12h,24h', + 'compactor.data-dir': '/data', + 'compactor.compaction-interval': '30m', + }, + + compactor_ports:: $.util.defaultPorts, + + compactor_container:: + container.new('compactor', $._images.compactor) + + container.withPorts($.compactor_ports) + + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + + $.util.resourcesRequests('1', '6Gi') + + $.util.resourcesLimits('1', '6Gi') + + $.jaeger_mixin, + + compactor_statefulset: + statefulSet.new('compactor', 1, [$.compactor_container], compactor_data_pvc) + .withServiceName('compactor') + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'compactor' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'compactor' }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'compactor' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900), + +} From 6f3ccce33e5a8d7f13ad9379aa38646e78a6a756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 6 Mar 2020 11:28:27 +0100 Subject: [PATCH 003/192] jb update --- operations/mimir/jsonnetfile.lock.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/operations/mimir/jsonnetfile.lock.json b/operations/mimir/jsonnetfile.lock.json index e4f26b0b08c..c4dcaacf706 100644 --- 
a/operations/mimir/jsonnetfile.lock.json +++ b/operations/mimir/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "consul" } }, - "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", - "sum": "S3cLCI5OLpSdwqsAWkNtdGXTlFTpuVGB29m6CXw8xHI=" + "version": "c19a92e586a6752f11745b47f309b13f02ef7147", + "sum": "qlVBnIShhHEPglAl1xYIAmOP/W8LD0wQmHCT0m9sTLU=" }, { "name": "etcd-operator", @@ -19,8 +19,8 @@ "subdir": "etcd-operator" } }, - "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", - "sum": "KUklp389C8zcSrYjRkIy00w81gP1HGU3eDmxghqtmBs=" + "version": "c19a92e586a6752f11745b47f309b13f02ef7147", + "sum": "RbSlOsk0EBAMOfMOKPBdD0joHN6UKZqeP3zy9LjBQTE=" }, { "name": "ksonnet-util", @@ -30,8 +30,8 @@ "subdir": "ksonnet-util" } }, - "version": "250bf5499d81e5e77e1e5ed2242c89ad27485aec", - "sum": "8gmmSMANOAs4dfP5a09Y+nE9pd8E4TMpk3YPKxT4ys0=" + "version": "c19a92e586a6752f11745b47f309b13f02ef7147", + "sum": "LKsTTBcH8TXX5ANgRUu5I7Y1tf5le4nANFV3/W53I+c=" }, { "name": "memcached", @@ -41,8 +41,8 @@ "subdir": "memcached" } }, - "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", - "sum": "hroD9u119YWI5g2SnspmSgMDJUMyXFZDnjymhUS6Pjs=" + "version": "c19a92e586a6752f11745b47f309b13f02ef7147", + "sum": "GQeyWFtqhwM+hGxQbdywWG1PFJ/KmSC1at0hai7AHXU=" } ] } From f6f034f3745174cf172a799a16ddbacbbe3cb607 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 11 Mar 2020 10:59:24 +0100 Subject: [PATCH 004/192] Allow to configure querier disk size for the blocks storage Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 458a99f93a2..20eabb1c5af 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -9,6 +9,9 @@ // Enforce TSDB storage storage_backend: 'none', storage_engine: 'tsdb', + + // Allow to configure the querier disk size based on the cluster size. 
+ cortex_querier_data_disk_size: '10Gi', }, // The querier should run on a dedicated volume used to sync TSDB @@ -19,7 +22,7 @@ // 3. Replace the service switching it to the statefulset local querier_data_pvc = pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: '10Gi' }) + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_querier_data_disk_size }) + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + pvc.mixin.spec.withStorageClassName('standard') + pvc.mixin.metadata.withName('querier-data'), From 3575abba5cb9dec6819a7912ed08a6d04b74e3e2 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 11 Mar 2020 15:52:15 -0400 Subject: [PATCH 005/192] remove postgres related configs Signed-off-by: Jacob Lisi --- operations/mimir/images.libsonnet | 1 - operations/mimir/postgresql.libsonnet | 29 --------------------------- 2 files changed, 30 deletions(-) delete mode 100644 operations/mimir/postgresql.libsonnet diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index d53daccb2df..8df2dc72735 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -3,7 +3,6 @@ // Various third-party images. memcached: 'memcached:1.5.17-alpine', memcachedExporter: 'prom/memcached-exporter:v0.6.0', - postgresql: 'postgres:9.6.11-alpine', // Our services. 
cortex: 'cortexproject/cortex:master-fdcd992f', diff --git a/operations/mimir/postgresql.libsonnet b/operations/mimir/postgresql.libsonnet deleted file mode 100644 index bd69b496785..00000000000 --- a/operations/mimir/postgresql.libsonnet +++ /dev/null @@ -1,29 +0,0 @@ -{ - local container = $.core.v1.container, - local containerPort = $.core.v1.containerPort, - - _config+: { - pgUser: 'cortex', - pgPassword: '1234', - }, - - postgresql_container:: - container.new('postgres', $._images.postgresql) + - container.withPorts([ - containerPort.newNamed(name='postgresql', containerPort=5432), - ]) + - container.withEnvMap({ - POSTGRES_USER: $._config.pgUser, - POSTGRES_DB: 'configs', - }) + - $.util.resourcesRequests('2', '1Gi') + - $.util.resourcesLimits('4', '2Gi'), - - local deployment = $.apps.v1beta1.deployment, - postgresql_deployment: - deployment.new('postgresql', 1, [$.postgresql_container]), - - local service = $.core.v1.service, - postgresql_service: - $.util.serviceFor($.postgresql_deployment), -} From 7ac28153a50a0d0866b6003764b2d403c95befce Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 12 Mar 2020 14:49:41 +0100 Subject: [PATCH 006/192] Allow to configure Cortex querier and compactor disk size and class Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 20eabb1c5af..cbd3470c508 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -10,8 +10,13 @@ storage_backend: 'none', storage_engine: 'tsdb', - // Allow to configure the querier disk size based on the cluster size. + // Allow to configure the querier disk. cortex_querier_data_disk_size: '10Gi', + cortex_querier_data_disk_class: 'standard', + + // Allow to configure the compactor disk. 
+ cortex_compactor_data_disk_size: '250Gi', + cortex_compactor_data_disk_class: 'standard', }, // The querier should run on a dedicated volume used to sync TSDB @@ -24,7 +29,7 @@ pvc.new() + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_querier_data_disk_size }) + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName('standard') + + pvc.mixin.spec.withStorageClassName($._config.cortex_querier_data_disk_class) + pvc.mixin.metadata.withName('querier-data'), querier_args+:: { @@ -109,9 +114,9 @@ // it does not support horizontal scalability yet. local compactor_data_pvc = pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: '250Gi' }) + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_compactor_data_disk_size }) + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName('standard') + + pvc.mixin.spec.withStorageClassName($._config.cortex_compactor_data_disk_class) + pvc.mixin.metadata.withName('compactor-data'), compactor_args:: From 1f79f99740224f486c36178ce374341da728c14c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 13 Mar 2020 14:48:06 +0100 Subject: [PATCH 007/192] Fixed distributor conditional settings Signed-off-by: Marco Pracucci --- operations/mimir/distributor.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index f27bb87e3e2..abe71a47438 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -25,7 +25,7 @@ // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at // around 1.25G, reducing the 99%ile. 
'mem-ballast-size-bytes': 1 << 30, // 1GB - } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + } + (if !$._config.ingestion_rate_global_limit_enabled then {} else { 'distributor.ingestion-rate-limit-strategy': 'global', 'distributor.ingestion-rate-limit': 100000, // 100K 'distributor.ingestion-burst-size': 1000000, // 1M @@ -36,14 +36,14 @@ 'distributor.ring.consul.watch-rate-limit': 1, 'distributor.ring.consul.watch-burst-size': 1, 'distributor.ring.prefix': '', - } + if !$._config.distributor_short_grpc_keepalive_enabled then {} else { + }) + (if !$._config.distributor_short_grpc_keepalive_enabled then {} else { // The cortex-gateway should frequently reopen the connections towards the // distributors in order to guarantee that new distributors receive traffic // as soon as they're ready. 'server.grpc.keepalive.max-connection-age': '2m', 'server.grpc.keepalive.max-connection-age-grace': '5m', 'server.grpc.keepalive.max-connection-idle': '1m', - }, + }), distributor_ports:: $.util.defaultPorts, From d2818ec1cbd25587036883c8a2efaa7c37efbec9 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Mon, 16 Mar 2020 15:33:52 -0400 Subject: [PATCH 008/192] adds sharding support to cortex lib Signed-off-by: Owen Diehl --- operations/mimir/config.libsonnet | 14 +++++-- operations/mimir/querier.libsonnet | 16 ++++++-- operations/mimir/query-frontend.libsonnet | 47 ++++++++++++++++++----- 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index b120ebb07aa..e4f6893863d 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -29,7 +29,12 @@ test_exporter_start_time: error 'must specify test exporter start time', test_exporter_user_id: error 'must specify test exporter used id', - querierConcurrency: 8, + // The expectation is that if sharding is enabled, we can force more (smaller) + // queries on the queriers. 
However this can't be extended too far because most queries + // concern recent (ingester) data, which isn't sharded. Therefore, we must strike a balance + // which allows us to process more sharded queries in parallel when requested, but not overload + // queriers during normal queries. + querierConcurrency: if self.sharded_queries_enabled then 16 else 8, querier_ingester_streaming_enabled: $._config.storage_engine != 'tsdb', jaeger_agent_host: null, @@ -51,6 +56,8 @@ ingestion_rate_global_limit_enabled: false, + sharded_queries_enabled: true, + // The query-tee is an optional service which can be used to send // the same input query to multiple backends and make them compete // (comparing performances). @@ -122,8 +129,9 @@ 'querier.batch-iterators': true, // Don't query ingesters for older queries. - // Chunks are 6hrs right now. Add some slack for safety. - 'querier.query-ingesters-within': '12h', + // Chunks are 6hrs right now. Add some slack for safety although not too much + // if sharded queries are enabled because they only shard non ingester queries. + 'querier.query-ingesters-within': if self.sharded_queries_enabled then '6h15m' else '12h', 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 20ea8a06826..cf3feee4403 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -1,6 +1,10 @@ { local container = $.core.v1.container, + querier_params:: { + replicas: if $._config.sharded_queries_enabled then 12 else 6, + }, + querier_args:: $._config.ringConfig + $._config.storeConfig + @@ -18,7 +22,7 @@ 'querier.max-concurrent': $._config.querierConcurrency, // Limit to N/2 worker threads per frontend, as we have two frontends. 
- 'querier.worker-parallelism': $._config.querierConcurrency / 2, + 'querier.worker-parallelism': $._config.querierConcurrency / $.query_frontend_params.replicas, 'querier.frontend-address': 'query-frontend.%(namespace)s.svc.cluster.local:9095' % $._config, 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, @@ -35,10 +39,14 @@ container.new('querier', $._images.querier) + container.withPorts($.querier_ports) + container.withArgsMixin($.util.mapToFlags($.querier_args)) + - $.util.resourcesRequests('1', '12Gi') + - $.util.resourcesLimits(null, '24Gi') + $.jaeger_mixin + - container.withEnvMap($.querier_env_map), + container.withEnvMap($.querier_env_map) + + if $._config.sharded_queries_enabled then + $.util.resourcesRequests('3', '12Gi') + + $.util.resourcesLimits(null, '24Gi') + else + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '24Gi'), local deployment = $.apps.v1beta1.deployment, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 08bc282301d..f6bcbfcb9b8 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -1,5 +1,21 @@ { local container = $.core.v1.container, + local deployment = $.apps.v1beta1.deployment, + local service = $.core.v1.service, + + query_frontend_params:: { + replicas: 2, // number of frontends to run + shard_factor: 16, // v10 schema shard factor + + // Queries can technically be sharded an arbitrary number of times. Thus query_split_factor is used + // as a coefficient to multiply the frontend tenant queues by. The idea is that this + // yields a bit of headroom so tenant queues aren't underprovisioned. Therefore the split factor + // should be represent the highest reasonable split factor for a query. If too low, a long query + // (i.e. 30d) with a high split factor (i.e. 
5) would result in + // (day_splits * shard_factor * split_factor) or 30 * 16 * 5 = 2400 sharded queries, which may be + // more than the max queue size and thus would always error. + query_split_factor:: 6, + }, query_frontend_args:: { target: 'query-frontend', @@ -35,25 +51,38 @@ // Limit queries to 500 days, allow this to be override per-user. 'store.max-query-length': '12000h', // 500 Days 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', - }, + } + if $._config.sharded_queries_enabled then { + 'querier.parallelise-shardable-queries': 'true', + + // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. + // basically base * shard_factor * query_split_factor / num_frontends where + 'querier.max-outstanding-requests-per-tenant': std.floor(200 * self.query_frontend_params.shard_factor * self.query_frontend_params.query_split_factor / self.query_frontend_params.replicas), + + // per request parallelism factor passed to doRequests -- not a good solution. 
+ 'querier.max-query-parallelism': 14 * self.query_frontend_params.shard_factor, + 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], + } + $._config.storageConfig + else {}, query_frontend_container:: container.new('query-frontend', $._images.query_frontend) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + - $.util.resourcesRequests('2', '600Mi') + - $.util.resourcesLimits(null, '1200Mi') + - $.jaeger_mixin, - - local deployment = $.apps.v1beta1.deployment, + $.jaeger_mixin + + if $._config.sharded_queries_enabled then + $.util.resourcesRequests('2', '2Gi') + + $.util.resourcesLimits(null, '6Gi') + + container.withEnvMap({ + JAEGER_REPORTER_MAX_QUEUE_SIZE: '5000', + }) + else $.util.resourcesRequests('2', '600Mi') + + $.util.resourcesLimits(null, '1200Mi'), query_frontend_deployment: - deployment.new('query-frontend', 2, [$.query_frontend_container]) + + deployment.new('query-frontend', self.query_frontend_params.replicas, [$.query_frontend_container]) + $.util.configVolumeMount('overrides', '/etc/cortex') + $.util.antiAffinity, - local service = $.core.v1.service, - query_frontend_service: $.util.serviceFor($.query_frontend_deployment) + // Make sure that query frontend worker, running in the querier, do resolve From 4ae5183623eff9dd9ac3c5f44770fe3bc00971f9 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Mon, 16 Mar 2020 17:35:11 -0400 Subject: [PATCH 009/192] reference path fixes, reduces query_split_factor, removes max-query-paralllelism overrides Signed-off-by: Owen Diehl --- operations/mimir/config.libsonnet | 2 +- operations/mimir/querier.libsonnet | 2 +- operations/mimir/query-frontend.libsonnet | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index e4f6893863d..feba3f55ca9 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ 
-131,7 +131,7 @@ // Don't query ingesters for older queries. // Chunks are 6hrs right now. Add some slack for safety although not too much // if sharded queries are enabled because they only shard non ingester queries. - 'querier.query-ingesters-within': if self.sharded_queries_enabled then '6h15m' else '12h', + 'querier.query-ingesters-within': if $._config.sharded_queries_enabled then '6h15m' else '12h', 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index cf3feee4403..8a093f4446e 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -53,7 +53,7 @@ querier_deployment_labels: {}, querier_deployment: - deployment.new('querier', 3, [$.querier_container], $.querier_deployment_labels) + + deployment.new('querier', $.querier_params.replicas, [$.querier_container], $.querier_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex') + $.storage_config_mixin, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index f6bcbfcb9b8..72b32f7c375 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -14,7 +14,7 @@ // (i.e. 30d) with a high split factor (i.e. 5) would result in // (day_splits * shard_factor * split_factor) or 30 * 16 * 5 = 2400 sharded queries, which may be // more than the max queue size and thus would always error. - query_split_factor:: 6, + query_split_factor:: 3, }, query_frontend_args:: { @@ -56,10 +56,8 @@ // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. 
// basically base * shard_factor * query_split_factor / num_frontends where - 'querier.max-outstanding-requests-per-tenant': std.floor(200 * self.query_frontend_params.shard_factor * self.query_frontend_params.query_split_factor / self.query_frontend_params.replicas), + 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $.query_frontend_params.shard_factor * $.query_frontend_params.query_split_factor / $.query_frontend_params.replicas), - // per request parallelism factor passed to doRequests -- not a good solution. - 'querier.max-query-parallelism': 14 * self.query_frontend_params.shard_factor, 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], } + $._config.storageConfig else {}, From b3671f3e7d3ca4facbf959bc63e663b93821a3f4 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Mon, 16 Mar 2020 17:44:43 -0400 Subject: [PATCH 010/192] sharded_queries_enabled defaults to false Signed-off-by: Owen Diehl --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index feba3f55ca9..1cb67955003 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -56,7 +56,7 @@ ingestion_rate_global_limit_enabled: false, - sharded_queries_enabled: true, + sharded_queries_enabled: false, // The query-tee is an optional service which can be used to send // the same input query to multiple backends and make them compete From 997be11b8ff9ffbc4371f324e4eee5f21c470263 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Tue, 17 Mar 2020 14:05:42 -0400 Subject: [PATCH 011/192] querier/frontend configs derived from _config params Signed-off-by: Owen Diehl --- operations/mimir/config.libsonnet | 30 ++++++++++++++------ operations/mimir/querier.libsonnet | 24 +++++++--------- operations/mimir/query-frontend.libsonnet | 34 +++++++++-------------- 3 files changed, 45 insertions(+), 43 deletions(-) diff --git 
a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 1cb67955003..ee280bac9bf 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -34,8 +34,25 @@ // concern recent (ingester) data, which isn't sharded. Therefore, we must strike a balance // which allows us to process more sharded queries in parallel when requested, but not overload // queriers during normal queries. - querierConcurrency: if self.sharded_queries_enabled then 16 else 8, - querier_ingester_streaming_enabled: $._config.storage_engine != 'tsdb', + querier: { + replicas: if $._config.queryFrontend.sharded_queries_enabled then 12 else 6, + concurrency: if $._config.queryFrontend.sharded_queries_enabled then 16 else 8, + ingester_streaming_enabled: $._config.storage_engine != 'tsdb', + }, + + queryFrontend: { + replicas: 2, + shard_factor: 16, // v10 schema shard factor + sharded_queries_enabled: false, + // Queries can technically be sharded an arbitrary number of times. Thus query_split_factor is used + // as a coefficient to multiply the frontend tenant queues by. The idea is that this + // yields a bit of headroom so tenant queues aren't underprovisioned. Therefore the split factor + // should be represent the highest reasonable split factor for a query. If too low, a long query + // (i.e. 30d) with a high split factor (i.e. 5) would result in + // (day_splits * shard_factor * split_factor) or 30 * 16 * 5 = 2400 sharded queries, which may be + // more than the max queue size and thus would always error. + query_split_factor:: 3, + }, jaeger_agent_host: null, @@ -56,8 +73,6 @@ ingestion_rate_global_limit_enabled: false, - sharded_queries_enabled: false, - // The query-tee is an optional service which can be used to send // the same input query to multiple backends and make them compete // (comparing performances). 
@@ -125,13 +140,12 @@ // Shared between the Ruler and Querier queryConfig: { // Use iterators to merge chunks, to reduce memory usage. - 'querier.ingester-streaming': $._config.querier_ingester_streaming_enabled, + 'querier.ingester-streaming': $._config.querier.ingester_streaming_enabled, 'querier.batch-iterators': true, // Don't query ingesters for older queries. - // Chunks are 6hrs right now. Add some slack for safety although not too much - // if sharded queries are enabled because they only shard non ingester queries. - 'querier.query-ingesters-within': if $._config.sharded_queries_enabled then '6h15m' else '12h', + // Chunks are 6hrs right now. Add some slack for safety. + 'querier.query-ingesters-within': '12h', 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 8a093f4446e..c850397f736 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -1,10 +1,6 @@ { local container = $.core.v1.container, - querier_params:: { - replicas: if $._config.sharded_queries_enabled then 12 else 6, - }, - querier_args:: $._config.ringConfig + $._config.storeConfig + @@ -19,10 +15,10 @@ 'server.http-write-timeout': '1m', // Limit query concurrency to prevent multi large queries causing an OOM. - 'querier.max-concurrent': $._config.querierConcurrency, + 'querier.max-concurrent': $._config.querier.concurrency, // Limit to N/2 worker threads per frontend, as we have two frontends. 
- 'querier.worker-parallelism': $._config.querierConcurrency / $.query_frontend_params.replicas, + 'querier.worker-parallelism': $._config.querier.concurrency / $._config.queryFrontend.replicas, 'querier.frontend-address': 'query-frontend.%(namespace)s.svc.cluster.local:9095' % $._config, 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, @@ -40,20 +36,20 @@ container.withPorts($.querier_ports) + container.withArgsMixin($.util.mapToFlags($.querier_args)) + $.jaeger_mixin + - container.withEnvMap($.querier_env_map) + - if $._config.sharded_queries_enabled then - $.util.resourcesRequests('3', '12Gi') + - $.util.resourcesLimits(null, '24Gi') - else - $.util.resourcesRequests('1', '12Gi') + - $.util.resourcesLimits(null, '24Gi'), + container.withEnvMap($.querier_env_map) + + if $._config.queryFrontend.sharded_queries_enabled then + $.util.resourcesRequests('3', '12Gi') + + $.util.resourcesLimits(null, '24Gi') + else + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '24Gi'), local deployment = $.apps.v1beta1.deployment, querier_deployment_labels: {}, querier_deployment: - deployment.new('querier', $.querier_params.replicas, [$.querier_container], $.querier_deployment_labels) + + deployment.new('querier', $._config.querier.replicas, [$.querier_container], $.querier_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex') + $.storage_config_mixin, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 72b32f7c375..6012d591b5f 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -1,21 +1,5 @@ { local container = $.core.v1.container, - local deployment = $.apps.v1beta1.deployment, - local service = $.core.v1.service, - - query_frontend_params:: { - replicas: 2, // number of frontends to run - shard_factor: 16, // v10 schema shard factor - - // Queries can technically be sharded an arbitrary number of 
times. Thus query_split_factor is used - // as a coefficient to multiply the frontend tenant queues by. The idea is that this - // yields a bit of headroom so tenant queues aren't underprovisioned. Therefore the split factor - // should be represent the highest reasonable split factor for a query. If too low, a long query - // (i.e. 30d) with a high split factor (i.e. 5) would result in - // (day_splits * shard_factor * split_factor) or 30 * 16 * 5 = 2400 sharded queries, which may be - // more than the max queue size and thus would always error. - query_split_factor:: 3, - }, query_frontend_args:: { target: 'query-frontend', @@ -51,12 +35,12 @@ // Limit queries to 500 days, allow this to be override per-user. 'store.max-query-length': '12000h', // 500 Days 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', - } + if $._config.sharded_queries_enabled then { + } + if $._config.queryFrontend.sharded_queries_enabled then { 'querier.parallelise-shardable-queries': 'true', // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. 
// basically base * shard_factor * query_split_factor / num_frontends where - 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $.query_frontend_params.shard_factor * $.query_frontend_params.query_split_factor / $.query_frontend_params.replicas), + 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas), 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], } + $._config.storageConfig @@ -67,7 +51,7 @@ container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + $.jaeger_mixin + - if $._config.sharded_queries_enabled then + if $._config.queryFrontend.sharded_queries_enabled then $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits(null, '6Gi') + container.withEnvMap({ @@ -76,10 +60,18 @@ else $.util.resourcesRequests('2', '600Mi') + $.util.resourcesLimits(null, '1200Mi'), + local deployment = $.apps.v1beta1.deployment, + query_frontend_deployment: - deployment.new('query-frontend', self.query_frontend_params.replicas, [$.query_frontend_container]) + + deployment.new('query-frontend', $._config.queryFrontend.replicas, [$.query_frontend_container]) + $.util.configVolumeMount('overrides', '/etc/cortex') + - $.util.antiAffinity, + $.util.antiAffinity + + // inject storage schema in order to know what/how to shard + if $._config.queryFrontend.sharded_queries_enabled then + $.storage_config_mixin + else {}, + + local service = $.core.v1.service, query_frontend_service: $.util.serviceFor($.query_frontend_deployment) + From 2d968ccedaab7a744b36757954704134fb3f8a6f Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 18 Mar 2020 11:17:28 +0100 Subject: [PATCH 012/192] -config-yaml has been renamed to -schema-config-file in Cortex 0.7 Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index ee280bac9bf..738dac63bb9 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -122,7 +122,7 @@ $._config.client_configs.cassandra + $._config.client_configs.gcp + $._config.storageTSDBConfig + - { 'config-yaml': '/etc/cortex/schema/config.yaml' }, + { 'schema-config-file': '/etc/cortex/schema/config.yaml' }, // TSDB blocks storage configuration, used only when 'tsdb' storage // engine is explicitly enabled. From 53e56e7c19683ef63b0fb9f010a54f951d828c2e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 18 Mar 2020 11:19:22 +0100 Subject: [PATCH 013/192] Removed distributor_short_grpc_keepalive_enabled feature flag (should be always enabled) Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 2 -- operations/mimir/distributor.libsonnet | 14 +++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 738dac63bb9..6eef728f0b4 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -61,8 +61,6 @@ storage_engine: 'chunks', storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', - distributor_short_grpc_keepalive_enabled: false, - // TSDB storage engine doesn't require the table manager. table_manager_enabled: $._config.storage_engine != 'tsdb', diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index abe71a47438..dff9130549d 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -25,6 +25,13 @@ // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at // around 1.25G, reducing the 99%ile. 
'mem-ballast-size-bytes': 1 << 30, // 1GB + + // The cortex-gateway should frequently reopen the connections towards the + // distributors in order to guarantee that new distributors receive traffic + // as soon as they're ready. + 'server.grpc.keepalive.max-connection-age': '2m', + 'server.grpc.keepalive.max-connection-age-grace': '5m', + 'server.grpc.keepalive.max-connection-idle': '1m', } + (if !$._config.ingestion_rate_global_limit_enabled then {} else { 'distributor.ingestion-rate-limit-strategy': 'global', 'distributor.ingestion-rate-limit': 100000, // 100K @@ -36,13 +43,6 @@ 'distributor.ring.consul.watch-rate-limit': 1, 'distributor.ring.consul.watch-burst-size': 1, 'distributor.ring.prefix': '', - }) + (if !$._config.distributor_short_grpc_keepalive_enabled then {} else { - // The cortex-gateway should frequently reopen the connections towards the - // distributors in order to guarantee that new distributors receive traffic - // as soon as they're ready. - 'server.grpc.keepalive.max-connection-age': '2m', - 'server.grpc.keepalive.max-connection-age-grace': '5m', - 'server.grpc.keepalive.max-connection-idle': '1m', }), distributor_ports:: $.util.defaultPorts, From b74a530566f85a92b3f738bec69cdb63608d59f8 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Wed, 18 Mar 2020 10:30:03 +0000 Subject: [PATCH 014/192] Blocks now supports streaming Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index ee280bac9bf..f19cb51c44e 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -37,7 +37,7 @@ querier: { replicas: if $._config.queryFrontend.sharded_queries_enabled then 12 else 6, concurrency: if $._config.queryFrontend.sharded_queries_enabled then 16 else 8, - ingester_streaming_enabled: $._config.storage_engine != 'tsdb', + ingester_streaming_enabled: true, }, 
queryFrontend: { From 299db59acffbf9bac510f47f4594b08b41363069 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 18 Mar 2020 12:28:14 +0100 Subject: [PATCH 015/192] Removed ingestion_rate_global_limit_enabled feature flag Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 21 +++++---------------- operations/mimir/distributor.libsonnet | 4 +--- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 517bd4fd6db..215fe366297 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -69,8 +69,6 @@ memcached_index_writes_enabled: $._config.storage_engine != 'tsdb', memcached_chunks_enabled: $._config.storage_engine != 'tsdb', - ingestion_rate_global_limit_enabled: false, - // The query-tee is an optional service which can be used to send // the same input query to multiple backends and make them compete // (comparing performances). @@ -197,8 +195,8 @@ // ingester limits are 6s (#replicas) / 3x (#replication factor) higher. 
// // small_user: { - // ingestion_rate: 10,000 - // ingestion_burst_size: 20,000 + // ingestion_rate: 100,000 + // ingestion_burst_size: 1,000,000 // // max_series_per_user: 0 (disabled) // max_series_per_metric: 0 (disabled) @@ -211,9 +209,6 @@ // }, medium_user:: { - ingestion_rate: 25000, - ingestion_burst_size: 50000, - max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit @@ -222,15 +217,12 @@ max_series_per_query: 100000, max_samples_per_query: 1000000, - } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M }, big_user:: { - ingestion_rate: 50000, - ingestion_burst_size: 70000, - max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit @@ -239,15 +231,12 @@ max_global_series_per_user: 6000000, // 6M max_global_series_per_metric: 600000, // 600K - } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + ingestion_rate: 700000, // 700K ingestion_burst_size: 7000000, // 7M }, super_user:: { - ingestion_rate: 200000, - ingestion_burst_size: 240000, - max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit @@ -256,7 +245,7 @@ max_series_per_query: 100000, max_samples_per_query: 1000000, - } + if !$._config.ingestion_rate_global_limit_enabled then {} else { + ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M }, diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index dff9130549d..a4b6a7df76d 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -8,8 +8,6 @@ { target: 'distributor', - 'distributor.ingestion-rate-limit': 10000, - 'distributor.ingestion-burst-size': 20000, 
'validation.reject-old-samples': true, 'validation.reject-old-samples.max-age': '12h', 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', @@ -32,7 +30,7 @@ 'server.grpc.keepalive.max-connection-age': '2m', 'server.grpc.keepalive.max-connection-age-grace': '5m', 'server.grpc.keepalive.max-connection-idle': '1m', - } + (if !$._config.ingestion_rate_global_limit_enabled then {} else { + 'distributor.ingestion-rate-limit-strategy': 'global', 'distributor.ingestion-rate-limit': 100000, // 100K 'distributor.ingestion-burst-size': 1000000, // 1M From 0f94c162c58bab13816a2f49b87cd19016641e99 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 18 Mar 2020 13:20:00 +0100 Subject: [PATCH 016/192] Fixed typo in mixin Signed-off-by: Marco Pracucci --- operations/mimir/distributor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index a4b6a7df76d..672e4b4e2cb 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -41,7 +41,7 @@ 'distributor.ring.consul.watch-rate-limit': 1, 'distributor.ring.consul.watch-burst-size': 1, 'distributor.ring.prefix': '', - }), + }, distributor_ports:: $.util.defaultPorts, From 0901ca573e19f2265340ed68db5cd381bbb8975e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 18 Mar 2020 15:42:14 +0100 Subject: [PATCH 017/192] Fixed lint issues Signed-off-by: Marco Pracucci --- operations/mimir/querier.libsonnet | 8 ++++---- operations/mimir/query-frontend.libsonnet | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index c850397f736..ba589515f5d 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -38,11 +38,11 @@ $.jaeger_mixin + container.withEnvMap($.querier_env_map) + if $._config.queryFrontend.sharded_queries_enabled then - 
$.util.resourcesRequests('3', '12Gi') + - $.util.resourcesLimits(null, '24Gi') + $.util.resourcesRequests('3', '12Gi') + + $.util.resourcesLimits(null, '24Gi') else - $.util.resourcesRequests('1', '12Gi') + - $.util.resourcesLimits(null, '24Gi'), + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '24Gi'), local deployment = $.apps.v1beta1.deployment, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 6012d591b5f..8e7f909930d 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -52,13 +52,14 @@ container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + $.jaeger_mixin + if $._config.queryFrontend.sharded_queries_enabled then - $.util.resourcesRequests('2', '2Gi') + - $.util.resourcesLimits(null, '6Gi') + - container.withEnvMap({ - JAEGER_REPORTER_MAX_QUEUE_SIZE: '5000', - }) - else $.util.resourcesRequests('2', '600Mi') + - $.util.resourcesLimits(null, '1200Mi'), + $.util.resourcesRequests('2', '2Gi') + + $.util.resourcesLimits(null, '6Gi') + + container.withEnvMap({ + JAEGER_REPORTER_MAX_QUEUE_SIZE: '5000', + }) + else + $.util.resourcesRequests('2', '600Mi') + + $.util.resourcesLimits(null, '1200Mi'), local deployment = $.apps.v1beta1.deployment, @@ -68,7 +69,7 @@ $.util.antiAffinity + // inject storage schema in order to know what/how to shard if $._config.queryFrontend.sharded_queries_enabled then - $.storage_config_mixin + $.storage_config_mixin else {}, local service = $.core.v1.service, From 5234fee9e836248adc7ad09aadb262a4d058349b Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Sat, 28 Mar 2020 16:44:43 +0530 Subject: [PATCH 018/192] Update the image of flusher (https://github.com/grafana/cortex-jsonnet/pull/19) Signed-off-by: Ganesh Vernekar --- operations/mimir/images.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 8df2dc72735..58ba62fab0b 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -13,7 +13,7 @@ query_frontend: self.cortex, tableManager: self.cortex, compactor: self.cortex, - flusher: 'ganeshve/cortex:flusher-target-5aac2d73', + flusher: self.cortex, query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', // TODO(gouthamve/jtlisi): Upstream the ruler and AM configs. ruler: 'jtlisi/cortex:20191122_ruler_with_api-4059a06d3', From 492fd009ad7f72d18b91e8fa817d7f774088bd53 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Tue, 31 Mar 2020 14:45:47 +0530 Subject: [PATCH 019/192] Remove CPU limits on distributor (https://github.com/grafana/cortex-jsonnet/pull/20) Signed-off-by: Ganesh Vernekar --- operations/mimir/distributor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 672e4b4e2cb..d377ce61483 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -50,7 +50,7 @@ container.withPorts($.distributor_ports) + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + $.util.resourcesRequests('2', '2Gi') + - $.util.resourcesLimits('6', '4Gi') + + $.util.resourcesLimits(null, '4Gi') + $.jaeger_mixin, local deployment = $.apps.v1beta1.deployment, From 5afcbd4f1ba2f8f296aefbd49230a3b5d24f576c Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 1 Apr 2020 19:22:41 +0530 Subject: [PATCH 020/192] Add PodDisruptionBudget for ingester (https://github.com/grafana/cortex-jsonnet/pull/22) Signed-off-by: Ganesh Vernekar --- operations/mimir/ingester.libsonnet | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 
a585806157d..f02996722b9 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -65,6 +65,7 @@ deployment.new('ingester', 3, [$.ingester_container], $.ingester_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex') + + deployment.mixin.metadata.withLabels({ name: 'ingester' }) + deployment.mixin.spec.withMinReadySeconds(60) + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + @@ -76,4 +77,13 @@ ingester_service: $.util.serviceFor($.ingester_deployment, $.ingester_service_ignored_labels), + + local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + + ingester_pdb: + podDisruptionBudget.new() + + podDisruptionBudget.mixin.metadata.withName('ingester-pdb') + + podDisruptionBudget.mixin.metadata.withLabels({ name: 'ingester-pdb' }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: $.ingester_deployment.metadata.labels.name }) + + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), } From 7d2bda6d81154912b00772ce63964b8bf68821ad Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 2 Apr 2020 11:12:59 +0530 Subject: [PATCH 021/192] Remove deep nesting in ingester PodDisruptionBudget (https://github.com/grafana/cortex-jsonnet/pull/25) Signed-off-by: Ganesh Vernekar --- operations/mimir/ingester.libsonnet | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index f02996722b9..f64339b373b 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -45,8 +45,10 @@ ingester_ports:: $.util.defaultPorts, + local name = 'ingester', + ingester_container:: - container.new('ingester', $._images.ingester) + + container.new(name, $._images.ingester) + container.withPorts($.ingester_ports) + 
container.withArgsMixin($.util.mapToFlags($.ingester_args)) + container.mixin.readinessProbe.httpGet.withPath('/ready') + @@ -62,10 +64,10 @@ ingester_deployment_labels:: {}, ingester_deployment: - deployment.new('ingester', 3, [$.ingester_container], $.ingester_deployment_labels) + + deployment.new(name, 3, [$.ingester_container], $.ingester_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex') + - deployment.mixin.metadata.withLabels({ name: 'ingester' }) + + deployment.mixin.metadata.withLabels({ name: name }) + deployment.mixin.spec.withMinReadySeconds(60) + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + @@ -84,6 +86,6 @@ podDisruptionBudget.new() + podDisruptionBudget.mixin.metadata.withName('ingester-pdb') + podDisruptionBudget.mixin.metadata.withLabels({ name: 'ingester-pdb' }) + - podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: $.ingester_deployment.metadata.labels.name }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: name }) + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), } From 51cd939dd405a6190e6d876800a26def5ce534fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 6 Apr 2020 14:25:34 +0200 Subject: [PATCH 022/192] Setup readiness probe for all Cortex components. 
(https://github.com/grafana/cortex-jsonnet/pull/30) --- operations/mimir/alertmanager.libsonnet | 1 + operations/mimir/common.libsonnet | 7 +++++++ operations/mimir/distributor.libsonnet | 1 + operations/mimir/flusher-job.libsonnet | 5 +---- operations/mimir/ingester.libsonnet | 5 +---- operations/mimir/querier.libsonnet | 1 + operations/mimir/query-frontend.libsonnet | 1 + operations/mimir/ruler.libsonnet | 1 + operations/mimir/table-manager.libsonnet | 1 + operations/mimir/tsdb.libsonnet | 5 +---- 10 files changed, 16 insertions(+), 12 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index fbcc6e4ef4e..3b4ca16b9f4 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -29,6 +29,7 @@ container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + container.withVolumeMountsMixin([volumeMount.new('alertmanager-data', '/data')]) + $.util.resourcesRequests('100m', '1Gi') + + $.util.readinessProbe + $.jaeger_mixin, diff --git a/operations/mimir/common.libsonnet b/operations/mimir/common.libsonnet index 2da29bacde6..9daf7ee57ea 100644 --- a/operations/mimir/common.libsonnet +++ b/operations/mimir/common.libsonnet @@ -4,11 +4,18 @@ util+:: { local containerPort = $.core.v1.containerPort, + local container = $.core.v1.container, defaultPorts:: [ containerPort.newNamed(name='http-metrics', containerPort=80), containerPort.newNamed(name='grpc', containerPort=9095), ], + + readinessProbe:: + container.mixin.readinessProbe.httpGet.withPath('/ready') + + container.mixin.readinessProbe.httpGet.withPort(80) + + container.mixin.readinessProbe.withInitialDelaySeconds(15) + + container.mixin.readinessProbe.withTimeoutSeconds(1), }, } diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index d377ce61483..18e2ff2c48a 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -51,6 +51,7 @@ 
container.withArgsMixin($.util.mapToFlags($.distributor_args)) + $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits(null, '4Gi') + + $.util.readinessProbe + $.jaeger_mixin, local deployment = $.apps.v1beta1.deployment, diff --git a/operations/mimir/flusher-job.libsonnet b/operations/mimir/flusher-job.libsonnet index 514a867de37..78eadeb7c5a 100644 --- a/operations/mimir/flusher-job.libsonnet +++ b/operations/mimir/flusher-job.libsonnet @@ -18,12 +18,9 @@ target: 'flusher', 'flusher.wal-dir': $._config.wal_dir, })) + - container.mixin.readinessProbe.httpGet.withPath('/ready') + - container.mixin.readinessProbe.httpGet.withPort(80) + - container.mixin.readinessProbe.withInitialDelaySeconds(15) + - container.mixin.readinessProbe.withTimeoutSeconds(1) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + + $.util.readinessProbe + $.jaeger_mixin, flusher_job_storage_config_mixin:: diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index f64339b373b..96065dcce4a 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -51,12 +51,9 @@ container.new(name, $._images.ingester) + container.withPorts($.ingester_ports) + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + - container.mixin.readinessProbe.httpGet.withPath('/ready') + - container.mixin.readinessProbe.httpGet.withPort(80) + - container.mixin.readinessProbe.withInitialDelaySeconds(15) + - container.mixin.readinessProbe.withTimeoutSeconds(1) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + + $.util.readinessProbe + $.jaeger_mixin, local deployment = $.apps.v1beta1.deployment, diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index ba589515f5d..86f1d1fa309 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -36,6 +36,7 @@ container.withPorts($.querier_ports) + 
container.withArgsMixin($.util.mapToFlags($.querier_args)) + $.jaeger_mixin + + $.util.readinessProbe + container.withEnvMap($.querier_env_map) + if $._config.queryFrontend.sharded_queries_enabled then $.util.resourcesRequests('3', '12Gi') + diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 8e7f909930d..670e709c74f 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -51,6 +51,7 @@ container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + $.jaeger_mixin + + $.util.readinessProbe + if $._config.queryFrontend.sharded_queries_enabled then $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits(null, '6Gi') + diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 81a55e91b48..b8bb44a9673 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -28,6 +28,7 @@ container.withArgsMixin($.util.mapToFlags($.ruler_args)) + $.util.resourcesRequests('1', '6Gi') + $.util.resourcesLimits('16', '16Gi') + + $.util.readinessProbe + $.jaeger_mixin, local deployment = $.apps.v1beta1.deployment, diff --git a/operations/mimir/table-manager.libsonnet b/operations/mimir/table-manager.libsonnet index b392cd93d29..d2f6e0da3f4 100644 --- a/operations/mimir/table-manager.libsonnet +++ b/operations/mimir/table-manager.libsonnet @@ -35,6 +35,7 @@ container.withArgsMixin($.util.mapToFlags($.table_manager_args)) + $.util.resourcesRequests('100m', '100Mi') + $.util.resourcesLimits('200m', '200Mi') + + $.util.readinessProbe + $.jaeger_mixin else {}, diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index cbd3470c508..7bd81492aa9 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -41,10 +41,6 @@ }, querier_container+:: - container.mixin.readinessProbe.httpGet.withPath('/ready') + - 
container.mixin.readinessProbe.httpGet.withPort(80) + - container.mixin.readinessProbe.withInitialDelaySeconds(5) + - container.mixin.readinessProbe.withTimeoutSeconds(1) + container.withVolumeMountsMixin([ volumeMount.new('querier-data', '/data'), ]), @@ -139,6 +135,7 @@ container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + $.util.resourcesRequests('1', '6Gi') + $.util.resourcesLimits('1', '6Gi') + + $.util.readinessProbe + $.jaeger_mixin, compactor_statefulset: From 47b712cf6ac2d61b0197a554bab446318f2c22ad Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Mon, 6 Apr 2020 13:03:08 -0700 Subject: [PATCH 023/192] remove gateway recording rules and alerts Signed-off-by: Callum Styan --- operations/mimir/distributor.libsonnet | 3 --- 1 file changed, 3 deletions(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 18e2ff2c48a..cce8ddc5a8e 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -24,9 +24,6 @@ // around 1.25G, reducing the 99%ile. 'mem-ballast-size-bytes': 1 << 30, // 1GB - // The cortex-gateway should frequently reopen the connections towards the - // distributors in order to guarantee that new distributors receive traffic - // as soon as they're ready. 
'server.grpc.keepalive.max-connection-age': '2m', 'server.grpc.keepalive.max-connection-age-grace': '5m', 'server.grpc.keepalive.max-connection-idle': '1m', From 07834490253b3204d71965d83e2db06baa0452a2 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Wed, 15 Apr 2020 17:00:07 +0100 Subject: [PATCH 024/192] This is the config for internal r81 release (1.0.0) Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 18 +++--------------- operations/mimir/distributor.libsonnet | 3 --- operations/mimir/images.libsonnet | 5 +++-- operations/mimir/ingester.libsonnet | 4 ---- operations/mimir/query-frontend.libsonnet | 1 - operations/mimir/table-manager.libsonnet | 14 ++------------ 6 files changed, 8 insertions(+), 37 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 215fe366297..ad47b8eaede 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -37,7 +37,6 @@ querier: { replicas: if $._config.queryFrontend.sharded_queries_enabled then 12 else 6, concurrency: if $._config.queryFrontend.sharded_queries_enabled then 16 else 8, - ingester_streaming_enabled: true, }, queryFrontend: { @@ -105,11 +104,9 @@ storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled then { - 'memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, - 'memcached.service': 'memcached-client', - 'memcached.timeout': '3s', - 'memcached.batchsize': 1024, - 'memcached.consistent-hash': true, + 'store.chunks-cache.memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, + 'store.chunks-cache.memcached.service': 'memcached-client', + 'store.chunks-cache.memcached.timeout': '3s', } else {}, @@ -135,10 +132,6 @@ // Shared between the Ruler and Querier queryConfig: { - // Use iterators to merge chunks, to reduce memory usage. 
- 'querier.ingester-streaming': $._config.querier.ingester_streaming_enabled, - 'querier.batch-iterators': true, - // Don't query ingesters for older queries. // Chunks are 6hrs right now. Add some slack for safety. 'querier.query-ingesters-within': '12h', @@ -165,7 +158,6 @@ 'store.index-cache-read.memcached.hostname': 'memcached-index-queries.%(namespace)s.svc.cluster.local' % $._config, 'store.index-cache-read.memcached.service': 'memcached-client', 'store.index-cache-read.memcached.timeout': '500ms', - 'store.index-cache-read.memcached.consistent-hash': true, 'store.cache-lookups-older-than': '36h', } else {} @@ -173,10 +165,7 @@ ringConfig: { 'consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - 'consul.consistent-reads': false, 'ring.prefix': '', - 'consul.watch-rate-limit': 1, - 'consul.watch-burst-size': 5, }, // Some distributor config is shared with the querier. @@ -185,7 +174,6 @@ 'distributor.shard-by-all-labels': true, 'distributor.health-check-ingesters': true, 'ring.heartbeat-timeout': '10m', - 'consul.consistent-reads': false, }, overrides: { diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index cce8ddc5a8e..aca539a503f 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -34,9 +34,6 @@ // The ingestion rate global limit requires the distributors to form a ring. 'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - 'distributor.ring.consul.consistent-reads': false, - 'distributor.ring.consul.watch-rate-limit': 1, - 'distributor.ring.consul.watch-burst-size': 1, 'distributor.ring.prefix': '', }, diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 58ba62fab0b..edce2aa80f1 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. 
- cortex: 'cortexproject/cortex:master-fdcd992f', + cortex: 'cortexproject/cortex:v1.0.0', distributor: self.cortex, ingester: self.cortex, @@ -14,9 +14,10 @@ tableManager: self.cortex, compactor: self.cortex, flusher: self.cortex, + ruler: self.cortex, + query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', // TODO(gouthamve/jtlisi): Upstream the ruler and AM configs. - ruler: 'jtlisi/cortex:20191122_ruler_with_api-4059a06d3', alertmanager: 'jtlisi/cortex:20190819_alertmanager_update-faa66aa43', testExporter: 'cortexproject/test-exporter:master-be013707', }, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 96065dcce4a..cf94c49cd48 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -13,16 +13,13 @@ 'ingester.num-tokens': 512, 'ingester.join-after': '30s', 'ingester.max-transfer-retries': 60, // Each retry is backed off by 5s, so 5mins for new ingester to come up. - 'ingester.claim-on-rollout': true, 'ingester.heartbeat-period': '15s', 'ingester.max-stale-chunk-idle': '5m', - 'ingester.normalise-tokens': true, // Chunk building/flushing config. 'ingester.chunk-encoding': 3, // Bigchunk encoding 'ingester.retain-period': '15m', 'ingester.max-chunk-age': '6h', - 'ingester.spread-flushes': true, // Limits config. 'ingester.max-chunk-idle': $._config.max_chunk_idle, @@ -38,7 +35,6 @@ // Setup index write deduping. 
'store.index-cache-write.memcached.hostname': 'memcached-index-writes.%(namespace)s.svc.cluster.local' % $._config, 'store.index-cache-write.memcached.service': 'memcached-client', - 'store.index-cache-write.memcached.consistent-hash': true, } else {} ), diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 670e709c74f..cd4789df2ea 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -20,7 +20,6 @@ 'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace, 'frontend.memcached.service': 'memcached-client', 'frontend.memcached.timeout': '500ms', - 'frontend.memcached.consistent-hash': true, // So that exporters like cloudwatch can still send in data and be un-cached. 'frontend.max-cache-freshness': '10m', diff --git a/operations/mimir/table-manager.libsonnet b/operations/mimir/table-manager.libsonnet index d2f6e0da3f4..45b97f4a3ea 100644 --- a/operations/mimir/table-manager.libsonnet +++ b/operations/mimir/table-manager.libsonnet @@ -6,26 +6,16 @@ { target: 'table-manager', - // Cassandra / BigTable doesn't use these fields, so set them to zero - 'dynamodb.chunk-table.inactive-read-throughput': 0, - 'dynamodb.chunk-table.inactive-write-throughput': 0, - 'dynamodb.chunk-table.read-throughput': 0, - 'dynamodb.chunk-table.write-throughput': 0, - 'dynamodb.periodic-table.inactive-read-throughput': 0, - 'dynamodb.periodic-table.inactive-write-throughput': 0, - 'dynamodb.periodic-table.read-throughput': 0, - 'dynamodb.periodic-table.write-throughput': 0, - // Rate limit Bigtable Admin calls. Google seem to limit to ~100QPS, // and given 2yrs worth of tables (~100) a sync will table 20s. This // allows you to run upto 20 independant Cortex clusters on the same // Google project before running into issues. 
- 'dynamodb.poll-interval': '10m', - 'dynamodb.periodic-table.grace-period': '3h', 'bigtable.grpc-client-rate-limit': 5.0, 'bigtable.grpc-client-rate-limit-burst': 5, 'bigtable.backoff-on-ratelimits': true, 'bigtable.table-cache.enabled': true, + 'table-manager.poll-interval': '10m', + 'table-manager.periodic-table.grace-period': '3h', }, table_manager_container:: From d80199ea47dd718aac01f5a65d9259572abfe8e6 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 17 Apr 2020 15:49:38 +0200 Subject: [PATCH 025/192] Added store-gateway support Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 32 +++++++++++++++------- operations/mimir/images.libsonnet | 1 + operations/mimir/tsdb.libsonnet | 44 +++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 10 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index ad47b8eaede..081e3ad122f 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -59,6 +59,7 @@ // to switch to tsdb storage. storage_engine: 'chunks', storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', + store_gateway_enabled: false, // TSDB storage engine doesn't require the table manager. table_manager_enabled: $._config.storage_engine != 'tsdb', @@ -119,16 +120,27 @@ // TSDB blocks storage configuration, used only when 'tsdb' storage // engine is explicitly enabled. 
- storageTSDBConfig: if $._config.storage_engine == 'tsdb' then { - 'store.engine': 'tsdb', - 'experimental.tsdb.dir': '/data/tsdb', - 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', - 'experimental.tsdb.block-ranges-period': '2h', - 'experimental.tsdb.retention-period': '1h', - 'experimental.tsdb.ship-interval': '1m', - 'experimental.tsdb.backend': 'gcs', - 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, - } else {}, + storageTSDBConfig: ( + if $._config.storage_engine != 'tsdb' then {} else { + 'store.engine': 'tsdb', + 'experimental.tsdb.dir': '/data/tsdb', + 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', + 'experimental.tsdb.block-ranges-period': '2h', + 'experimental.tsdb.retention-period': '1h', + 'experimental.tsdb.ship-interval': '1m', + 'experimental.tsdb.backend': 'gcs', + 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, + 'experimental.tsdb.store-gateway-enabled': $._config.store_gateway_enabled, + } + ) + ( + if $._config.storage_engine != 'tsdb' || !$._config.store_gateway_enabled then {} else { + 'experimental.store-gateway.sharding-enabled': true, + 'experimental.store-gateway.sharding-ring.store': 'consul', + 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'experimental.store-gateway.sharding-ring.prefix': '', + 'experimental.store-gateway.replication-factor': 3, + } + ), // Shared between the Ruler and Querier queryConfig: { diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index edce2aa80f1..ba47b5f96ca 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -15,6 +15,7 @@ compactor: self.cortex, flusher: self.cortex, ruler: self.cortex, + store_gateway: self.cortex, query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', // TODO(gouthamve/jtlisi): Upstream the ruler and AM configs. 
diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 7bd81492aa9..f6774830a3c 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -14,6 +14,10 @@ cortex_querier_data_disk_size: '10Gi', cortex_querier_data_disk_class: 'standard', + // Allow to configure the store-gateway disk. + cortex_store_gateway_data_disk_size: '50Gi', + cortex_store_gateway_data_disk_class: 'standard', + // Allow to configure the compactor disk. cortex_compactor_data_disk_size: '250Gi', cortex_compactor_data_disk_class: 'standard', @@ -149,4 +153,44 @@ statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900), + // The store-gateway runs a statefulset. + local store_gateway_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_store_gateway_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_store_gateway_data_disk_class) + + pvc.mixin.metadata.withName('store-gateway-data'), + + store_gateway_args:: + $._config.storageConfig + { + target: 'store-gateway', + + // Persist ring tokens so that when the store-gateway will be restarted + // it will pick the same tokens + 'experimental.store-gateway.tokens-file-path': '/data/tokens', + }, + + store_gateway_ports:: $.util.defaultPorts, + + store_gateway_container:: + container.new('store-gateway', $._images.store_gateway) + + container.withPorts($.store_gateway_ports) + + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + + $.util.resourcesRequests('1', '6Gi') + + $.util.resourcesLimits('1', '6Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + store_gateway_statefulset: if !$._config.store_gateway_enabled then {} else + statefulSet.new('store-gateway', 3, [$.store_gateway_container], 
store_gateway_data_pvc) + .withServiceName('store-gateway') + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'store-gateway' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'store-gateway' }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120), } From 33890841e099f7f19fa5c439437cc6f6fd83c70a Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 21 Apr 2020 14:34:51 -0400 Subject: [PATCH 026/192] update ruler jsonnet Signed-off-by: Jacob Lisi --- operations/mimir/config.libsonnet | 21 +++++++++++++++++++++ operations/mimir/ruler.libsonnet | 5 +---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 081e3ad122f..acb97ff0868 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -188,6 +188,27 @@ 'ring.heartbeat-timeout': '10m', }, + ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3)', + # TODO: Generic client generating functions would be nice. + ruler_s3_bucket_name: $._config.s3_bucket_name, + ruler_gcs_bucket_name: error 'must specify a GCS bucket name', + + rulerClientConfig: + { + 'ruler.storage.type': $._config.ruler_client_type, + } + + { + 'configdb': { + 'configs_api_url': 'config.%s.svc.cluster.local' % $._config.namespace, + }, + 'gcs': { + 'ruler.storage.gcs.bucketname': $._config.ruler_gcs_bucket_name, + }, + 's3': { + 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.s3_bucket_name], + }, + }[$._config.ruler_client_type], + overrides: { // === Per-tenant usage limits. 
=== // diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index b8bb44a9673..f6860cff757 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -7,6 +7,7 @@ $._config.storageConfig + $._config.queryConfig + $._config.distributorConfig + + $._config.rulerClientConfig + { target: 'ruler', // Alertmanager configs @@ -16,10 +17,6 @@ // Ring Configs 'ruler.enable-sharding': true, 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - - // Rule Storage Configs - 'ruler.storage.type': 'gcs', - 'rules.gcs.bucketname': '%(cluster)s-cortex-configdb-%(namespace)s' % $._config, }, ruler_container:: From 93f7358b5c2e099c7425749694847d57ca8942ce Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 21 Apr 2020 14:41:35 -0400 Subject: [PATCH 027/192] format jsonnet Signed-off-by: Jacob Lisi --- operations/mimir/config.libsonnet | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index acb97ff0868..46c38b65859 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -189,7 +189,7 @@ }, ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3)', - # TODO: Generic client generating functions would be nice. + // TODO: Generic client generating functions would be nice. 
ruler_s3_bucket_name: $._config.s3_bucket_name, ruler_gcs_bucket_name: error 'must specify a GCS bucket name', @@ -198,16 +198,16 @@ 'ruler.storage.type': $._config.ruler_client_type, } + { - 'configdb': { - 'configs_api_url': 'config.%s.svc.cluster.local' % $._config.namespace, + configdb: { + configs_api_url: 'config.%s.svc.cluster.local' % $._config.namespace, }, - 'gcs': { + gcs: { 'ruler.storage.gcs.bucketname': $._config.ruler_gcs_bucket_name, }, - 's3': { + s3: { 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.s3_bucket_name], }, - }[$._config.ruler_client_type], + }[$._config.ruler_client_type], overrides: { // === Per-tenant usage limits. === From f28da25d8a1f270c6adc71bb941a5a09b79b8ca7 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 28 Apr 2020 09:50:50 +0200 Subject: [PATCH 028/192] Immediately join ingesters in the ring when using the blocks storage Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index f6774830a3c..999165ef803 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -82,6 +82,7 @@ ingester_args+:: { // Disable TSDB blocks transfer because of persistent volumes 'ingester.max-transfer-retries': 0, + 'ingester.join-after': '0s', // Persist ring tokens so that when the ingester will be restarted // it will pick the same tokens From 3281ef8a1223eca4a4def61c54f7e8fa6267c0ce Mon Sep 17 00:00:00 2001 From: Martin Schneppenheim Date: Wed, 6 May 2020 14:23:07 +0200 Subject: [PATCH 029/192] Fixes https://github.com/grafana/cortex-jsonnet/pull/49 Signed-off-by: Martin Schneppenheim --- operations/mimir/config.libsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 46c38b65859..00d62a1464b 100644 --- a/operations/mimir/config.libsonnet +++ 
b/operations/mimir/config.libsonnet @@ -145,7 +145,8 @@ // Shared between the Ruler and Querier queryConfig: { // Don't query ingesters for older queries. - // Chunks are 6hrs right now. Add some slack for safety. + // Chunks are held in memory for up to 6hrs right now. Additional 6h are granted for safety reasons because + // the remote writing Prometheus may have a delay or write requests into the database are queued. 'querier.query-ingesters-within': '12h', 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', From 8c064dc008931619b7c1a92ce900b3745fcf39ef Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 May 2020 17:16:57 +0200 Subject: [PATCH 030/192] Expose store-gateway via service Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 3 +++ 1 file changed, 3 insertions(+) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 999165ef803..aa0757a6a4c 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -194,4 +194,7 @@ statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120), + + store_gateway_service: + $.util.serviceFor($.store_gateway_statefulset), } From 4c91064096a18cf526037363e02f7538f28de80e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 May 2020 17:40:20 +0200 Subject: [PATCH 031/192] Fix store-gateway service because it's optional Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index aa0757a6a4c..b3828115b95 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -195,6 +195,6 @@ statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + 
statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120), - store_gateway_service: + store_gateway_service: if !$._config.store_gateway_enabled then {} else $.util.serviceFor($.store_gateway_statefulset), } From 82f435f8e32654a5f637a682fa276bcd9c526323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 21 May 2020 12:09:58 +0200 Subject: [PATCH 032/192] Enable chunks caching when using blocks (https://github.com/grafana/cortex-jsonnet/pull/71) * Enable chunks memcached when using blocks. * Enable index cache by default. * Introduced _max_item_size_mb config fields for individual caches. --- operations/mimir/config.libsonnet | 16 +++++++++++----- operations/mimir/memcached.libsonnet | 4 +++- operations/mimir/tsdb.libsonnet | 23 +++++++++++++++++++++-- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 46c38b65859..b660a74d492 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -64,10 +64,16 @@ // TSDB storage engine doesn't require the table manager. table_manager_enabled: $._config.storage_engine != 'tsdb', - // TSDB storage engine doesn't require memcached for chunks or chunk indexes. - memcached_index_queries_enabled: $._config.storage_engine != 'tsdb', + // TSDB storage engine doesn't support index-writes (for writes deduplication) cache. memcached_index_writes_enabled: $._config.storage_engine != 'tsdb', - memcached_chunks_enabled: $._config.storage_engine != 'tsdb', + memcached_index_writes_max_item_size_mb: 1, + + // Index and chunks caches are supported by both TSDB storage engine and chunks engine. 
+ memcached_index_queries_enabled: true, + memcached_index_queries_max_item_size_mb: 5, + + memcached_chunks_enabled: true, + memcached_chunks_max_item_size_mb: 1, // The query-tee is an optional service which can be used to send // the same input query to multiple backends and make them compete @@ -103,7 +109,7 @@ storeConfig: self.storeMemcachedChunksConfig, - storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled then + storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled && $._config.storage_engine == 'chunks' then { 'store.chunks-cache.memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, 'store.chunks-cache.memcached.service': 'memcached-client', @@ -161,7 +167,7 @@ // Don't query the chunk store for data younger than max_chunk_idle. 'querier.query-store-after': $._config.max_chunk_idle, } + ( - if $._config.memcached_index_queries_enabled then + if $._config.memcached_index_queries_enabled && $._config.storage_engine == 'chunks' then { // Setting for index cache. 'store.index-cache-validity': '14m', // ingester.retain-period=15m, 1m less for safety. 
diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet index bd00189f031..e55cc8e5935 100644 --- a/operations/mimir/memcached.libsonnet +++ b/operations/mimir/memcached.libsonnet @@ -33,7 +33,7 @@ memcached { memcached_index_queries: if $._config.memcached_index_queries_enabled then $.memcached { name: 'memcached-index-queries', - max_item_size: '5m', + max_item_size: '%dm' % [$._config.memcached_index_queries_max_item_size_mb], } else {}, @@ -41,6 +41,7 @@ memcached { memcached_index_writes: if $._config.memcached_index_writes_enabled then $.memcached { name: 'memcached-index-writes', + max_item_size: '%dm' % [$._config.memcached_index_writes_max_item_size_mb], } else {}, @@ -48,6 +49,7 @@ memcached { memcached_chunks: if $._config.memcached_chunks_enabled then $.memcached { name: 'memcached', + max_item_size: '%dm' % [$._config.memcached_chunks_max_item_size_mb], // Save memory by more tightly provisioning memcached chunks. memory_limit_mb: 6 * 1024, diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index b3828115b95..b758d252355 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -23,6 +23,25 @@ cortex_compactor_data_disk_class: 'standard', }, + blocks_chunks_caching_config:: { + 'experimental.tsdb.bucket-store.index-cache.backend': 'memcached', + 'experimental.tsdb.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.tsdb.bucket-store.index-cache.memcached.timeout': '200ms', + 'experimental.tsdb.bucket-store.index-cache.memcached.max-item-size': $._config._memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', + 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-concurrency': '50', + 'experimental.tsdb.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', 
+ 'experimental.tsdb.bucket-store.index-cache.postings-compression-enabled': 'true', + + 'experimental.tsdb.bucket-store.chunks-cache.backend': 'memcached', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.tsdb.bucket-store.chunks-cache.memcached.timeout': '200ms', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + }, + // The querier should run on a dedicated volume used to sync TSDB // indexes, in order to not negatively affect the node performances // in case of sustained I/O or utilization. For this reason we: @@ -42,7 +61,7 @@ // is generated 'experimental.tsdb.bucket-store.tenant-sync-concurrency': 2, 'experimental.tsdb.bucket-store.block-sync-concurrency': 5, - }, + } + (if !$._config.store_gateway_enabled then $.blocks_chunks_caching_config else {}), querier_container+:: container.withVolumeMountsMixin([ @@ -170,7 +189,7 @@ // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens 'experimental.store-gateway.tokens-file-path': '/data/tokens', - }, + } + (if $._config.store_gateway_enabled then $.blocks_chunks_caching_config else {}), store_gateway_ports:: $.util.defaultPorts, From df86b0c300540d2a5f3fede65b777d56a5c2365a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 21 May 2020 12:31:14 +0200 Subject: [PATCH 033/192] Fix typo (https://github.com/grafana/cortex-jsonnet/pull/73) --- operations/mimir/tsdb.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet 
b/operations/mimir/tsdb.libsonnet index b758d252355..1d0a421caef 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -27,7 +27,7 @@ 'experimental.tsdb.bucket-store.index-cache.backend': 'memcached', 'experimental.tsdb.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, 'experimental.tsdb.bucket-store.index-cache.memcached.timeout': '200ms', - 'experimental.tsdb.bucket-store.index-cache.memcached.max-item-size': $._config._memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'experimental.tsdb.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-concurrency': '50', 'experimental.tsdb.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', From 0c7da28ede99795a454a47eb8aaf52e98d17b42b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 May 2020 09:13:36 +0200 Subject: [PATCH 034/192] Fix retention period. Configure querying for TSDB. 
(https://github.com/grafana/cortex-jsonnet/pull/74) --- operations/mimir/config.libsonnet | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index b660a74d492..b86d8fb42d9 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -132,7 +132,7 @@ 'experimental.tsdb.dir': '/data/tsdb', 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', 'experimental.tsdb.block-ranges-period': '2h', - 'experimental.tsdb.retention-period': '1h', + 'experimental.tsdb.retention-period': '6h', 'experimental.tsdb.ship-interval': '1m', 'experimental.tsdb.backend': 'gcs', 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, @@ -150,10 +150,6 @@ // Shared between the Ruler and Querier queryConfig: { - // Don't query ingesters for older queries. - // Chunks are 6hrs right now. Add some slack for safety. - 'querier.query-ingesters-within': '12h', - 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', // Limit the size of the rows we read from the index. @@ -163,10 +159,22 @@ // splitting in the frontend, the reality is this only limits rate(foo[31d]) // type queries. 'store.max-query-length': '744h', - - // Don't query the chunk store for data younger than max_chunk_idle. - 'querier.query-store-after': $._config.max_chunk_idle, } + ( + if $._config.storage_engine == 'chunks' then { + // Don't query ingesters for older queries. + // Chunks are 6hrs right now. Add some slack for safety. + 'querier.query-ingesters-within': '12h', + + // Don't query the chunk store for data younger than max_chunk_idle. + 'querier.query-store-after': $._config.max_chunk_idle, + } else if $._config.storage_engine == 'tsdb' then { + // Ingesters don't have data older than 6h, no need to ask them. + 'querier.query-ingesters-within': '6h', + + // No need to look at store for data younger than 4h, as ingesters have all of it. 
+ 'querier.query-store-after': '4h', + } + ) + ( if $._config.memcached_index_queries_enabled && $._config.storage_engine == 'chunks' then { // Setting for index cache. From 66594b8e8436423d4f6d2f9d90b399a8c146235e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 27 May 2020 10:38:21 +0200 Subject: [PATCH 035/192] Configure metadata cache when using blocks storage. (https://github.com/grafana/cortex-jsonnet/pull/78) Cache is used by both querier (even if store-gateway is disabled) and store-gateway. --- operations/mimir/config.libsonnet | 3 +++ operations/mimir/memcached.libsonnet | 14 ++++++++++++++ operations/mimir/tsdb.libsonnet | 14 ++++++++++++-- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index b86d8fb42d9..f29f4519aa5 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -75,6 +75,9 @@ memcached_chunks_enabled: true, memcached_chunks_max_item_size_mb: 1, + memcached_metadata_enabled: $._config.storage_engine == 'tsdb', + memcached_metadata_max_item_size_mb: 1, + // The query-tee is an optional service which can be used to send // the same input query to multiple backends and make them compete // (comparing performances). diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet index e55cc8e5935..6c6b751e2c1 100644 --- a/operations/mimir/memcached.libsonnet +++ b/operations/mimir/memcached.libsonnet @@ -62,4 +62,18 @@ memcached { container.withArgsMixin(['-c 4096']), } else {}, + + // Memcached instance for caching TSDB blocks metadata (meta.json files, deletion marks, list of users and blocks). + memcached_metadata: if $._config.memcached_metadata_enabled then + $.memcached { + name: 'memcached-metadata', + max_item_size: '%dm' % [$._config.memcached_metadata_max_item_size_mb], + + // Metadata cache doesn't need much memory. 
+ memory_limit_mb: 512, + + local statefulSet = $.apps.v1beta1.statefulSet, + statefulSet+: + statefulSet.mixin.spec.withReplicas(1), + }, } diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 1d0a421caef..4b597234071 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -42,6 +42,16 @@ 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', }, + blocks_metadata_caching_config:: { + 'experimental.tsdb.bucket-store.metadata-cache.backend': 'memcached', + 'experimental.tsdb.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.tsdb.bucket-store.metadata-cache.memcached.timeout': '200ms', + 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, + 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', + 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', + 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', + }, + // The querier should run on a dedicated volume used to sync TSDB // indexes, in order to not negatively affect the node performances // in case of sustained I/O or utilization. 
For this reason we: @@ -61,7 +71,7 @@ // is generated 'experimental.tsdb.bucket-store.tenant-sync-concurrency': 2, 'experimental.tsdb.bucket-store.block-sync-concurrency': 5, - } + (if !$._config.store_gateway_enabled then $.blocks_chunks_caching_config else {}), + } + $.blocks_metadata_caching_config + (if !$._config.store_gateway_enabled then $.blocks_chunks_caching_config else {}), querier_container+:: container.withVolumeMountsMixin([ @@ -189,7 +199,7 @@ // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens 'experimental.store-gateway.tokens-file-path': '/data/tokens', - } + (if $._config.store_gateway_enabled then $.blocks_chunks_caching_config else {}), + } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config, store_gateway_ports:: $.util.defaultPorts, From e5537aab4280b6ad35b7670c356be6636ff37bb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 27 May 2020 11:19:41 +0200 Subject: [PATCH 036/192] Add caching arguments only if caching is enabled. 
(https://github.com/grafana/cortex-jsonnet/pull/79) --- operations/mimir/tsdb.libsonnet | 47 ++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 4b597234071..e3f2f2376ea 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -23,26 +23,31 @@ cortex_compactor_data_disk_class: 'standard', }, - blocks_chunks_caching_config:: { - 'experimental.tsdb.bucket-store.index-cache.backend': 'memcached', - 'experimental.tsdb.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.tsdb.bucket-store.index-cache.memcached.timeout': '200ms', - 'experimental.tsdb.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, - 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', - 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-concurrency': '50', - 'experimental.tsdb.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', - 'experimental.tsdb.bucket-store.index-cache.postings-compression-enabled': 'true', - - 'experimental.tsdb.bucket-store.chunks-cache.backend': 'memcached', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.tsdb.bucket-store.chunks-cache.memcached.timeout': '200ms', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', - }, - - blocks_metadata_caching_config:: { + 
blocks_chunks_caching_config:: + ( + if $._config.memcached_index_queries_enabled then { + 'experimental.tsdb.bucket-store.index-cache.backend': 'memcached', + 'experimental.tsdb.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.tsdb.bucket-store.index-cache.memcached.timeout': '200ms', + 'experimental.tsdb.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', + 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-concurrency': '50', + 'experimental.tsdb.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', + 'experimental.tsdb.bucket-store.index-cache.postings-compression-enabled': 'true', + } else {} + ) + ( + if $._config.memcached_chunks_enabled then { + 'experimental.tsdb.bucket-store.chunks-cache.backend': 'memcached', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.tsdb.bucket-store.chunks-cache.memcached.timeout': '200ms', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', + 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + } else {} + ), + + blocks_metadata_caching_config:: if $.config.memcached_metadata_enabled then { 'experimental.tsdb.bucket-store.metadata-cache.backend': 'memcached', 'experimental.tsdb.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, 
'experimental.tsdb.bucket-store.metadata-cache.memcached.timeout': '200ms', @@ -50,7 +55,7 @@ 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', - }, + } else {}, // The querier should run on a dedicated volume used to sync TSDB // indexes, in order to not negatively affect the node performances From 50f8c8878f7cd43917719e0ebd3d2d8e7c15bfc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 27 May 2020 11:44:53 +0200 Subject: [PATCH 037/192] Fix typo :facepalm: (https://github.com/grafana/cortex-jsonnet/pull/81) --- operations/mimir/tsdb.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index e3f2f2376ea..936a21d9d8d 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -47,7 +47,7 @@ } else {} ), - blocks_metadata_caching_config:: if $.config.memcached_metadata_enabled then { + blocks_metadata_caching_config:: if $._config.memcached_metadata_enabled then { 'experimental.tsdb.bucket-store.metadata-cache.backend': 'memcached', 'experimental.tsdb.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, 'experimental.tsdb.bucket-store.metadata-cache.memcached.timeout': '200ms', From 4db15627e554c87ba212ae3f2200d2b292a78d87 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 4 Jun 2020 10:25:31 +0200 Subject: [PATCH 038/192] Make store-gateway mandatory for blocks storage and switch queriers back to deployments Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 6 +--- operations/mimir/tsdb.libsonnet | 53 ++----------------------------- 2 files changed, 4 insertions(+), 55 deletions(-) diff --git 
a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index f29f4519aa5..729c833ffa9 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -59,7 +59,6 @@ // to switch to tsdb storage. storage_engine: 'chunks', storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', - store_gateway_enabled: false, // TSDB storage engine doesn't require the table manager. table_manager_enabled: $._config.storage_engine != 'tsdb', @@ -139,10 +138,7 @@ 'experimental.tsdb.ship-interval': '1m', 'experimental.tsdb.backend': 'gcs', 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, - 'experimental.tsdb.store-gateway-enabled': $._config.store_gateway_enabled, - } - ) + ( - if $._config.storage_engine != 'tsdb' || !$._config.store_gateway_enabled then {} else { + 'experimental.tsdb.store-gateway-enabled': true, 'experimental.store-gateway.sharding-enabled': true, 'experimental.store-gateway.sharding-ring.store': 'consul', 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 936a21d9d8d..92499d80901 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -10,10 +10,6 @@ storage_backend: 'none', storage_engine: 'tsdb', - // Allow to configure the querier disk. - cortex_querier_data_disk_size: '10Gi', - cortex_querier_data_disk_class: 'standard', - // Allow to configure the store-gateway disk. cortex_store_gateway_data_disk_size: '50Gi', cortex_store_gateway_data_disk_class: 'standard', @@ -57,50 +53,7 @@ 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', } else {}, - // The querier should run on a dedicated volume used to sync TSDB - // indexes, in order to not negatively affect the node performances - // in case of sustained I/O or utilization. For this reason we: - // 1. 
Remove default querier deployment - // 2. Run querier as statefulset with PVC - // 3. Replace the service switching it to the statefulset - local querier_data_pvc = - pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_querier_data_disk_size }) + - pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName($._config.cortex_querier_data_disk_class) + - pvc.mixin.metadata.withName('querier-data'), - - querier_args+:: { - // Reduce the number of blocks synched simultaneously, in order to - // keep the memory utilization under control when the index header - // is generated - 'experimental.tsdb.bucket-store.tenant-sync-concurrency': 2, - 'experimental.tsdb.bucket-store.block-sync-concurrency': 5, - } + $.blocks_metadata_caching_config + (if !$._config.store_gateway_enabled then $.blocks_chunks_caching_config else {}), - - querier_container+:: - container.withVolumeMountsMixin([ - volumeMount.new('querier-data', '/data'), - ]), - - querier_deployment: {}, - - querier_statefulset: - statefulSet.new('querier', 3, [$.querier_container], querier_data_pvc) - .withServiceName('querier') + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: 'querier' }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: 'querier' } + $.querier_deployment_labels) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: 'querier' }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(60) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - $.util.configVolumeMount('overrides', '/etc/cortex') + - $.util.antiAffinity, - - querier_service: - $.util.serviceFor($.querier_statefulset, $.querier_service_ignored_labels) + - service.mixin.spec.withSelector({ name: 'query-frontend' }), + querier_args+:: $.blocks_metadata_caching_config, // The ingesters 
should persist TSDB blocks and WAL on a persistent // volume in order to be crash resilient. @@ -218,7 +171,7 @@ $.util.readinessProbe + $.jaeger_mixin, - store_gateway_statefulset: if !$._config.store_gateway_enabled then {} else + store_gateway_statefulset: statefulSet.new('store-gateway', 3, [$.store_gateway_container], store_gateway_data_pvc) .withServiceName('store-gateway') + statefulSet.mixin.metadata.withNamespace($._config.namespace) + @@ -229,6 +182,6 @@ statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120), - store_gateway_service: if !$._config.store_gateway_enabled then {} else + store_gateway_service: $.util.serviceFor($.store_gateway_statefulset), } From e612e9d670715c79f83ce90ec58b7752d9920ea6 Mon Sep 17 00:00:00 2001 From: sh0rez Date: Mon, 15 Jun 2020 12:54:12 +0200 Subject: [PATCH 039/192] refactor: remove method usage (https://github.com/grafana/cortex-jsonnet/pull/93) To allow the migration to upcoming jsonnet-libs/k8s library, we need to remove the "object-oriented" usages of ksonnet-lib, as those will not be supported anymore for performance reasons. cortex-jsonnet didn't use a lot of those, so this is fairly small. 
--- operations/mimir/alertmanager.libsonnet | 4 ++-- operations/mimir/tsdb.libsonnet | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 3b4ca16b9f4..b3fcfbf5874 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -34,8 +34,8 @@ alertmanager_statefulset: - statefulSet.new('alertmanager', 1, [$.alertmanager_container], $.alertmanager_pvc) - .withServiceName('alertmanager') + + statefulSet.new('alertmanager', 1, [$.alertmanager_container], $.alertmanager_pvc) + + statefulSet.mixin.spec.withServiceName('alertmanager') + statefulSet.mixin.metadata.withNamespace($._config.namespace) + statefulSet.mixin.metadata.withLabels({ name: 'alertmanager' }) + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'alertmanager' }) + diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 92499d80901..db036d8dfa7 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -82,8 +82,8 @@ ]), ingester_statefulset: - statefulSet.new('ingester', 3, [$.ingester_container], ingester_data_pvc) - .withServiceName('ingester') + + statefulSet.new('ingester', 3, [$.ingester_container], ingester_data_pvc) + + statefulSet.mixin.spec.withServiceName('ingester') + statefulSet.mixin.metadata.withNamespace($._config.namespace) + statefulSet.mixin.metadata.withLabels({ name: 'ingester' }) + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'ingester' } + $.ingester_deployment_labels) + @@ -131,8 +131,8 @@ $.jaeger_mixin, compactor_statefulset: - statefulSet.new('compactor', 1, [$.compactor_container], compactor_data_pvc) - .withServiceName('compactor') + + statefulSet.new('compactor', 1, [$.compactor_container], compactor_data_pvc) + + statefulSet.mixin.spec.withServiceName('compactor') + statefulSet.mixin.metadata.withNamespace($._config.namespace) + 
statefulSet.mixin.metadata.withLabels({ name: 'compactor' }) + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'compactor' }) + @@ -172,8 +172,8 @@ $.jaeger_mixin, store_gateway_statefulset: - statefulSet.new('store-gateway', 3, [$.store_gateway_container], store_gateway_data_pvc) - .withServiceName('store-gateway') + + statefulSet.new('store-gateway', 3, [$.store_gateway_container], store_gateway_data_pvc) + + statefulSet.mixin.spec.withServiceName('store-gateway') + statefulSet.mixin.metadata.withNamespace($._config.namespace) + statefulSet.mixin.metadata.withLabels({ name: 'store-gateway' }) + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'store-gateway' }) + From a2c2f281c7f5c65088d902e3f0c6b3b03062d2ba Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 16 Jun 2020 14:09:01 +0200 Subject: [PATCH 040/192] Added mega_user class Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 729c833ffa9..721737d833d 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -283,6 +283,21 @@ ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M }, + + // This user class has limits increased by +50% compared to the previous one. 
+ mega_user+:: { + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 16000000, // 16M + max_global_series_per_metric: 1600000, // 1.6M + + max_series_per_query: 100000, + max_samples_per_query: 1000000, + + ingestion_rate: 2250000, // 2.25M + ingestion_burst_size: 22500000, // 22.5M + }, }, // if not empty, passed to overrides.yaml as another top-level field From 1815a636cadd538e01fb0bec97305f507a16076b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 17 Jun 2020 12:52:55 +0200 Subject: [PATCH 041/192] Fine-tune blocks storage config Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 11 ++++++----- operations/mimir/tsdb.libsonnet | 6 +++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 721737d833d..e0d3b4a7718 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -133,8 +133,9 @@ 'store.engine': 'tsdb', 'experimental.tsdb.dir': '/data/tsdb', 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', + 'experimental.tsdb.bucket-store.ignore-deletion-marks-delay': '1h', 'experimental.tsdb.block-ranges-period': '2h', - 'experimental.tsdb.retention-period': '6h', + 'experimental.tsdb.retention-period': '13h', 'experimental.tsdb.ship-interval': '1m', 'experimental.tsdb.backend': 'gcs', 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, @@ -167,11 +168,11 @@ // Don't query the chunk store for data younger than max_chunk_idle. 'querier.query-store-after': $._config.max_chunk_idle, } else if $._config.storage_engine == 'tsdb' then { - // Ingesters don't have data older than 6h, no need to ask them. - 'querier.query-ingesters-within': '6h', + // Ingesters don't have data older than 13h, no need to ask them. 
+ 'querier.query-ingesters-within': '13h', - // No need to look at store for data younger than 4h, as ingesters have all of it. - 'querier.query-store-after': '4h', + // No need to look at store for data younger than 12h, as ingesters have all of it. + 'querier.query-store-after': '12h', } ) + ( if $._config.memcached_index_queries_enabled && $._config.storage_engine == 'chunks' then diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index db036d8dfa7..07e0bba29fd 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -17,6 +17,9 @@ // Allow to configure the compactor disk. cortex_compactor_data_disk_size: '250Gi', cortex_compactor_data_disk_class: 'standard', + + // Allow to fine tune compactor. + cortex_compactor_max_concurrency: 1, }, blocks_chunks_caching_config:: @@ -116,6 +119,7 @@ 'compactor.block-ranges': '2h,12h,24h', 'compactor.data-dir': '/data', 'compactor.compaction-interval': '30m', + 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, }, compactor_ports:: $.util.defaultPorts, @@ -126,7 +130,7 @@ container.withArgsMixin($.util.mapToFlags($.compactor_args)) + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + $.util.resourcesRequests('1', '6Gi') + - $.util.resourcesLimits('1', '6Gi') + + $.util.resourcesLimits($._config.cortex_compactor_max_concurrency, '6Gi') + $.util.readinessProbe + $.jaeger_mixin, From 41aa70d440fed4c93867da579b8a153f01d334de Mon Sep 17 00:00:00 2001 From: Austin McKinley <54160+amckinley@users.noreply.github.com> Date: Wed, 17 Jun 2020 10:48:03 -0700 Subject: [PATCH 042/192] Disable tests by default to fix README instructions Ref https://github.com/grafana/cortex-jsonnet/issues/95 --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index e0d3b4a7718..276077e198f 100644 --- 
a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -25,7 +25,7 @@ max_chunk_idle: '15m', - test_exporter_enabled: true, + test_exporter_enabled: false, test_exporter_start_time: error 'must specify test exporter start time', test_exporter_user_id: error 'must specify test exporter used id', From bf5517585aed601ca55655efa2f59dfbb0e56312 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 23 Jun 2020 17:20:35 +0200 Subject: [PATCH 043/192] Run store-gateway without CPU limits Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 07e0bba29fd..a52e94b827a 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -171,7 +171,7 @@ container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + $.util.resourcesRequests('1', '6Gi') + - $.util.resourcesLimits('1', '6Gi') + + $.util.resourcesLimits(null, '6Gi') + $.util.readinessProbe + $.jaeger_mixin, From 04d14d0344d6a6c3c506caebcd837cb3caba5a5d Mon Sep 17 00:00:00 2001 From: Austin McKinley Date: Wed, 24 Jun 2020 19:31:18 -0700 Subject: [PATCH 044/192] Use v1 API for Deployment and StatefulSet resources --- operations/mimir/alertmanager.libsonnet | 2 +- operations/mimir/config.libsonnet | 2 +- operations/mimir/consul.libsonnet | 2 +- operations/mimir/distributor.libsonnet | 2 +- operations/mimir/ingester.libsonnet | 2 +- operations/mimir/memcached.libsonnet | 4 ++-- operations/mimir/querier.libsonnet | 2 +- operations/mimir/query-frontend.libsonnet | 2 +- operations/mimir/query-tee.libsonnet | 2 +- operations/mimir/ruler.libsonnet | 2 +- operations/mimir/table-manager.libsonnet | 2 +- operations/mimir/test-exporter.libsonnet | 2 +- operations/mimir/tsdb.libsonnet | 2 +- 13 files changed, 14 insertions(+), 14 deletions(-) diff 
--git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index b3fcfbf5874..aaded5d6524 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -2,7 +2,7 @@ local pvc = $.core.v1.persistentVolumeClaim, local volumeMount = $.core.v1.volumeMount, local container = $.core.v1.container, - local statefulSet = $.apps.v1beta1.statefulSet, + local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 01edc087be1..cfc94ce54f5 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -332,7 +332,7 @@ }), }), - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, storage_config_mixin:: deployment.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), diff --git a/operations/mimir/consul.libsonnet b/operations/mimir/consul.libsonnet index 9ece317d279..98a32c50cb1 100644 --- a/operations/mimir/consul.libsonnet +++ b/operations/mimir/consul.libsonnet @@ -26,7 +26,7 @@ local consul = import 'consul/consul.libsonnet'; ]) + $.util.resourcesRequests('4', '4Gi'), - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, local podAntiAffinity = deployment.mixin.spec.template.spec.affinity.podAntiAffinity, local volume = $.core.v1.volume, consul_deployment+: diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index aca539a503f..24169840dc5 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -48,7 +48,7 @@ $.util.readinessProbe + $.jaeger_mixin, - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, distributor_deployment_labels:: {}, diff --git 
a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index cf94c49cd48..2fa7b5ced6c 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -52,7 +52,7 @@ $.util.readinessProbe + $.jaeger_mixin, - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, ingester_deployment_labels:: {}, diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet index 6c6b751e2c1..a7600c33719 100644 --- a/operations/mimir/memcached.libsonnet +++ b/operations/mimir/memcached.libsonnet @@ -6,7 +6,7 @@ memcached { deployment: {}, - local statefulSet = $.apps.v1beta1.statefulSet, + local statefulSet = $.apps.v1.statefulSet, statefulSet: statefulSet.new(self.name, 3, [ @@ -72,7 +72,7 @@ memcached { // Metadata cache doesn't need much memory. memory_limit_mb: 512, - local statefulSet = $.apps.v1beta1.statefulSet, + local statefulSet = $.apps.v1.statefulSet, statefulSet+: statefulSet.mixin.spec.withReplicas(1), }, diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 86f1d1fa309..9908c43a6a4 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -45,7 +45,7 @@ $.util.resourcesRequests('1', '12Gi') + $.util.resourcesLimits(null, '24Gi'), - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, querier_deployment_labels: {}, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index cd4789df2ea..f062642ab18 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -61,7 +61,7 @@ $.util.resourcesRequests('2', '600Mi') + $.util.resourcesLimits(null, '1200Mi'), - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, query_frontend_deployment: deployment.new('query-frontend', $._config.queryFrontend.replicas, [$.query_frontend_container]) + 
diff --git a/operations/mimir/query-tee.libsonnet b/operations/mimir/query-tee.libsonnet index f0eab8aef76..4ac3b0a1275 100644 --- a/operations/mimir/query-tee.libsonnet +++ b/operations/mimir/query-tee.libsonnet @@ -1,7 +1,7 @@ { local container = $.core.v1.container, local containerPort = $.core.v1.containerPort, - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, local service = $.core.v1.service, local servicePort = $.core.v1.servicePort, diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index f6860cff757..12cc8471301 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -28,7 +28,7 @@ $.util.readinessProbe + $.jaeger_mixin, - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, ruler_deployment: deployment.new('ruler', 2, [$.ruler_container]) + diff --git a/operations/mimir/table-manager.libsonnet b/operations/mimir/table-manager.libsonnet index 45b97f4a3ea..90cb733c333 100644 --- a/operations/mimir/table-manager.libsonnet +++ b/operations/mimir/table-manager.libsonnet @@ -29,7 +29,7 @@ $.jaeger_mixin else {}, - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, table_manager_deployment: if $._config.table_manager_enabled then diff --git a/operations/mimir/test-exporter.libsonnet b/operations/mimir/test-exporter.libsonnet index 31d7a2c96d8..535686b88a3 100644 --- a/operations/mimir/test-exporter.libsonnet +++ b/operations/mimir/test-exporter.libsonnet @@ -22,7 +22,7 @@ $.util.resourcesLimits('100m', '100Mi') + $.jaeger_mixin, - local deployment = $.apps.v1beta1.deployment, + local deployment = $.apps.v1.deployment, test_exporter_deployment: if !($._config.test_exporter_enabled) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index a52e94b827a..807139754ab 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -2,7 +2,7 @@ 
local pvc = $.core.v1.persistentVolumeClaim, local volumeMount = $.core.v1.volumeMount, local container = $.core.v1.container, - local statefulSet = $.apps.v1beta1.statefulSet, + local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, _config+:: { From 30d37b9e6ce1bf3cd5c03d445f2cf35636b65248 Mon Sep 17 00:00:00 2001 From: Austin McKinley Date: Wed, 24 Jun 2020 18:09:10 -0700 Subject: [PATCH 045/192] Version bump to v1.1.0 --- operations/mimir/images.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index ba47b5f96ca..93aa65aa9e2 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.0.0', + cortex: 'cortexproject/cortex:v1.1.0', distributor: self.cortex, ingester: self.cortex, From 32e230827f205605734fa1702c5561db0e01e3c5 Mon Sep 17 00:00:00 2001 From: Austin McKinley Date: Wed, 24 Jun 2020 21:18:01 -0700 Subject: [PATCH 046/192] Actually include the ruler --- operations/mimir/cortex-manifests.jsonnet.example | 1 + operations/mimir/cortex.libsonnet | 1 + 2 files changed, 2 insertions(+) diff --git a/operations/mimir/cortex-manifests.jsonnet.example b/operations/mimir/cortex-manifests.jsonnet.example index 7edc14cd644..9abcc1b1747 100644 --- a/operations/mimir/cortex-manifests.jsonnet.example +++ b/operations/mimir/cortex-manifests.jsonnet.example @@ -21,6 +21,7 @@ cortex { storage_backend: 'gcp', bigtable_instance: 'example-instance-prod', bigtable_project: 'example-project1-cortex', + ruler_client_type: 'gcs' }, } diff --git a/operations/mimir/cortex.libsonnet b/operations/mimir/cortex.libsonnet index 430c1d4374d..940ae2bb269 100644 --- a/operations/mimir/cortex.libsonnet +++ b/operations/mimir/cortex.libsonnet @@ -11,6 +11,7 @@ (import 'querier.libsonnet') + (import 'query-frontend.libsonnet') + 
(import 'table-manager.libsonnet') + +(import 'ruler.libsonnet') + // Supporting services (import 'etcd.libsonnet') + From 42e05b485322058daad461be17d37b3e8193e8f2 Mon Sep 17 00:00:00 2001 From: Austin McKinley Date: Wed, 24 Jun 2020 21:34:16 -0700 Subject: [PATCH 047/192] Update config option name --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index cfc94ce54f5..12bdf7782b6 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -181,7 +181,7 @@ // Setting for index cache. 'store.index-cache-validity': '14m', // ingester.retain-period=15m, 1m less for safety. 'store.index-cache-read.cache.enable-fifocache': true, - 'store.index-cache-read.fifocache.size': 102400, + 'store.index-cache-read.fifocache.max-size-items': 102400, 'store.index-cache-read.memcached.hostname': 'memcached-index-queries.%(namespace)s.svc.cluster.local' % $._config, 'store.index-cache-read.memcached.service': 'memcached-client', 'store.index-cache-read.memcached.timeout': '500ms', From 56f59b2e3231bc944e622529af3cdf1e0d248a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 29 Jun 2020 14:08:26 +0200 Subject: [PATCH 048/192] Added ruler_enabled and alertmanager_enabled flags. 
(https://github.com/grafana/cortex-jsonnet/pull/116) --- operations/mimir/alertmanager.libsonnet | 52 ++++++++++++++----------- operations/mimir/config.libsonnet | 3 ++ operations/mimir/cortex.libsonnet | 1 + operations/mimir/ruler.libsonnet | 32 ++++++++------- 4 files changed, 52 insertions(+), 36 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index aaded5d6524..e5bb15d87f8 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -5,7 +5,6 @@ local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, - alertmanager_args:: { target: 'alertmanager', @@ -18,32 +17,39 @@ }, alertmanager_pvc:: - pvc.new() + - pvc.mixin.metadata.withName('alertmanager-data') + - pvc.mixin.spec.withAccessModes('ReadWriteOnce') + - pvc.mixin.spec.resources.withRequests({ storage: '100Gi' }), + if $._config.alertmanager_enabled then + pvc.new() + + pvc.mixin.metadata.withName('alertmanager-data') + + pvc.mixin.spec.withAccessModes('ReadWriteOnce') + + pvc.mixin.spec.resources.withRequests({ storage: '100Gi' }) + else {}, alertmanager_container:: - container.new('alertmanager', $._images.alertmanager) + - container.withPorts($.util.defaultPorts) + - container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + - container.withVolumeMountsMixin([volumeMount.new('alertmanager-data', '/data')]) + - $.util.resourcesRequests('100m', '1Gi') + - $.util.readinessProbe + - $.jaeger_mixin, - + if $._config.alertmanager_enabled then + container.new('alertmanager', $._images.alertmanager) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + + container.withVolumeMountsMixin([volumeMount.new('alertmanager-data', '/data')]) + + $.util.resourcesRequests('100m', '1Gi') + + $.util.readinessProbe + + $.jaeger_mixin + else {}, alertmanager_statefulset: - statefulSet.new('alertmanager', 1, [$.alertmanager_container], 
$.alertmanager_pvc) + - statefulSet.mixin.spec.withServiceName('alertmanager') + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: 'alertmanager' }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: 'alertmanager' }) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: 'alertmanager' }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900), + if $._config.alertmanager_enabled then + statefulSet.new('alertmanager', 1, [$.alertmanager_container], $.alertmanager_pvc) + + statefulSet.mixin.spec.withServiceName('alertmanager') + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + else {}, alertmanager_service: - $.util.serviceFor($.alertmanager_statefulset), + if $._config.alertmanager_enabled then + $.util.serviceFor($.alertmanager_statefulset) + else {}, } diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 12bdf7782b6..eabbae6cc89 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -203,6 +203,7 @@ 'ring.heartbeat-timeout': '10m', }, + ruler_enabled: false, ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3)', // TODO: Generic client generating functions would be nice. 
ruler_s3_bucket_name: $._config.s3_bucket_name, @@ -308,6 +309,8 @@ schemaID: std.md5(std.toString($._config.schema)), enable_pod_priorities: true, + + alertmanager_enabled: false, }, local configMap = $.core.v1.configMap, diff --git a/operations/mimir/cortex.libsonnet b/operations/mimir/cortex.libsonnet index 940ae2bb269..6ad5c57a81b 100644 --- a/operations/mimir/cortex.libsonnet +++ b/operations/mimir/cortex.libsonnet @@ -12,6 +12,7 @@ (import 'query-frontend.libsonnet') + (import 'table-manager.libsonnet') + (import 'ruler.libsonnet') + +(import 'alertmanager.libsonnet') + // Supporting services (import 'etcd.libsonnet') + diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 12cc8471301..1aaab71eac1 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -20,25 +20,31 @@ }, ruler_container:: - container.new('ruler', $._images.ruler) + - container.withPorts($.util.defaultPorts) + - container.withArgsMixin($.util.mapToFlags($.ruler_args)) + - $.util.resourcesRequests('1', '6Gi') + - $.util.resourcesLimits('16', '16Gi') + - $.util.readinessProbe + - $.jaeger_mixin, + if $._config.ruler_enabled then + container.new('ruler', $._images.ruler) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ruler_args)) + + $.util.resourcesRequests('1', '6Gi') + + $.util.resourcesLimits('16', '16Gi') + + $.util.readinessProbe + + $.jaeger_mixin + else {}, local deployment = $.apps.v1.deployment, ruler_deployment: - deployment.new('ruler', 2, [$.ruler_container]) + - deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + - $.util.antiAffinity + - $.util.configVolumeMount('overrides', '/etc/cortex') + - $.storage_config_mixin, + if $._config.ruler_enabled then + deployment.new('ruler', 2, [$.ruler_container]) + + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + + $.util.antiAffinity + + $.util.configVolumeMount('overrides', 
'/etc/cortex') + + $.storage_config_mixin + else {}, local service = $.core.v1.service, ruler_service: - $.util.serviceFor($.ruler_deployment), + if $._config.ruler_enabled then + $.util.serviceFor($.ruler_deployment) + else {}, } From 82ab7717111d18e19b004e24ad3fa5bcbdc1b0e4 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 30 Jun 2020 15:15:04 -0400 Subject: [PATCH 049/192] Added publish not ready addresses Signed-off-by: Joe Elliott --- operations/mimir/query-frontend.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index f062642ab18..87647c89ad8 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -80,5 +80,6 @@ // each query-frontend pod IP and NOT the service IP. To make it, we do NOT // use the service cluster IP so that when the service DNS is resolved it // returns the set of query-frontend IPs. + service.mixin.spec.withPublishNotReadyAddresses(true) + service.mixin.spec.withClusterIp('None'), } From 78e222bd132ab9d1c48fec7e7e4c9ecb8c5c5258 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 1 Jul 2020 15:10:58 +0200 Subject: [PATCH 050/192] Removed -experimental.tsdb.store-gateway-enabled flag Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 1 - 1 file changed, 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index eabbae6cc89..e0cda9c9e02 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -139,7 +139,6 @@ 'experimental.tsdb.ship-interval': '1m', 'experimental.tsdb.backend': 'gcs', 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, - 'experimental.tsdb.store-gateway-enabled': true, 'experimental.store-gateway.sharding-enabled': true, 'experimental.store-gateway.sharding-ring.store': 'consul', 'experimental.store-gateway.sharding-ring.consul.hostname': 
'consul.%s.svc.cluster.local:8500' % $._config.namespace, From fbe8643e6feac8b661431d2ed07d8fe2fa96023e Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Wed, 1 Jul 2020 11:20:00 -0400 Subject: [PATCH 051/192] Added a discovery svc and pointed the querier service at itself Signed-off-by: Joe Elliott --- operations/mimir/querier.libsonnet | 7 ++----- operations/mimir/query-frontend.libsonnet | 6 +++++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 9908c43a6a4..8dd4bc7b88e 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -19,7 +19,7 @@ // Limit to N/2 worker threads per frontend, as we have two frontends. 'querier.worker-parallelism': $._config.querier.concurrency / $._config.queryFrontend.replicas, - 'querier.frontend-address': 'query-frontend.%(namespace)s.svc.cluster.local:9095' % $._config, + 'querier.frontend-address': 'query-frontend-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, 'log.level': 'debug', @@ -57,9 +57,6 @@ local service = $.core.v1.service, - querier_service_ignored_labels:: [], - querier_service: - $.util.serviceFor($.querier_deployment, $.querier_service_ignored_labels) + - service.mixin.spec.withSelector({ name: 'query-frontend' }), + $.util.serviceFor($.querier_deployment), } diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 87647c89ad8..debb1dfb13c 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -75,11 +75,15 @@ local service = $.core.v1.service, query_frontend_service: + $.util.serviceFor($.query_frontend_deployment), + + query_frontend_discovery_service: $.util.serviceFor($.query_frontend_deployment) + // Make sure that query frontend worker, running in the querier, do resolve // each query-frontend pod IP and NOT the 
service IP. To make it, we do NOT // use the service cluster IP so that when the service DNS is resolved it // returns the set of query-frontend IPs. service.mixin.spec.withPublishNotReadyAddresses(true) + - service.mixin.spec.withClusterIp('None'), + service.mixin.spec.withClusterIp('None') + + service.mixin.metadata.withName('query-frontend-discovery') } From a2e2b223db3b64f31c5083a70c4ed4ba07d14112 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Wed, 1 Jul 2020 11:22:29 -0400 Subject: [PATCH 052/192] lint Signed-off-by: Joe Elliott --- operations/mimir/query-frontend.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index debb1dfb13c..d73581828bb 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -85,5 +85,5 @@ // returns the set of query-frontend IPs. service.mixin.spec.withPublishNotReadyAddresses(true) + service.mixin.spec.withClusterIp('None') + - service.mixin.metadata.withName('query-frontend-discovery') + service.mixin.metadata.withName('query-frontend-discovery'), } From a02b59cd1053bdb08ceca5355641090fa7da31bc Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 2 Jul 2020 09:45:17 +0200 Subject: [PATCH 053/192] Added PodDisruptionBudget for store-gateway Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 807139754ab..3ca7d1bb4a8 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -188,4 +188,13 @@ store_gateway_service: $.util.serviceFor($.store_gateway_statefulset), + + local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + + store_gateway_pdb: + podDisruptionBudget.new() + + podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') + + podDisruptionBudget.mixin.metadata.withLabels({ 
name: 'store-gateway-pdb' }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), } From deb23b3b4007da1c3879cc4e7ab334b6e5e9717a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 2 Jul 2020 09:51:41 +0200 Subject: [PATCH 054/192] Allow to configure the blocks replication factor Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 4 +++- operations/mimir/tsdb.libsonnet | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index eabbae6cc89..08175d7a7f7 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -60,6 +60,8 @@ storage_engine: 'chunks', storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', + store_gateway_replication_factor: 3, + // TSDB storage engine doesn't require the table manager. table_manager_enabled: $._config.storage_engine != 'tsdb', @@ -144,7 +146,7 @@ 'experimental.store-gateway.sharding-ring.store': 'consul', 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'experimental.store-gateway.sharding-ring.prefix': '', - 'experimental.store-gateway.replication-factor': 3, + 'experimental.store-gateway.replication-factor': $._config.store_gateway_replication_factor, } ), diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 3ca7d1bb4a8..9ba3e0e40b0 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -196,5 +196,7 @@ podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') + podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + - podDisruptionBudget.mixin.spec.withMaxUnavailable(1), + // To avoid any disruption in the read path we need at 
least 1 replica of each + // block available, so the disruption budget depends on the blocks replication factor. + podDisruptionBudget.mixin.spec.withMaxUnavailable(if $._config.store_gateway_replication_factor > 1 then $._config.store_gateway_replication_factor - 1 else 1), } From e36c2afd5077bc63b57c2abedc73f5ea117b646d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 2 Jul 2020 12:31:01 +0200 Subject: [PATCH 055/192] Switch store-gateway StatefulSets to Parallel Pod Management Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 807139754ab..32c9c0b3af6 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -184,7 +184,12 @@ statefulSet.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120), + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) + + // Parallelly scale up/down store-gateway instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), store_gateway_service: $.util.serviceFor($.store_gateway_statefulset), From a9ca8e8e9c986fb47f0f1ead337d211e2375067a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Fri, 3 Jul 2020 15:58:07 +0200 Subject: [PATCH 056/192] Ruler should use metadata cache as well, if configured. (https://github.com/grafana/cortex-jsonnet/pull/128) Ruler instantiates querier internally, so it can use metadata cache. 
--- operations/mimir/tsdb.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 9ba3e0e40b0..3be98fe73f7 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -57,6 +57,7 @@ } else {}, querier_args+:: $.blocks_metadata_caching_config, + ruler_args+:: $.blocks_metadata_caching_config, // The ingesters should persist TSDB blocks and WAL on a persistent // volume in order to be crash resilient. From e259402d664a1c391ed044a211053c0e09dc8204 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Sat, 11 Jul 2020 16:00:29 +0200 Subject: [PATCH 057/192] Allow to customize ingester disk size and class Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 923122441e0..a97b3998d83 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -10,6 +10,10 @@ storage_backend: 'none', storage_engine: 'tsdb', + // Allow to configure the ingester disk. + cortex_ingester_data_disk_size: '100Gi', + cortex_ingester_data_disk_class: 'fast', + // Allow to configure the store-gateway disk. cortex_store_gateway_data_disk_size: '50Gi', cortex_store_gateway_data_disk_class: 'standard', @@ -63,9 +67,9 @@ // volume in order to be crash resilient. 
local ingester_data_pvc = pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: '100Gi' }) + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_ingester_data_disk_size }) + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName('fast') + + pvc.mixin.spec.withStorageClassName($._config.cortex_ingester_data_disk_class) + pvc.mixin.metadata.withName('ingester-data'), ingester_deployment: {}, From d3aede754d873c56c954c94e6ea307d861936e4b Mon Sep 17 00:00:00 2001 From: Austin McKinley Date: Fri, 17 Jul 2020 10:58:43 -0700 Subject: [PATCH 058/192] Version bump to 1.2.0 --- operations/mimir/images.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 93aa65aa9e2..a4be104ae56 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. 
- cortex: 'cortexproject/cortex:v1.1.0', + cortex: 'cortexproject/cortex:v1.2.0', distributor: self.cortex, ingester: self.cortex, From 366e99cc8a8504a866d35f5890b7f97a279c8d0a Mon Sep 17 00:00:00 2001 From: Duologic Date: Mon, 20 Jul 2020 09:57:11 +0200 Subject: [PATCH 059/192] refactor: use jaeger-agent-mixin lib got moved: https://github.com/grafana/jsonnet-libs/pull/291 used jb-0.4.0 which updates the jsonnetfile.json format --- operations/mimir/cortex.libsonnet | 2 +- operations/mimir/jsonnetfile.json | 25 ++++++++++++++++--------- operations/mimir/jsonnetfile.lock.json | 26 +++++++++++++++++--------- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/operations/mimir/cortex.libsonnet b/operations/mimir/cortex.libsonnet index 6ad5c57a81b..5341b7c0d50 100644 --- a/operations/mimir/cortex.libsonnet +++ b/operations/mimir/cortex.libsonnet @@ -1,5 +1,5 @@ (import 'ksonnet-util/kausal.libsonnet') + -(import 'ksonnet-util/jaeger.libsonnet') + +(import 'jaeger-agent-mixin/jaeger.libsonnet') + (import 'images.libsonnet') + (import 'common.libsonnet') + (import 'config.libsonnet') + diff --git a/operations/mimir/jsonnetfile.json b/operations/mimir/jsonnetfile.json index 375b98130fb..e83b85fc068 100644 --- a/operations/mimir/jsonnetfile.json +++ b/operations/mimir/jsonnetfile.json @@ -1,44 +1,51 @@ { + "version": 1, "dependencies": [ { - "name": "consul", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "consul" } }, "version": "master" }, { - "name": "etcd-operator", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "etcd-operator" } }, "version": "master" }, { - "name": "ksonnet-util", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir":
"jaeger-agent-mixin" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "ksonnet-util" } }, "version": "master" }, { - "name": "memcached", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "memcached" } }, "version": "master" } - ] + ], + "legacyImports": true } diff --git a/operations/mimir/jsonnetfile.lock.json b/operations/mimir/jsonnetfile.lock.json index c4dcaacf706..0c0e542e836 100644 --- a/operations/mimir/jsonnetfile.lock.json +++ b/operations/mimir/jsonnetfile.lock.json @@ -1,10 +1,10 @@ { + "version": 1, "dependencies": [ { - "name": "consul", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "consul" } }, @@ -12,10 +12,9 @@ "sum": "qlVBnIShhHEPglAl1xYIAmOP/W8LD0wQmHCT0m9sTLU=" }, { - "name": "etcd-operator", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "etcd-operator" } }, @@ -23,10 +22,19 @@ "sum": "RbSlOsk0EBAMOfMOKPBdD0joHN6UKZqeP3zy9LjBQTE=" }, { - "name": "ksonnet-util", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "jaeger-agent-mixin" + } + }, + "version": "65a623593025007ef54549550a3569c0e72f085d", + "sum": "DsdBoqgx5kE3zc6fMYnfiGjW2+9Mx2OXFieWm1oFHgY=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "ksonnet-util" } }, @@ -34,15 +42,15 @@ "sum": "LKsTTBcH8TXX5ANgRUu5I7Y1tf5le4nANFV3/W53I+c=" }, { - "name": "memcached", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "memcached" } }, "version": 
"c19a92e586a6752f11745b47f309b13f02ef7147", "sum": "GQeyWFtqhwM+hGxQbdywWG1PFJ/KmSC1at0hai7AHXU=" } - ] + ], + "legacyImports": false } From 59b048a8fe46c666049211db25c86aaf52d6eff7 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 21 Jul 2020 09:47:58 +0200 Subject: [PATCH 060/192] Switch blocks storage ingesters to Parallel pod management policy and 4d retention Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 2 +- operations/mimir/tsdb.libsonnet | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 08175d7a7f7..03fddf5e533 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -137,7 +137,7 @@ 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', 'experimental.tsdb.bucket-store.ignore-deletion-marks-delay': '1h', 'experimental.tsdb.block-ranges-period': '2h', - 'experimental.tsdb.retention-period': '13h', + 'experimental.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 'experimental.tsdb.ship-interval': '1m', 'experimental.tsdb.backend': 'gcs', 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index a97b3998d83..ceb71afa973 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -101,7 +101,12 @@ statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + $.util.configVolumeMount('overrides', '/etc/cortex') + $.util.podPriority('high') + - $.util.antiAffinity, + $.util.antiAffinity + + // Parallelly scale up/down store-gateway instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). 
+ statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), ingester_service: $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), From 3c77836f8d5549d6432016089a84afbfd16596e9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 21 Jul 2020 09:49:02 +0200 Subject: [PATCH 061/192] Fixed comment Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index ceb71afa973..fe2da963608 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -102,7 +102,7 @@ $.util.configVolumeMount('overrides', '/etc/cortex') + $.util.podPriority('high') + $.util.antiAffinity + - // Parallelly scale up/down store-gateway instances instead of starting them + // Parallelly scale up/down ingester instances instead of starting them // one by one. This does NOT affect rolling updates: they will continue to be // rolled out one by one (the next pod will be rolled out once the previous is // ready). From 0c1a3c72b45925b74a710317dd54bcd833050a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 27 Jul 2020 12:33:36 +0200 Subject: [PATCH 062/192] Chunks blocks migration (https://github.com/grafana/cortex-jsonnet/pull/148) * Allow configuring querier with second store engine. * Introduced newIngesterStatefulSet and newIngesterPdb functions. * Rename parameters to be more clear. 
--- operations/mimir/config.libsonnet | 10 ++++++---- operations/mimir/ingester.libsonnet | 10 ++++++---- operations/mimir/querier.libsonnet | 4 +++- operations/mimir/tsdb.libsonnet | 23 ++++++++++++----------- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 03fddf5e533..5b5b3f0700b 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -58,6 +58,8 @@ // Use the Cortex chunks storage engine by default, while giving the ability // to switch to tsdb storage. storage_engine: 'chunks', + // Secondary storage engine is only used for querying. + querier_second_storage_engine: null, storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', store_gateway_replication_factor: 3, @@ -113,7 +115,7 @@ storeConfig: self.storeMemcachedChunksConfig, - storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled && $._config.storage_engine == 'chunks' then + storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then { 'store.chunks-cache.memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, 'store.chunks-cache.memcached.service': 'memcached-client', @@ -131,8 +133,8 @@ // TSDB blocks storage configuration, used only when 'tsdb' storage // engine is explicitly enabled. 
storageTSDBConfig: ( - if $._config.storage_engine != 'tsdb' then {} else { - 'store.engine': 'tsdb', + if $._config.storage_engine == 'tsdb' || $._config.querier_second_storage_engine == 'tsdb' then { + 'store.engine': $._config.storage_engine, // May still be chunks 'experimental.tsdb.dir': '/data/tsdb', 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', 'experimental.tsdb.bucket-store.ignore-deletion-marks-delay': '1h', @@ -147,7 +149,7 @@ 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'experimental.store-gateway.sharding-ring.prefix': '', 'experimental.store-gateway.replication-factor': $._config.store_gateway_replication_factor, - } + } else {} ), // Shared between the Ruler and Querier diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 2fa7b5ced6c..165fc9829fe 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -75,10 +75,12 @@ local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, - ingester_pdb: + newIngesterPdb(pdbName, ingesterName):: podDisruptionBudget.new() + - podDisruptionBudget.mixin.metadata.withName('ingester-pdb') + - podDisruptionBudget.mixin.metadata.withLabels({ name: 'ingester-pdb' }) + - podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: name }) + + podDisruptionBudget.mixin.metadata.withName(pdbName) + + podDisruptionBudget.mixin.metadata.withLabels({ name: pdbName }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: ingesterName }) + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), + + ingester_pdb: self.newIngesterPdb('ingester-pdb', name), } diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 8dd4bc7b88e..044fd8ce175 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -10,7 +10,7 @@ { target: 'querier', - // Increase HTTP server response write timeout, as 
we were seeing some + // Increase HTTP server response write timeout, as we were seeing some // queries that return a lot of data timeing out. 'server.http-write-timeout': '1m', @@ -22,6 +22,8 @@ 'querier.frontend-address': 'query-frontend-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, + 'querier.second-store-engine': $._config.querier_second_storage_engine, + 'log.level': 'debug', }, diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index fe2da963608..5b13850170b 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -84,18 +84,17 @@ 'ingester.tokens-file-path': '/data/tokens', }, - ingester_container+:: - container.withVolumeMountsMixin([ - volumeMount.new('ingester-data', '/data'), - ]), - - ingester_statefulset: - statefulSet.new('ingester', 3, [$.ingester_container], ingester_data_pvc) + - statefulSet.mixin.spec.withServiceName('ingester') + + newIngesterStatefulSet(name, container):: + statefulSet.new(name, 3, [ + container + $.core.v1.container.withVolumeMountsMixin([ + volumeMount.new('ingester-data', '/data'), + ]), + ], ingester_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: 'ingester' }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: 'ingester' } + $.ingester_deployment_labels) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: 'ingester' }) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + 
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + @@ -108,6 +107,8 @@ // ready). statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), + ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container), + ingester_service: $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), From 67fe1ff698d0a6a63c9b76eeb3e4af9fbf7bf0ee Mon Sep 17 00:00:00 2001 From: Duologic Date: Fri, 31 Jul 2020 16:22:01 +0200 Subject: [PATCH 063/192] refactor(cortex): use first class citizens for: * requiredDuringSchedulingIgnoredDuringExecutionType * portsType These are available from: https://github.com/jsonnet-libs/k8s-alpha --- operations/mimir/consul.libsonnet | 8 ++++---- operations/mimir/gossip.libsonnet | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/operations/mimir/consul.libsonnet b/operations/mimir/consul.libsonnet index 98a32c50cb1..3350916d9e4 100644 --- a/operations/mimir/consul.libsonnet +++ b/operations/mimir/consul.libsonnet @@ -28,6 +28,7 @@ local consul = import 'consul/consul.libsonnet'; local deployment = $.apps.v1.deployment, local podAntiAffinity = deployment.mixin.spec.template.spec.affinity.podAntiAffinity, + local podAffinityTerm = $.core.v1.podAffinityTerm, local volume = $.core.v1.volume, consul_deployment+: @@ -41,10 +42,9 @@ local consul = import 'consul/consul.libsonnet'; // Ensure Consul is not scheduled on the same host as an ingester // (in any namespace - hence other_namespaces). 
podAntiAffinity.withRequiredDuringSchedulingIgnoredDuringExecutionMixin([ - podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.new() + - podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.mixin.labelSelector.withMatchLabels({ name: 'ingester' }) + - podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.withNamespaces([$._config.namespace] + $._config.other_namespaces) + - podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType.withTopologyKey('kubernetes.io/hostname'), + podAffinityTerm.labelSelector.withMatchLabels({ name: 'ingester' }) + + podAffinityTerm.withNamespaces([$._config.namespace] + $._config.other_namespaces) + + podAffinityTerm.withTopologyKey('kubernetes.io/hostname'), ]) + $.util.podPriority('high'), diff --git a/operations/mimir/gossip.libsonnet b/operations/mimir/gossip.libsonnet index c8238951ac7..5720c156be9 100644 --- a/operations/mimir/gossip.libsonnet +++ b/operations/mimir/gossip.libsonnet @@ -63,7 +63,7 @@ // During migration to gossip, it may be useful to use distributors instead, since they are restarted faster. 
gossip_ring_service: local service = $.core.v1.service; - local servicePort = service.mixin.spec.portsType; + local servicePort = $.core.v1.servicePort; local ports = [ servicePort.newNamed('gossip-ring', gossipRingPort, gossipRingPort) + servicePort.withProtocol('TCP'), From e9a1740a63d9ff2362657847b8ffbe77ad0ad544 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 7 Aug 2020 09:17:41 +0200 Subject: [PATCH 064/192] Update blocks storage CLI flags Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 41 ++++++++++++------------ operations/mimir/tsdb.libsonnet | 52 ++++++++++++++++--------------- 2 files changed, 47 insertions(+), 46 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 57d3fa7c497..9fb44d9a006 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -16,9 +16,8 @@ // schema is used to generate the storage schema yaml file used by // the Cortex chunks storage: // - More information: https://github.com/cortexproject/cortex/pull/1072 - // - TSDB integration doesn't rely on the Cortex chunks store, so doesn't - // support the schema config. - schema: if $._config.storage_engine != 'tsdb' then + // - Blocks storage doesn't support / uses the schema config. + schema: if $._config.storage_engine != 'blocks' then error 'must specify a schema config' else [], @@ -56,7 +55,7 @@ jaeger_agent_host: null, // Use the Cortex chunks storage engine by default, while giving the ability - // to switch to tsdb storage. + // to switch to blocks storage. storage_engine: 'chunks', // Secondary storage engine is only used for querying. querier_second_storage_engine: null, @@ -64,21 +63,21 @@ store_gateway_replication_factor: 3, - // TSDB storage engine doesn't require the table manager. - table_manager_enabled: $._config.storage_engine != 'tsdb', + // Blocks storage engine doesn't require the table manager. 
+ table_manager_enabled: $._config.storage_engine != 'blocks', - // TSDB storage engine doesn't support index-writes (for writes deduplication) cache. - memcached_index_writes_enabled: $._config.storage_engine != 'tsdb', + // Blocks storage engine doesn't support index-writes (for writes deduplication) cache. + memcached_index_writes_enabled: $._config.storage_engine != 'blocks', memcached_index_writes_max_item_size_mb: 1, - // Index and chunks caches are supported by both TSDB storage engine and chunks engine. + // Index and chunks caches are supported by both blocks storage engine and chunks engine. memcached_index_queries_enabled: true, memcached_index_queries_max_item_size_mb: 5, memcached_chunks_enabled: true, memcached_chunks_max_item_size_mb: 1, - memcached_metadata_enabled: $._config.storage_engine == 'tsdb', + memcached_metadata_enabled: $._config.storage_engine == 'blocks', memcached_metadata_max_item_size_mb: 1, // The query-tee is an optional service which can be used to send @@ -130,19 +129,19 @@ $._config.storageTSDBConfig + { 'schema-config-file': '/etc/cortex/schema/config.yaml' }, - // TSDB blocks storage configuration, used only when 'tsdb' storage + // Blocks storage configuration, used only when 'blocks' storage // engine is explicitly enabled. storageTSDBConfig: ( - if $._config.storage_engine == 'tsdb' || $._config.querier_second_storage_engine == 'tsdb' then { + if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then { 'store.engine': $._config.storage_engine, // May still be chunks - 'experimental.tsdb.dir': '/data/tsdb', - 'experimental.tsdb.bucket-store.sync-dir': '/data/tsdb', - 'experimental.tsdb.bucket-store.ignore-deletion-marks-delay': '1h', - 'experimental.tsdb.block-ranges-period': '2h', - 'experimental.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
- 'experimental.tsdb.ship-interval': '1m', - 'experimental.tsdb.backend': 'gcs', - 'experimental.tsdb.gcs.bucket-name': $._config.storage_tsdb_bucket_name, + 'experimental.blocks-storage.tsdb.dir': '/data/tsdb', + 'experimental.blocks-storage.bucket-store.sync-dir': '/data/tsdb', + 'experimental.blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', + 'experimental.blocks-storage.tsdb.block-ranges-period': '2h', + 'experimental.blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. + 'experimental.blocks-storage.tsdb.ship-interval': '1m', + 'experimental.blocks-storage.backend': 'gcs', + 'experimental.blocks-storage.gcs.bucket-name': $._config.storage_tsdb_bucket_name, 'experimental.store-gateway.sharding-enabled': true, 'experimental.store-gateway.sharding-ring.store': 'consul', 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, @@ -171,7 +170,7 @@ // Don't query the chunk store for data younger than max_chunk_idle. 'querier.query-store-after': $._config.max_chunk_idle, - } else if $._config.storage_engine == 'tsdb' then { + } else if $._config.storage_engine == 'blocks' then { // Ingesters don't have data older than 13h, no need to ask them. 'querier.query-ingesters-within': '13h', diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 5b13850170b..da15176372d 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -6,9 +6,9 @@ local service = $.core.v1.service, _config+:: { - // Enforce TSDB storage + // Enforce blocks storage storage_backend: 'none', - storage_engine: 'tsdb', + storage_engine: 'blocks', // Allow to configure the ingester disk. 
cortex_ingester_data_disk_size: '100Gi', @@ -29,35 +29,35 @@ blocks_chunks_caching_config:: ( if $._config.memcached_index_queries_enabled then { - 'experimental.tsdb.bucket-store.index-cache.backend': 'memcached', - 'experimental.tsdb.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.tsdb.bucket-store.index-cache.memcached.timeout': '200ms', - 'experimental.tsdb.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, - 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', - 'experimental.tsdb.bucket-store.index-cache.memcached.max-async-concurrency': '50', - 'experimental.tsdb.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', - 'experimental.tsdb.bucket-store.index-cache.postings-compression-enabled': 'true', + 'experimental.blocks-storage.bucket-store.index-cache.backend': 'memcached', + 'experimental.blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', + 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', + 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', + 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', + 'experimental.blocks-storage.bucket-store.index-cache.postings-compression-enabled': 'true', } else {} ) + ( if $._config.memcached_chunks_enabled then { - 'experimental.tsdb.bucket-store.chunks-cache.backend': 'memcached', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.addresses': 
'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.tsdb.bucket-store.chunks-cache.memcached.timeout': '200ms', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', - 'experimental.tsdb.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + 'experimental.blocks-storage.bucket-store.chunks-cache.backend': 'memcached', + 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', + 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, + 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', + 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', + 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', } else {} ), blocks_metadata_caching_config:: if $._config.memcached_metadata_enabled then { - 'experimental.tsdb.bucket-store.metadata-cache.backend': 'memcached', - 'experimental.tsdb.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.tsdb.bucket-store.metadata-cache.memcached.timeout': '200ms', - 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, - 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', - 
'experimental.tsdb.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', - 'experimental.tsdb.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', + 'experimental.blocks-storage.bucket-store.metadata-cache.backend': 'memcached', + 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, + 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', + 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, + 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', + 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', + 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', } else {}, querier_args+:: $.blocks_metadata_caching_config, @@ -96,7 +96,9 @@ statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + + // When the ingester needs to flush blocks to the storage, it may take quite a lot of time. + // For this reason, we grant an high termination period (80 minutes). 
+ statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + $.util.configVolumeMount('overrides', '/etc/cortex') + $.util.podPriority('high') + From 3b4afb7a7dcd5412fb537eea691fa996a5dd326b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 7 Aug 2020 10:11:10 +0200 Subject: [PATCH 065/192] Do not apply blocks storage config to query-frontend, table-manager and purger Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 3 +-- operations/mimir/ingester.libsonnet | 1 + operations/mimir/querier.libsonnet | 1 + operations/mimir/ruler.libsonnet | 1 + operations/mimir/tsdb.libsonnet | 6 ++++-- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 9fb44d9a006..38c92251620 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -126,12 +126,11 @@ $._config.client_configs.aws + $._config.client_configs.cassandra + $._config.client_configs.gcp + - $._config.storageTSDBConfig + { 'schema-config-file': '/etc/cortex/schema/config.yaml' }, // Blocks storage configuration, used only when 'blocks' storage // engine is explicitly enabled. - storageTSDBConfig: ( + blocksStorageConfig: ( if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then { 'store.engine': $._config.storage_engine, // May still be chunks 'experimental.blocks-storage.tsdb.dir': '/data/tsdb', diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 165fc9829fe..2a2b2619bb5 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -5,6 +5,7 @@ $._config.ringConfig + $._config.storeConfig + $._config.storageConfig + + $._config.blocksStorageConfig + $._config.distributorConfig + // This adds the distributor ring flags to the ingester. 
{ target: 'ingester', diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 044fd8ce175..83d6384515f 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -5,6 +5,7 @@ $._config.ringConfig + $._config.storeConfig + $._config.storageConfig + + $._config.blocksStorageConfig + $._config.queryConfig + $._config.distributorConfig + { diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 1aaab71eac1..6a628a8d429 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -5,6 +5,7 @@ $._config.ringConfig + $._config.storeConfig + $._config.storageConfig + + $._config.blocksStorageConfig + $._config.queryConfig + $._config.distributorConfig + $._config.rulerClientConfig + diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index da15176372d..ff17d77906c 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -124,7 +124,8 @@ pvc.mixin.metadata.withName('compactor-data'), compactor_args:: - $._config.storageConfig + $._config.storageConfig + + $._config.blocksStorageConfig + { target: 'compactor', @@ -167,7 +168,8 @@ pvc.mixin.metadata.withName('store-gateway-data'), store_gateway_args:: - $._config.storageConfig + $._config.storageConfig + + $._config.blocksStorageConfig + { target: 'store-gateway', From f87e7878a4c051e985af520d2c4b80967959ec9c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 7 Aug 2020 10:23:04 +0200 Subject: [PATCH 066/192] Cleaned up blocks storage config Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 38c92251620..229ae7baf92 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -59,7 +59,7 @@ storage_engine: 'chunks', // Secondary storage engine is only 
used for querying. querier_second_storage_engine: null, - storage_tsdb_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', + blocks_storage_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', store_gateway_replication_factor: 3, @@ -140,7 +140,7 @@ 'experimental.blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 'experimental.blocks-storage.tsdb.ship-interval': '1m', 'experimental.blocks-storage.backend': 'gcs', - 'experimental.blocks-storage.gcs.bucket-name': $._config.storage_tsdb_bucket_name, + 'experimental.blocks-storage.gcs.bucket-name': $._config.blocks_storage_bucket_name, 'experimental.store-gateway.sharding-enabled': true, 'experimental.store-gateway.sharding-ring.store': 'consul', 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, From 2aa49ecbb86450c41ea23936076daeb63471c87d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 12 Aug 2020 10:52:34 +0200 Subject: [PATCH 067/192] Apply chunks-store config if primary or secondary store use chunks. (https://github.com/grafana/cortex-jsonnet/pull/160) --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 229ae7baf92..8bbf1ee2a92 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -177,7 +177,7 @@ 'querier.query-store-after': '12h', } ) + ( - if $._config.memcached_index_queries_enabled && $._config.storage_engine == 'chunks' then + if $._config.memcached_index_queries_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then { // Setting for index cache. 'store.index-cache-validity': '14m', // ingester.retain-period=15m, 1m less for safety. 
From 5150802b84f72d44e27571daa0ea1a3c2c5eab80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 12 Aug 2020 11:26:12 +0200 Subject: [PATCH 068/192] Enable table manager when using chunks storage as secondary storage engine for querier. (https://github.com/grafana/cortex-jsonnet/pull/161) --- operations/mimir/config.libsonnet | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 8bbf1ee2a92..7e3238864fb 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -64,7 +64,9 @@ store_gateway_replication_factor: 3, // Blocks storage engine doesn't require the table manager. - table_manager_enabled: $._config.storage_engine != 'blocks', + // When running blocks with chunks as secondary storage engine for querier only, we need table-manager to apply + // retention policies. + table_manager_enabled: $._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks', // Blocks storage engine doesn't support index-writes (for writes deduplication) cache. 
memcached_index_writes_enabled: $._config.storage_engine != 'blocks', From 73347f346e8606797e55f84d6086e7663a0665cf Mon Sep 17 00:00:00 2001 From: Duologic Date: Mon, 24 Aug 2020 14:51:07 +0200 Subject: [PATCH 069/192] fix(ksonnet): backwards compatibility with ksonnet --- operations/mimir/consul.libsonnet | 10 ++++++++-- operations/mimir/gossip.libsonnet | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/operations/mimir/consul.libsonnet b/operations/mimir/consul.libsonnet index 3350916d9e4..7e017f8fcd7 100644 --- a/operations/mimir/consul.libsonnet +++ b/operations/mimir/consul.libsonnet @@ -28,8 +28,14 @@ local consul = import 'consul/consul.libsonnet'; local deployment = $.apps.v1.deployment, local podAntiAffinity = deployment.mixin.spec.template.spec.affinity.podAntiAffinity, - local podAffinityTerm = $.core.v1.podAffinityTerm, local volume = $.core.v1.volume, + + // backwards compatibility with ksonnet + local podAffinityTerm = + if std.objectHasAll($.core.v1, 'podAffinityTerm') + then $.core.v1.podAffinityTerm + else podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType, + consul_deployment+: // Keep the consul state on a ramdisk, as they are ephemeral to us. @@ -42,7 +48,7 @@ local consul = import 'consul/consul.libsonnet'; // Ensure Consul is not scheduled on the same host as an ingester // (in any namespace - hence other_namespaces). 
podAntiAffinity.withRequiredDuringSchedulingIgnoredDuringExecutionMixin([ - podAffinityTerm.labelSelector.withMatchLabels({ name: 'ingester' }) + + podAffinityTerm.mixin.labelSelector.withMatchLabels({ name: 'ingester' }) + podAffinityTerm.withNamespaces([$._config.namespace] + $._config.other_namespaces) + podAffinityTerm.withTopologyKey('kubernetes.io/hostname'), ]) + diff --git a/operations/mimir/gossip.libsonnet b/operations/mimir/gossip.libsonnet index 5720c156be9..8b6261641d1 100644 --- a/operations/mimir/gossip.libsonnet +++ b/operations/mimir/gossip.libsonnet @@ -63,7 +63,13 @@ // During migration to gossip, it may be useful to use distributors instead, since they are restarted faster. gossip_ring_service: local service = $.core.v1.service; - local servicePort = $.core.v1.servicePort; + + // backwards compatibility with ksonnet + local servicePort = + if std.objectHasAll($.core.v1, 'servicePort') + then $.core.v1.servicePort + else service.mixin.spec.portsType; + local ports = [ servicePort.newNamed('gossip-ring', gossipRingPort, gossipRingPort) + servicePort.withProtocol('TCP'), From 875012d327b15b2c4b2bdf60390ce7fcc02c2034 Mon Sep 17 00:00:00 2001 From: Sandeep Sukhani Date: Wed, 26 Aug 2020 13:15:01 +0530 Subject: [PATCH 070/192] add overrides config to tsdb store-gateway --- operations/mimir/tsdb.libsonnet | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index ff17d77906c..10ba9c695de 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -172,6 +172,7 @@ $._config.blocksStorageConfig + { target: 'store-gateway', + 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens @@ -204,7 +205,8 @@ // one by one. 
This does NOT affect rolling updates: they will continue to be // rolled out one by one (the next pod will be rolled out once the previous is // ready). - statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount('overrides', '/etc/cortex'), store_gateway_service: $.util.serviceFor($.store_gateway_statefulset), From 654a89183f2ca8db6d273d961c360965919a67f6 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 26 Aug 2020 12:44:03 +0000 Subject: [PATCH 071/192] Add jsonnet for ingester StatefulSet with WAL (https://github.com/grafana/cortex-jsonnet/pull/72) * Add jsonnet for ingester StatefulSet with WAL Signed-off-by: Ganesh Vernekar * Add CHANGELOG entry Signed-off-by: Ganesh Vernekar * Fix lint Signed-off-by: Ganesh Vernekar * Fix review comments Signed-off-by: Ganesh Vernekar --- operations/mimir/config.libsonnet | 11 ++++ operations/mimir/ingester.libsonnet | 83 ++++++++++++++++++++++++----- 2 files changed, 80 insertions(+), 14 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 7e3238864fb..5851d067fe2 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -63,6 +63,17 @@ store_gateway_replication_factor: 3, + // By default ingesters will be run as StatefulSet with WAL. + // If this is set to true, ingesters will use staless deployments without WAL. + ingester_deployment_without_wal: false, + + ingester: { + // These config options are only for the chunks storage. + wal_dir: '/wal_data', + statefulset_replicas: 3, + statefulset_disk: '150Gi', + }, + // Blocks storage engine doesn't require the table manager. // When running blocks with chunks as secondary storage engine for querier only, we need table-manager to apply // retention policies. 
diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 2a2b2619bb5..69b3c2e14fa 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -1,6 +1,4 @@ { - local container = $.core.v1.container, - ingester_args:: $._config.ringConfig + $._config.storeConfig + @@ -40,9 +38,20 @@ else {} ), + ingester_statefulset_args:: { + 'ingester.wal-enabled': true, + 'ingester.checkpoint-enabled': true, + 'ingester.recover-from-wal': true, + 'ingester.wal-dir': $._config.ingester.wal_dir, + 'ingester.checkpoint-duration': '15m', + '-log.level': 'info', + 'ingester.tokens-file-path': $._config.ingester.wal_dir + '/tokens', + }, + ingester_ports:: $.util.defaultPorts, local name = 'ingester', + local container = $.core.v1.container, ingester_container:: container.new(name, $._images.ingester) + @@ -53,26 +62,72 @@ $.util.readinessProbe + $.jaeger_mixin, - local deployment = $.apps.v1.deployment, + local volumeMount = $.core.v1.volumeMount, + + ingester_statefulset_container:: + $.ingester_container + + container.withArgsMixin($.util.mapToFlags($.ingester_statefulset_args)) + + container.withVolumeMountsMixin([ + volumeMount.new('ingester-pvc', $._config.ingester.wal_dir), + ]), ingester_deployment_labels:: {}, + local pvc = $.core.v1.persistentVolumeClaim, + local volume = $.core.v1.volume, + local statefulSet = $.apps.v1.statefulSet, + + local ingester_pvc = + pvc.new('ingester-pvc') + + pvc.mixin.spec.resources.withRequests({ storage: $._config.ingester.statefulset_disk }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName('fast'), + + statefulset_storage_config_mixin:: + statefulSet.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + + $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), + + ingester_statefulset: + if $._config.ingester_deployment_without_wal == false then + statefulSet.new('ingester', 
$._config.ingester.statefulset_replicas, [$.ingester_statefulset_container], ingester_pvc) + + statefulSet.mixin.spec.withServiceName('ingester') + + statefulSet.mixin.spec.template.spec.withVolumes([volume.fromPersistentVolumeClaim('ingester-pvc', 'ingester-pvc')]) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'ingester' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'ingester' } + $.ingester_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'ingester' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + $.statefulset_storage_config_mixin + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.podPriority('high') + + $.util.antiAffinityStatefulSet + else null, + + local deployment = $.apps.v1.deployment, + ingester_deployment: - deployment.new(name, 3, [$.ingester_container], $.ingester_deployment_labels) + - $.util.antiAffinity + - $.util.configVolumeMount('overrides', '/etc/cortex') + - deployment.mixin.metadata.withLabels({ name: name }) + - deployment.mixin.spec.withMinReadySeconds(60) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + - deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + - $.storage_config_mixin + - $.util.podPriority('high'), + if $._config.ingester_deployment_without_wal then + deployment.new(name, 3, [$.ingester_container], $.ingester_deployment_labels) + + $.util.antiAffinity + + $.util.configVolumeMount('overrides', '/etc/cortex') + + deployment.mixin.metadata.withLabels({ name: name }) + + deployment.mixin.spec.withMinReadySeconds(60) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + 
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + $.storage_config_mixin + + $.util.podPriority('high') + else null, ingester_service_ignored_labels:: [], ingester_service: - $.util.serviceFor($.ingester_deployment, $.ingester_service_ignored_labels), + if $._config.ingester_deployment_without_wal then + $.util.serviceFor($.ingester_deployment, $.ingester_service_ignored_labels) + else + $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, From e464366542cb122c5d6cb4bf9c2815ba9d2b9c81 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Tue, 1 Sep 2020 19:12:27 +0200 Subject: [PATCH 072/192] Change max query length to 32 days To allow for comparison over months of 31d Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 5851d067fe2..a72df3c2e1c 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -169,10 +169,11 @@ // Limit the size of the rows we read from the index. 'store.cardinality-limit': 1e6, - // Don't allow individual queries of longer than 31days. Due to day query - // splitting in the frontend, the reality is this only limits rate(foo[31d]) - // type queries. - 'store.max-query-length': '744h', + // Don't allow individual queries of longer than 32days. Due to day query + // splitting in the frontend, the reality is this only limits rate(foo[32d]) + // type queries. 32 days to allow for comparision over the last month (31d) and + // then some. + 'store.max-query-length': '768h', } + ( if $._config.storage_engine == 'chunks' then { // Don't query ingesters for older queries.
From f23f8f789de1d540e8dbad46555016664b2abd6e Mon Sep 17 00:00:00 2001 From: Austin McKinley <54160+amckinley@users.noreply.github.com> Date: Thu, 3 Sep 2020 23:20:46 -0700 Subject: [PATCH 073/192] Fix ruler S3 config option (https://github.com/grafana/cortex-jsonnet/pull/174) * Removed -experimental.tsdb.store-gateway-enabled flag Signed-off-by: Marco Pracucci * Use correct config variable for s3 ruler config * restore dropped line Co-authored-by: Marco Pracucci --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index a72df3c2e1c..0fb6ebf4c27 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -236,7 +236,7 @@ 'ruler.storage.gcs.bucketname': $._config.ruler_gcs_bucket_name, }, s3: { - 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.s3_bucket_name], + 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.ruler_s3_bucket_name], }, }[$._config.ruler_client_type], From 969ec8bcc042e1443a7e9b4c2a6c07686baba664 Mon Sep 17 00:00:00 2001 From: Stan Kwong Date: Thu, 3 Sep 2020 23:22:24 -0700 Subject: [PATCH 074/192] Add support for local ruler_client_type (https://github.com/grafana/cortex-jsonnet/pull/175) --- operations/mimir/config.libsonnet | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 0fb6ebf4c27..d359f22fe56 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -219,7 +219,7 @@ }, ruler_enabled: false, - ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3)', + ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3, local)', // TODO: Generic client generating functions would be nice. 
ruler_s3_bucket_name: $._config.s3_bucket_name, ruler_gcs_bucket_name: error 'must specify a GCS bucket name', @@ -238,6 +238,9 @@ s3: { 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.ruler_s3_bucket_name], }, + 'local': { + 'ruler.storage.local.directory': $._config.ruler_local_directory, + }, }[$._config.ruler_client_type], overrides: { From 8f86fb89dd28e04b7fb422b93e1cc52a59e6befd Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 24 Jul 2020 15:49:43 +0100 Subject: [PATCH 075/192] Support Alertmanager HA With this, we can now support increasing the number of replicas for a Cortex AM thus enabling HA. Please note that Alerts themselves are not gossiped between Alertmanagers. Each Ruler needs to send the alert to every Alertmanager available thus the reason why a headless service gets created when the number of replicas is more than 1. --- operations/mimir/alertmanager.libsonnet | 36 +++++++++++++++++++++---- operations/mimir/config.libsonnet | 4 +++ operations/mimir/images.libsonnet | 3 +-- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index e5bb15d87f8..d19c543626f 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -4,15 +4,23 @@ local container = $.core.v1.container, local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, + local isGossiping = $._config.alertmanager.replicas > 1, + local peers = if isGossiping then + [ + 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager_gossip_port] + for i in std.range(0, $._config.alertmanager.replicas - 1) + ] + else [], alertmanager_args:: { target: 'alertmanager', 'log.level': 'debug', + 'experimental.alertmanager.enable-api': 'true', 'alertmanager.storage.type': 'gcs', 'alertmanager.storage.path': '/data', - 'alertmanager.gcs.bucketname': 
'%(cluster)s-cortex-configdb-%(namespace)s' % $._config, + 'alertmanager.storage.gcs.bucketname': '%(cluster)s-cortex-%(namespace)s' % $._config, 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, }, @@ -27,8 +35,22 @@ alertmanager_container:: if $._config.alertmanager_enabled then container.new('alertmanager', $._images.alertmanager) + - container.withPorts($.util.defaultPorts) + - container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + + container.withPorts( + $.util.defaultPorts + + if isGossiping then [ + $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager_gossip_port), + $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager_gossip_port), + ] + else [], + ) + + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + + container.withArgsMixin( + $.util.mapToFlags($.alertmanager_args) + + if isGossiping then + ['--cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager_gossip_port] + + ['--cluster.peer=%s' % peer for peer in peers] + else [], + ) + container.withVolumeMountsMixin([volumeMount.new('alertmanager-data', '/data')]) + $.util.resourcesRequests('100m', '1Gi') + $.util.readinessProbe + @@ -37,7 +59,7 @@ alertmanager_statefulset: if $._config.alertmanager_enabled then - statefulSet.new('alertmanager', 1, [$.alertmanager_container], $.alertmanager_pvc) + + statefulSet.new('alertmanager', $._config.alertmanager.replicas, [$.alertmanager_container], $.alertmanager_pvc) + statefulSet.mixin.spec.withServiceName('alertmanager') + statefulSet.mixin.metadata.withNamespace($._config.namespace) + statefulSet.mixin.metadata.withLabels({ name: 'alertmanager' }) + @@ -50,6 +72,10 @@ alertmanager_service: if $._config.alertmanager_enabled then - $.util.serviceFor($.alertmanager_statefulset) + if $._config.alertmanager.replicas > 1 then + $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.spec.withClusterIp('None') + else + 
$.util.serviceFor($.alertmanager_statefulset) else {}, } diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index d359f22fe56..87d38bb97cc 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -243,6 +243,10 @@ }, }[$._config.ruler_client_type], + alertmanager: { + replicas: 1, + }, + overrides: { // === Per-tenant usage limits. === // diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index a4be104ae56..91466f8f4c4 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -18,8 +18,7 @@ store_gateway: self.cortex, query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', - // TODO(gouthamve/jtlisi): Upstream the ruler and AM configs. - alertmanager: 'jtlisi/cortex:20190819_alertmanager_update-faa66aa43', + alertmanager: 'quay.io/cortexproject/cortex:master-2b41aa38d', testExporter: 'cortexproject/test-exporter:master-be013707', }, } From ba90cd65d1191bbaf8f6b92a8f526ac0bf4887fc Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 24 Jul 2020 16:04:56 +0100 Subject: [PATCH 076/192] Setup the gossip port --- operations/mimir/alertmanager.libsonnet | 6 +++--- operations/mimir/config.libsonnet | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index d19c543626f..23b95de31c0 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -7,7 +7,7 @@ local isGossiping = $._config.alertmanager.replicas > 1, local peers = if isGossiping then [ - 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager_gossip_port] + 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] for i in std.range(0, $._config.alertmanager.replicas - 1) ] else [], @@ -38,8 +38,8 @@ 
container.withPorts( $.util.defaultPorts + if isGossiping then [ - $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager_gossip_port), - $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager_gossip_port), + $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager.gossip_port), + $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager.gossip_port), ] else [], ) + diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 87d38bb97cc..5e72045565c 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -245,6 +245,7 @@ alertmanager: { replicas: 1, + gossip_port: 9094, }, overrides: { From f838e68a40d0b4b78811b531a48d51e598ea44da Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 29 Jul 2020 17:59:08 +0100 Subject: [PATCH 077/192] s/isGossiping/isHa --- operations/mimir/alertmanager.libsonnet | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 23b95de31c0..61d04f5bb1d 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -4,8 +4,8 @@ local container = $.core.v1.container, local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, - local isGossiping = $._config.alertmanager.replicas > 1, - local peers = if isGossiping then + local isHA = $._config.alertmanager.replicas > 1, + local peers = if isHA then [ 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] for i in std.range(0, $._config.alertmanager.replicas - 1) @@ -37,7 +37,7 @@ container.new('alertmanager', $._images.alertmanager) + container.withPorts( $.util.defaultPorts + - if isGossiping then [ + if isHA then [ $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager.gossip_port), $.core.v1.containerPort.new('gossip-tcp', 
$._config.alertmanager.gossip_port), ] @@ -46,7 +46,7 @@ container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + - if isGossiping then + if isHA then ['--cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager_gossip_port] + ['--cluster.peer=%s' % peer for peer in peers] else [], @@ -72,7 +72,7 @@ alertmanager_service: if $._config.alertmanager_enabled then - if $._config.alertmanager.replicas > 1 then + if isHA then $.util.serviceFor($.alertmanager_statefulset) + service.mixin.spec.withClusterIp('None') else From a028494ea14d620c1af4dd69aa9b084cbed907d4 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 12:08:48 +0100 Subject: [PATCH 078/192] Bump to 3 replicas by default --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 5e72045565c..35cc73d4044 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -244,7 +244,7 @@ }[$._config.ruler_client_type], alertmanager: { - replicas: 1, + replicas: 3, gossip_port: 9094, }, From 93518246ebfa4bb2cae76e5888e7b489159c5aa7 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 12:18:42 +0100 Subject: [PATCH 079/192] Bump the cortex image, the latest stable is 1.3 --- operations/mimir/images.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 91466f8f4c4..14bc4781679 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,8 +5,9 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. 
- cortex: 'cortexproject/cortex:v1.2.0', + cortex: 'cortexproject/cortex:v1.3.0', + alertmanager: self.cortex, distributor: self.cortex, ingester: self.cortex, querier: self.cortex, @@ -18,7 +19,6 @@ store_gateway: self.cortex, query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', - alertmanager: 'quay.io/cortexproject/cortex:master-2b41aa38d', testExporter: 'cortexproject/test-exporter:master-be013707', }, } From a911d41202dc0328c62dac052c87db83ef5d617b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 14:49:48 +0100 Subject: [PATCH 080/192] Fix typo in Alertmanager configuration --- operations/mimir/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 61d04f5bb1d..354b6c6af50 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -47,7 +47,7 @@ container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + if isHA then - ['--cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager_gossip_port] + + ['--cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port] + ['--cluster.peer=%s' % peer for peer in peers] else [], ) + From 787acc8a5641629f5f718803e4ac1d3206934288 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 15:40:28 +0100 Subject: [PATCH 081/192] Alertmanager configuration tweaks - Introduces the `fallback_config` option to allow an Alertmanager to have a fallback config. - Gives the headless service a different name to allow seamless switching between 1 or multiple replicas. The cluster field in the service metadata is immutable which made it impossible to create the new service unless you delete the previous one.
--- operations/mimir/alertmanager.libsonnet | 35 ++++++++++++++++++++++--- operations/mimir/config.libsonnet | 1 + 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 354b6c6af50..e012b4c1bb3 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -1,10 +1,14 @@ { local pvc = $.core.v1.persistentVolumeClaim, local volumeMount = $.core.v1.volumeMount, + local volume = $.core.v1.volume, local container = $.core.v1.container, local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, + local configMap = $.core.v1.configMap, + local isHA = $._config.alertmanager.replicas > 1, + local hasFallbackConfig = std.length($._config.alertmanager.fallback_config) > 0, local peers = if isHA then [ 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] @@ -22,7 +26,18 @@ 'alertmanager.storage.path': '/data', 'alertmanager.storage.gcs.bucketname': '%(cluster)s-cortex-%(namespace)s' % $._config, 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, - }, + } + if hasFallbackConfig then { + 'alertmanager.configs.fallback': '/configs/alertmanager_fallback_config.yaml', + } else {}, + + alertmanager_fallback_config_map: + if hasFallbackConfig then + configMap.new('alertmanager-fallback-config') + + configMap.withData({ + 'alertmanager_fallback_config.yaml': $.util.manifestYaml($._config.alertmanager.fallback_config), + }) + else {}, + alertmanager_pvc:: if $._config.alertmanager_enabled then @@ -51,7 +66,12 @@ ['--cluster.peer=%s' % peer for peer in peers] else [], ) + - container.withVolumeMountsMixin([volumeMount.new('alertmanager-data', '/data')]) + + container.withVolumeMountsMixin( + [volumeMount.new('alertmanager-data', '/data')] + + if hasFallbackConfig then + [volumeMount.new('alertmanager-fallback-config', '/configs')] 
+ else [] + ) + $.util.resourcesRequests('100m', '1Gi') + $.util.readinessProbe + $.jaeger_mixin @@ -67,15 +87,22 @@ statefulSet.mixin.spec.selector.withMatchLabels({ name: 'alertmanager' }) + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + + statefulSet.mixin.spec.template.spec.withVolumesMixin( + if hasFallbackConfig then + [volume.fromConfigMap('alertmanager-fallback-config', 'alertmanager-fallback-config')] + else [] + ) else {}, alertmanager_service: if $._config.alertmanager_enabled then if isHA then $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.metadata.withName('alertmanager-headless') + service.mixin.spec.withClusterIp('None') else - $.util.serviceFor($.alertmanager_statefulset) + $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.metadata.withName('alertmanager') else {}, } diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 35cc73d4044..2b5d2edb5a2 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -246,6 +246,7 @@ alertmanager: { replicas: 3, gossip_port: 9094, + fallback_config: {}, }, overrides: { From 2fe4d945612d700ffbe0c473459d2ace57f2e037 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 4 Sep 2020 21:42:53 +0100 Subject: [PATCH 082/192] Remove different name for a headless service Sadly, we can't have a different name for the headless service as the statefulset is configured to match its name. 
--- operations/mimir/alertmanager.libsonnet | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index e012b4c1bb3..08212b6538e 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -99,10 +99,8 @@ if $._config.alertmanager_enabled then if isHA then $.util.serviceFor($.alertmanager_statefulset) + - service.mixin.metadata.withName('alertmanager-headless') + service.mixin.spec.withClusterIp('None') else - $.util.serviceFor($.alertmanager_statefulset) + - service.mixin.metadata.withName('alertmanager') + $.util.serviceFor($.alertmanager_statefulset) else {}, } From f2fa98cd388406d585c63c74c105eedf5f66e6c4 Mon Sep 17 00:00:00 2001 From: forestsword Date: Mon, 31 Aug 2020 15:18:49 +0200 Subject: [PATCH 083/192] Fix ruler s3 storage configuration --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 2b5d2edb5a2..2e945441a13 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -236,7 +236,7 @@ 'ruler.storage.gcs.bucketname': $._config.ruler_gcs_bucket_name, }, s3: { - 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.ruler_s3_bucket_name], + 'ruler.storage.s3.url': 'https://%s/%s' % [$._config.aws_region, $._config.ruler_s3_bucket_name], }, 'local': { 'ruler.storage.local.directory': $._config.ruler_local_directory, From 796511d87809237c6c291e3280ba5daf8f8d7d94 Mon Sep 17 00:00:00 2001 From: forestsword Date: Fri, 28 Aug 2020 17:21:37 +0200 Subject: [PATCH 084/192] Block storage support for s3 --- operations/mimir/config.libsonnet | 55 ++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 2e945441a13..93410be8caa 100644 --- 
a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -5,7 +5,7 @@ replication_factor: 3, external_url: error 'must define external url for cluster', - storage_backend: error 'must specify storage backend (cassandra, gcp)', + storage_backend: error 'must specify storage backend (cassandra, gcp, aws)', table_prefix: $._config.namespace, cassandra_addresses: error 'must specify cassandra addresses', bigtable_instance: error 'must specify bigtable instance', @@ -56,10 +56,13 @@ // Use the Cortex chunks storage engine by default, while giving the ability // to switch to blocks storage. - storage_engine: 'chunks', + storage_engine: 'chunks', // Available options are 'chunks' or 'blocks' + blocks_storage_backend: 'gcs', + blocks_storage_bucket_name: error 'must specify blocks storage bucket name', + blocks_storage_s3_endpoint: 's3.dualstack.us-east-1.amazonaws.com', + // Secondary storage engine is only used for querying. querier_second_storage_engine: null, - blocks_storage_bucket_name: error 'must specify GCS bucket name to store TSDB blocks', store_gateway_replication_factor: 3, @@ -141,25 +144,39 @@ $._config.client_configs.gcp + { 'schema-config-file': '/etc/cortex/schema/config.yaml' }, + genericBlocksStorageConfig:: { + 'store.engine': $._config.storage_engine, // May still be chunks + 'experimental.blocks-storage.tsdb.dir': '/data/tsdb', + 'experimental.blocks-storage.bucket-store.sync-dir': '/data/tsdb', + 'experimental.blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', + 'experimental.blocks-storage.tsdb.block-ranges-period': '2h', + 'experimental.blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
+ 'experimental.blocks-storage.tsdb.ship-interval': '1m', + + 'experimental.store-gateway.sharding-enabled': true, + 'experimental.store-gateway.sharding-ring.store': 'consul', + 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'experimental.store-gateway.sharding-ring.prefix': '', + 'experimental.store-gateway.replication-factor': $._config.store_gateway_replication_factor, + + }, + gcsBlocksStorageConfig:: $._config.genericBlocksStorageConfig { + 'experimental.blocks-storage.backend': 'gcs', + 'experimental.blocks-storage.gcs.bucket-name': $._config.blocks_storage_bucket_name, + }, + s3BlocksStorageConfig:: $._config.genericBlocksStorageConfig { + 'experimental.blocks-storage.backend': 's3', + 'experimental.blocks-storage.s3.bucket-name': $._config.blocks_storage_bucket_name, + 'experimental.blocks-storage.s3.endpoint': $._config.blocks_storage_s3_endpoint, + }, // Blocks storage configuration, used only when 'blocks' storage // engine is explicitly enabled. blocksStorageConfig: ( - if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then { - 'store.engine': $._config.storage_engine, // May still be chunks - 'experimental.blocks-storage.tsdb.dir': '/data/tsdb', - 'experimental.blocks-storage.bucket-store.sync-dir': '/data/tsdb', - 'experimental.blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', - 'experimental.blocks-storage.tsdb.block-ranges-period': '2h', - 'experimental.blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
- 'experimental.blocks-storage.tsdb.ship-interval': '1m', - 'experimental.blocks-storage.backend': 'gcs', - 'experimental.blocks-storage.gcs.bucket-name': $._config.blocks_storage_bucket_name, - 'experimental.store-gateway.sharding-enabled': true, - 'experimental.store-gateway.sharding-ring.store': 'consul', - 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - 'experimental.store-gateway.sharding-ring.prefix': '', - 'experimental.store-gateway.replication-factor': $._config.store_gateway_replication_factor, - } else {} + if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then ( + if $._config.blocks_storage_backend == 'gcs' then $._config.gcsBlocksStorageConfig + else if $._config.blocks_storage_backend == 's3' then $._config.s3BlocksStorageConfig + else $._config.genericBlocksStorageConfig + ) else {} ), // Shared between the Ruler and Querier From 869d2c7da8452519985d7724bc242d822fea4a6d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Sep 2020 17:24:45 +0200 Subject: [PATCH 085/192] Added Azure support to blocks storage Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 93410be8caa..0b9711c863a 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -57,9 +57,11 @@ // Use the Cortex chunks storage engine by default, while giving the ability // to switch to blocks storage. 
storage_engine: 'chunks', // Available options are 'chunks' or 'blocks' - blocks_storage_backend: 'gcs', + blocks_storage_backend: 'gcs', // Available options are 'gcs', 's3', 'azure' blocks_storage_bucket_name: error 'must specify blocks storage bucket name', blocks_storage_s3_endpoint: 's3.dualstack.us-east-1.amazonaws.com', + blocks_storage_azure_account_name: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account name' else '', + blocks_storage_azure_account_key: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account key' else '', // Secondary storage engine is only used for querying. querier_second_storage_engine: null, @@ -169,12 +171,19 @@ 'experimental.blocks-storage.s3.bucket-name': $._config.blocks_storage_bucket_name, 'experimental.blocks-storage.s3.endpoint': $._config.blocks_storage_s3_endpoint, }, + azureBlocksStorageConfig:: $._config.genericBlocksStorageConfig { + 'experimental.blocks-storage.backend': 'azure', + 'experimental.blocks-storage.azure.container-name': $._config.blocks_storage_bucket_name, + 'experimental.blocks-storage.azure.account-name': $._config.blocks_storage_account_name, + 'experimental.blocks-storage.azure.account-key': $._config.blocks_storage_account_key, + }, // Blocks storage configuration, used only when 'blocks' storage // engine is explicitly enabled. 
blocksStorageConfig: ( if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then ( if $._config.blocks_storage_backend == 'gcs' then $._config.gcsBlocksStorageConfig else if $._config.blocks_storage_backend == 's3' then $._config.s3BlocksStorageConfig + else if $._config.blocks_storage_backend == 'azure' then $._config.azureBlocksStorageConfig else $._config.genericBlocksStorageConfig ) else {} ), From 680822fcc6c595b04db44df094b74b442949bfd5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Sep 2020 17:26:51 +0200 Subject: [PATCH 086/192] Fixed linter Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 0b9711c863a..0abca70463a 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -57,7 +57,7 @@ // Use the Cortex chunks storage engine by default, while giving the ability // to switch to blocks storage. 
storage_engine: 'chunks', // Available options are 'chunks' or 'blocks' - blocks_storage_backend: 'gcs', // Available options are 'gcs', 's3', 'azure' + blocks_storage_backend: 'gcs', // Available options are 'gcs', 's3', 'azure' blocks_storage_bucket_name: error 'must specify blocks storage bucket name', blocks_storage_s3_endpoint: 's3.dualstack.us-east-1.amazonaws.com', blocks_storage_azure_account_name: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account name' else '', From 70c36917b3ac8ade3a89f3506029a25e33c237e5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 23 Sep 2020 09:28:49 +0200 Subject: [PATCH 087/192] Removed the experimental prefix from blocks storage CLI flags Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 43 ++++++++++++++--------------- operations/mimir/tsdb.libsonnet | 46 +++++++++++++++---------------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 0abca70463a..a1b68257a70 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -148,34 +148,33 @@ genericBlocksStorageConfig:: { 'store.engine': $._config.storage_engine, // May still be chunks - 'experimental.blocks-storage.tsdb.dir': '/data/tsdb', - 'experimental.blocks-storage.bucket-store.sync-dir': '/data/tsdb', - 'experimental.blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', - 'experimental.blocks-storage.tsdb.block-ranges-period': '2h', - 'experimental.blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
- 'experimental.blocks-storage.tsdb.ship-interval': '1m', - - 'experimental.store-gateway.sharding-enabled': true, - 'experimental.store-gateway.sharding-ring.store': 'consul', - 'experimental.store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - 'experimental.store-gateway.sharding-ring.prefix': '', - 'experimental.store-gateway.replication-factor': $._config.store_gateway_replication_factor, - + 'blocks-storage.tsdb.dir': '/data/tsdb', + 'blocks-storage.bucket-store.sync-dir': '/data/tsdb', + 'blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', + 'blocks-storage.tsdb.block-ranges-period': '2h', + 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. + 'blocks-storage.tsdb.ship-interval': '1m', + + 'store-gateway.sharding-enabled': true, + 'store-gateway.sharding-ring.store': 'consul', + 'store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'store-gateway.sharding-ring.prefix': '', + 'store-gateway.replication-factor': $._config.store_gateway_replication_factor, }, gcsBlocksStorageConfig:: $._config.genericBlocksStorageConfig { - 'experimental.blocks-storage.backend': 'gcs', - 'experimental.blocks-storage.gcs.bucket-name': $._config.blocks_storage_bucket_name, + 'blocks-storage.backend': 'gcs', + 'blocks-storage.gcs.bucket-name': $._config.blocks_storage_bucket_name, }, s3BlocksStorageConfig:: $._config.genericBlocksStorageConfig { - 'experimental.blocks-storage.backend': 's3', - 'experimental.blocks-storage.s3.bucket-name': $._config.blocks_storage_bucket_name, - 'experimental.blocks-storage.s3.endpoint': $._config.blocks_storage_s3_endpoint, + 'blocks-storage.backend': 's3', + 'blocks-storage.s3.bucket-name': $._config.blocks_storage_bucket_name, + 'blocks-storage.s3.endpoint': $._config.blocks_storage_s3_endpoint, }, azureBlocksStorageConfig:: $._config.genericBlocksStorageConfig { - 
'experimental.blocks-storage.backend': 'azure', - 'experimental.blocks-storage.azure.container-name': $._config.blocks_storage_bucket_name, - 'experimental.blocks-storage.azure.account-name': $._config.blocks_storage_account_name, - 'experimental.blocks-storage.azure.account-key': $._config.blocks_storage_account_key, + 'blocks-storage.backend': 'azure', + 'blocks-storage.azure.container-name': $._config.blocks_storage_bucket_name, + 'blocks-storage.azure.account-name': $._config.blocks_storage_account_name, + 'blocks-storage.azure.account-key': $._config.blocks_storage_account_key, }, // Blocks storage configuration, used only when 'blocks' storage // engine is explicitly enabled. diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 10ba9c695de..fbce8effbfd 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -29,35 +29,35 @@ blocks_chunks_caching_config:: ( if $._config.memcached_index_queries_enabled then { - 'experimental.blocks-storage.bucket-store.index-cache.backend': 'memcached', - 'experimental.blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', - 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, - 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', - 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', - 'experimental.blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', - 'experimental.blocks-storage.bucket-store.index-cache.postings-compression-enabled': 'true', + 'blocks-storage.bucket-store.index-cache.backend': 'memcached', + 'blocks-storage.bucket-store.index-cache.memcached.addresses': 
'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.index-cache.postings-compression-enabled': 'true', } else {} ) + ( if $._config.memcached_chunks_enabled then { - 'experimental.blocks-storage.bucket-store.chunks-cache.backend': 'memcached', - 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', - 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, - 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', - 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', - 'experimental.blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.chunks-cache.backend': 'memcached', + 'blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', + 
'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', } else {} ), blocks_metadata_caching_config:: if $._config.memcached_metadata_enabled then { - 'experimental.blocks-storage.bucket-store.metadata-cache.backend': 'memcached', - 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, - 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', - 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, - 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', - 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', - 'experimental.blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.metadata-cache.backend': 'memcached', + 'blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', } else {}, querier_args+:: $.blocks_metadata_caching_config, @@ -176,7 +176,7 @@ // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens - 'experimental.store-gateway.tokens-file-path': '/data/tokens', + 
'store-gateway.tokens-file-path': '/data/tokens', } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config, store_gateway_ports:: $.util.defaultPorts, From ddca9bebf1703457a98b800ac26cace72c207ba6 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 23 Sep 2020 17:33:08 +0100 Subject: [PATCH 088/192] Lower default ingestion limits and create a new overrides user --- operations/mimir/config.libsonnet | 24 +++++++++++++++++++----- operations/mimir/distributor.libsonnet | 4 ++-- operations/mimir/ingester.libsonnet | 4 ++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index a1b68257a70..22fd5b2c4d4 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -280,20 +280,34 @@ // These are the defaults. Distributor limits will be 5x (#replicas) higher, // ingester limits are 6s (#replicas) / 3x (#replication factor) higher. // - // small_user: { - // ingestion_rate: 100,000 - // ingestion_burst_size: 1,000,000 + // extra_small_user: { + // ingestion_rate: 60,000 + // ingestion_burst_size: 600,000 // // max_series_per_user: 0 (disabled) // max_series_per_metric: 0 (disabled) // - // max_global_series_per_user: 1,000,000 - // max_global_series_per_metric: 100,000 + // max_global_series_per_user: 600,000 + // max_global_series_per_metric: 60,000 // // max_series_per_query: 10,000 // max_samples_per_query: 100,000 // }, + small_user:: { + ingestion_rate: 100000, + ingestion_burst_size: 1000000, + + max_series_per_user: 0, + max_series_per_metric: 0, + + max_global_series_per_user: 1000000, + max_global_series_per_metric: 100000, + + max_series_per_query: 10000, + max_samples_per_query: 100000, + }, + medium_user:: { max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet 
index 24169840dc5..5f4851dfd88 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -29,8 +29,8 @@ 'server.grpc.keepalive.max-connection-idle': '1m', 'distributor.ingestion-rate-limit-strategy': 'global', - 'distributor.ingestion-rate-limit': 100000, // 100K - 'distributor.ingestion-burst-size': 1000000, // 1M + 'distributor.ingestion-rate-limit': 60000, // 60K + 'distributor.ingestion-burst-size': 600000, // 600K // The ingestion rate global limit requires the distributors to form a ring. 'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 69b3c2e14fa..54c092884b1 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -22,8 +22,8 @@ // Limits config. 'ingester.max-chunk-idle': $._config.max_chunk_idle, - 'ingester.max-global-series-per-user': 1000000, // 1M - 'ingester.max-global-series-per-metric': 100000, // 100K + 'ingester.max-global-series-per-user': 600000, // 600K + 'ingester.max-global-series-per-metric': 60000, // 60K 'ingester.max-series-per-user': 0, // Disabled in favour of the max global limit 'ingester.max-series-per-metric': 0, // Disabled in favour of the max global limit 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', From f4947798e0a065cd8a8e5184d0dd64a6ffc8c747 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 24 Sep 2020 12:51:00 +0100 Subject: [PATCH 089/192] Address review feedback --- operations/mimir/config.libsonnet | 8 ++++---- operations/mimir/distributor.libsonnet | 4 ++-- operations/mimir/ingester.libsonnet | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 22fd5b2c4d4..e5622b276df 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -281,14 +281,14 @@ // ingester limits 
are 6s (#replicas) / 3x (#replication factor) higher. // // extra_small_user: { - // ingestion_rate: 60,000 - // ingestion_burst_size: 600,000 + // ingestion_rate: 10,000 + // ingestion_burst_size: 200,000 // // max_series_per_user: 0 (disabled) // max_series_per_metric: 0 (disabled) // - // max_global_series_per_user: 600,000 - // max_global_series_per_metric: 60,000 + // max_global_series_per_user: 100,000 + // max_global_series_per_metric: 20,000 // // max_series_per_query: 10,000 // max_samples_per_query: 100,000 diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 5f4851dfd88..7c8e49d4796 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -29,8 +29,8 @@ 'server.grpc.keepalive.max-connection-idle': '1m', 'distributor.ingestion-rate-limit-strategy': 'global', - 'distributor.ingestion-rate-limit': 60000, // 60K - 'distributor.ingestion-burst-size': 600000, // 600K + 'distributor.ingestion-rate-limit': 10000, // 10K + 'distributor.ingestion-burst-size': 200000, // 200k // The ingestion rate global limit requires the distributors to form a ring. 'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 54c092884b1..d4969de1bb0 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -22,8 +22,8 @@ // Limits config. 
'ingester.max-chunk-idle': $._config.max_chunk_idle, - 'ingester.max-global-series-per-user': 600000, // 600K - 'ingester.max-global-series-per-metric': 60000, // 60K + 'ingester.max-global-series-per-user': 100000, // 100K + 'ingester.max-global-series-per-metric': 20000, // 20K 'ingester.max-series-per-user': 0, // Disabled in favour of the max global limit 'ingester.max-series-per-metric': 0, // Disabled in favour of the max global limit 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', From 007ab924ddb7538d5d05634c8629536943b2bba2 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 24 Sep 2020 14:56:11 +0100 Subject: [PATCH 090/192] Bump default series limit by 50% --- operations/mimir/config.libsonnet | 3 ++- operations/mimir/ingester.libsonnet | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index e5622b276df..d8ce8d6b73d 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -287,7 +287,8 @@ // max_series_per_user: 0 (disabled) // max_series_per_metric: 0 (disabled) // - // max_global_series_per_user: 100,000 + // // Our limit should be 100k, but we need some room of about ~50% to take rollouts into account + // max_global_series_per_user: 150,000 // max_global_series_per_metric: 20,000 // // max_series_per_query: 10,000 diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index d4969de1bb0..2201351cb58 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -22,7 +22,7 @@ // Limits config. 
'ingester.max-chunk-idle': $._config.max_chunk_idle, - 'ingester.max-global-series-per-user': 100000, // 100K + 'ingester.max-global-series-per-user': 150000, // 150K 'ingester.max-global-series-per-metric': 20000, // 20K 'ingester.max-series-per-user': 0, // Disabled in favour of the max global limit 'ingester.max-series-per-metric': 0, // Disabled in favour of the max global limit From a5fd8e8c326a73d6997e85d71e39bb815d2babeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 25 Sep 2020 09:25:44 +0200 Subject: [PATCH 091/192] Add flusher job for blocks. --- operations/mimir/flusher-job-blocks.libsonnet | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 operations/mimir/flusher-job-blocks.libsonnet diff --git a/operations/mimir/flusher-job-blocks.libsonnet b/operations/mimir/flusher-job-blocks.libsonnet new file mode 100644 index 00000000000..7a99b005333 --- /dev/null +++ b/operations/mimir/flusher-job-blocks.libsonnet @@ -0,0 +1,49 @@ +{ + // Usage example: + // + // local flusher_job = import 'cortex/flusher-job-blocks.libsonnet'; + // + // flusher_job { + // 'flusher-25': $.flusher_job_func('flusher-25', 'ingester-data-ingester-25'), + // } + // + // Where 'flusher-25' is a job name, and 'ingester-data-ingester-25' is PVC to flush. + + local container = $.core.v1.container, + local job = $.batch.v1.job, + local volumeMount = $.core.v1.volumeMount, + local volume = $.core.v1.volume, + + flusher_container:: + container.new('flusher', $._images.flusher) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ingester_args { + target: 'flusher', + 'blocks-storage.tsdb.retention-period': '10000h', // don't delete old blocks too soon. 
+ })) + + $.util.resourcesRequests('4', '15Gi') + + $.util.resourcesLimits(null, '25Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + flusher_job_func(jobName, pvcName):: + job.new() + + job.mixin.spec.template.spec.withContainers([ + $.flusher_container + + container.withVolumeMountsMixin([ + volumeMount.new('flusher-data', '/data'), + ]), + ]) + + job.mixin.spec.template.spec.withRestartPolicy('Never') + + job.mixin.spec.template.spec.withVolumes([ + volume.fromPersistentVolumeClaim('flusher-data', pvcName), + ]) + + job.mixin.metadata.withName(jobName) + + job.mixin.metadata.withNamespace($._config.namespace) + + job.mixin.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.podPriority('high'), +} From c96f81da186821ebb2213090c88fa0f9a8730b8d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 28 Sep 2020 14:33:08 +0200 Subject: [PATCH 092/192] Fixed Azure account name/key config Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index d8ce8d6b73d..eb777610d39 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -173,8 +173,8 @@ azureBlocksStorageConfig:: $._config.genericBlocksStorageConfig { 'blocks-storage.backend': 'azure', 'blocks-storage.azure.container-name': $._config.blocks_storage_bucket_name, - 'blocks-storage.azure.account-name': $._config.blocks_storage_account_name, - 'blocks-storage.azure.account-key': $._config.blocks_storage_account_key, + 'blocks-storage.azure.account-name': $._config.blocks_storage_azure_account_name, + 'blocks-storage.azure.account-key': 
$._config.blocks_storage_azure_account_key, }, // Blocks storage configuration, used only when 'blocks' storage // engine is explicitly enabled. From 2f87772302ea913f97d0c4bd466f5d9bdb529db5 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Wed, 30 Sep 2020 11:01:01 +0200 Subject: [PATCH 093/192] Rename changed flags for 1.4 release. Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 2 +- operations/mimir/tsdb.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index eb777610d39..f80fbcc56f2 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -159,7 +159,7 @@ 'store-gateway.sharding-ring.store': 'consul', 'store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'store-gateway.sharding-ring.prefix': '', - 'store-gateway.replication-factor': $._config.store_gateway_replication_factor, + 'store-gateway.sharding-ring.replication-factor': $._config.store_gateway_replication_factor, }, gcsBlocksStorageConfig:: $._config.genericBlocksStorageConfig { 'blocks-storage.backend': 'gcs', diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index fbce8effbfd..65770f08139 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -176,7 +176,7 @@ // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens - 'store-gateway.tokens-file-path': '/data/tokens', + 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config, store_gateway_ports:: $.util.defaultPorts, From 8c7f703c924f65da93676ccf558657ca20ce5452 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Wed, 30 Sep 2020 11:56:40 +0200 Subject: [PATCH 094/192] Make sure only a single ruler rolls out at a time Signed-off-by: Goutham 
Veeramachaneni --- operations/mimir/ruler.libsonnet | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 6a628a8d429..5e2ae18cae7 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -36,6 +36,8 @@ ruler_deployment: if $._config.ruler_enabled then deployment.new('ruler', 2, [$.ruler_container]) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + $.util.antiAffinity + $.util.configVolumeMount('overrides', '/etc/cortex') + From aa748cbe6da9756f224c4bd0c70398a94f66174e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Oct 2020 11:34:54 +0200 Subject: [PATCH 095/192] Cut 1.4.0 Signed-off-by: Marco Pracucci --- operations/mimir/images.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 14bc4781679..8bd8e0fa672 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.3.0', + cortex: 'cortexproject/cortex:v1.4.0', alertmanager: self.cortex, distributor: self.cortex, From 49b9989385faa550f5d97f323e7e6a40cd3e39cb Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Fri, 9 Oct 2020 15:47:30 +0100 Subject: [PATCH 096/192] Add overrides exporter Overrides exporter part of grafana/cortex-tools and exposes runtime overrides and related presets of Cortex as metrics. 
Signed-off-by: Christian Simon --- operations/mimir/images.libsonnet | 1 + operations/mimir/overrides-exporter.libsonnet | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 operations/mimir/overrides-exporter.libsonnet diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 8bd8e0fa672..c02f51b28a2 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -18,6 +18,7 @@ ruler: self.cortex, store_gateway: self.cortex, + cortex_tools: 'grafana/cortex-tools:v0.4.0', query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', testExporter: 'cortexproject/test-exporter:master-be013707', }, diff --git a/operations/mimir/overrides-exporter.libsonnet b/operations/mimir/overrides-exporter.libsonnet new file mode 100644 index 00000000000..2c59c1d4635 --- /dev/null +++ b/operations/mimir/overrides-exporter.libsonnet @@ -0,0 +1,64 @@ +// this enables overrides exporter, which will expose the configured +// overrides and presets (if configured). Those metrics can be potentially +// high cardinality. 
+{ + local name = 'overrides-exporter', + + _config+: { + // overrides exporter can also make the configured presets available, this + // list references entries within $._config.overrides + overrides_exporter_presets:: [ + 'medium_user', + 'big_user', + 'super_user', + 'mega_user', + ], + }, + + local presets_enabled = std.length($._config.overrides_exporter_presets) > 0, + + local configMap = $.core.v1.configMap, + overrides_exporter_presets_configmap: + if presets_enabled then + configMap.new('overrides-presets') + + configMap.withData({ + 'overrides-presets.yaml': $.util.manifestYaml( + { + presets: { + [key]: $._config.overrides[key] + for key in $._config.overrides_exporter_presets + }, + } + ), + }), + + local containerPort = $.core.v1.containerPort, + overrides_exporter_port:: containerPort.newNamed(name='http-metrics', containerPort=9683), + + overrides_exporter_args:: { + 'overrides-file': '/etc/cortex/overrides.yaml', + } + if presets_enabled then { + 'presets-file': '/etc/cortex_presets/overrides-presets.yaml', + } else {}, + + local container = $.core.v1.container, + overrides_exporter_container:: + container.new(name, $._images.cortex_tools) + + container.withPorts([ + $.overrides_exporter_port, + ]) + + container.withArgsMixin([name] + $.util.mapToFlags($.overrides_exporter_args, prefix='--')) + + $.util.resourcesRequests('0.5', '0.5Gi') + + $.util.readinessProbe + + container.mixin.readinessProbe.httpGet.withPort($.overrides_exporter_port.name), + + local deployment = $.apps.v1.deployment, + overrides_exporter_deployment: + deployment.new(name, 1, [$.overrides_exporter_container], { name: name }) + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount('overrides-presets', '/etc/cortex_presets') + + deployment.mixin.metadata.withLabels({ name: name }), + + overrides_exporter_service: + $.util.serviceFor($.overrides_exporter_deployment), +} From 3a61f950b27eb357021341f95502261aed309751 Mon Sep 17 00:00:00 2001 From: 
Christian Simon Date: Mon, 12 Oct 2020 11:41:30 +0100 Subject: [PATCH 097/192] Refactor limits and overrides Ensure we expose 'extra_small_user' and reference it setting the "default" values. This will raise the limits of the 'small_user' preset to the defaults for `ingester.max-samples-per-query` and `ingester.max-series-per-query`. Signed-off-by: Christian Simon --- operations/mimir/config.libsonnet | 53 ++++++++++--------- operations/mimir/distributor.libsonnet | 4 +- operations/mimir/ingester.libsonnet | 10 ++-- operations/mimir/overrides-exporter.libsonnet | 3 ++ 4 files changed, 38 insertions(+), 32 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index f80fbcc56f2..291a01ab933 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -274,39 +274,40 @@ fallback_config: {}, }, + // === Per-tenant usage limits. === + // + // These are the defaults. Distributor limits will be 5x (#replicas) higher, + // ingester limits are 6s (#replicas) / 3x (#replication factor) higher. + limits: $._config.overrides.extra_small_user, + overrides: { - // === Per-tenant usage limits. === - // - // These are the defaults. Distributor limits will be 5x (#replicas) higher, - // ingester limits are 6s (#replicas) / 3x (#replication factor) higher. 
- // - // extra_small_user: { - // ingestion_rate: 10,000 - // ingestion_burst_size: 200,000 - // - // max_series_per_user: 0 (disabled) - // max_series_per_metric: 0 (disabled) - // - // // Our limit should be 100k, but we need some room of about ~50% to take rollouts into account - // max_global_series_per_user: 150,000 - // max_global_series_per_metric: 20,000 - // - // max_series_per_query: 10,000 - // max_samples_per_query: 100,000 - // }, + extra_small_user:: { + max_series_per_user: 0, // Disabled in favour of the max global limit + max_series_per_metric: 0, // Disabled in favour of the max global limit - small_user:: { - ingestion_rate: 100000, - ingestion_burst_size: 1000000, + // Our limit should be 100k, but we need some room of about ~50% to take rollouts into account + max_global_series_per_user: 150000, + max_global_series_per_metric: 20000, - max_series_per_user: 0, - max_series_per_metric: 0, + max_series_per_query: 100000, + max_samples_per_query: 1000000, + + ingestion_rate: 10000, + ingestion_burst_size: 200000, + }, + + small_user:: { + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit max_global_series_per_user: 1000000, max_global_series_per_metric: 100000, - max_series_per_query: 10000, - max_samples_per_query: 100000, + max_series_per_query: 100000, + max_samples_per_query: 1000000, + + ingestion_rate: 100000, + ingestion_burst_size: 1000000, }, medium_user:: { diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 7c8e49d4796..d18fb88a6cb 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -29,8 +29,8 @@ 'server.grpc.keepalive.max-connection-idle': '1m', 'distributor.ingestion-rate-limit-strategy': 'global', - 'distributor.ingestion-rate-limit': 10000, // 10K - 'distributor.ingestion-burst-size': 200000, // 200k + 'distributor.ingestion-rate-limit': 
$._config.limits.ingestion_rate, + 'distributor.ingestion-burst-size': $._config.limits.ingestion_burst_size, // The ingestion rate global limit requires the distributors to form a ring. 'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 2201351cb58..e057b5828ea 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -22,10 +22,12 @@ // Limits config. 'ingester.max-chunk-idle': $._config.max_chunk_idle, - 'ingester.max-global-series-per-user': 150000, // 150K - 'ingester.max-global-series-per-metric': 20000, // 20K - 'ingester.max-series-per-user': 0, // Disabled in favour of the max global limit - 'ingester.max-series-per-metric': 0, // Disabled in favour of the max global limit + 'ingester.max-series-per-user': $._config.limits.max_series_per_user, + 'ingester.max-series-per-metric': $._config.limits.max_series_per_metric, + 'ingester.max-global-series-per-user': $._config.limits.max_global_series_per_user, + 'ingester.max-global-series-per-metric': $._config.limits.max_global_series_per_metric, + 'ingester.max-series-per-query': $._config.limits.max_series_per_query, + 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', 'server.grpc-max-concurrent-streams': 100000, } + ( diff --git a/operations/mimir/overrides-exporter.libsonnet b/operations/mimir/overrides-exporter.libsonnet index 2c59c1d4635..671d5d26f75 100644 --- a/operations/mimir/overrides-exporter.libsonnet +++ b/operations/mimir/overrides-exporter.libsonnet @@ -7,7 +7,10 @@ _config+: { // overrides exporter can also make the configured presets available, this // list references entries within $._config.overrides + overrides_exporter_presets:: [ + 'extra_small_user', + 'small_user', 'medium_user', 'big_user', 'super_user', From 
4bdfc007b43b9ae9ffb240e61d90155f116be141 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 29 Oct 2020 10:00:52 +0100 Subject: [PATCH 098/192] Removed support for ingester.statefulset_replicas Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 1 - operations/mimir/ingester.libsonnet | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 291a01ab933..3cd50cd7c29 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -75,7 +75,6 @@ ingester: { // These config options are only for the chunks storage. wal_dir: '/wal_data', - statefulset_replicas: 3, statefulset_disk: '150Gi', }, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index e057b5828ea..785542a658c 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -91,7 +91,7 @@ ingester_statefulset: if $._config.ingester_deployment_without_wal == false then - statefulSet.new('ingester', $._config.ingester.statefulset_replicas, [$.ingester_statefulset_container], ingester_pvc) + + statefulSet.new('ingester', 3, [$.ingester_statefulset_container], ingester_pvc) + statefulSet.mixin.spec.withServiceName('ingester') + statefulSet.mixin.spec.template.spec.withVolumes([volume.fromPersistentVolumeClaim('ingester-pvc', 'ingester-pvc')]) + statefulSet.mixin.metadata.withNamespace($._config.namespace) + From 18d1a4b7d3f12435d0dee7d29939f227d59aa0c1 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 10 Nov 2020 16:54:37 +0100 Subject: [PATCH 099/192] Switch compactor statefulset to Parallel pod management policy Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 65770f08139..1aa975f3fac 100644 --- a/operations/mimir/tsdb.libsonnet +++ 
b/operations/mimir/tsdb.libsonnet @@ -157,7 +157,12 @@ statefulSet.mixin.spec.selector.withMatchLabels({ name: 'compactor' }) + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900), + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + + // Parallelly scale up/down compactor instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), // The store-gateway runs a statefulset. local store_gateway_data_pvc = From 026e5ab6d010908f7c28ce89414b34b4bc3d80a7 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 12 Nov 2020 09:24:56 +0100 Subject: [PATCH 100/192] Cut 1.5.0 release Signed-off-by: Marco Pracucci --- operations/mimir/images.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index c02f51b28a2..64b2f6e388a 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.4.0', + cortex: 'cortexproject/cortex:v1.5.0', alertmanager: self.cortex, distributor: self.cortex, From a3208c687b6ccbc7601a00783ddd905974f7bbcc Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 9 Nov 2020 14:29:30 +0000 Subject: [PATCH 101/192] Add ruler limits Sets default presets for for all the 'users' when it comes to ruler limits. 
--- operations/mimir/config.libsonnet | 23 +++++++++++++++++++++++ operations/mimir/ruler.libsonnet | 4 ++++ 2 files changed, 27 insertions(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 3cd50cd7c29..127d047995d 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -293,6 +293,10 @@ ingestion_rate: 10000, ingestion_burst_size: 200000, + + // 300 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 20, }, small_user:: { @@ -307,6 +311,10 @@ ingestion_rate: 100000, ingestion_burst_size: 1000000, + + // 450 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 30, }, medium_user:: { @@ -321,6 +329,10 @@ ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M + + // 600 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 40, }, big_user:: { @@ -335,6 +347,10 @@ ingestion_rate: 700000, // 700K ingestion_burst_size: 7000000, // 7M + + // 750 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 50, }, super_user:: { @@ -349,6 +365,10 @@ ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M + + // 900 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 60, }, // This user class has limits increased by +50% compared to the previous one. 
@@ -364,6 +384,9 @@ ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M + + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 70, }, }, diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 5e2ae18cae7..7228c470208 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -18,6 +18,10 @@ // Ring Configs 'ruler.enable-sharding': true, 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + + // Limits + 'ruler.max-rules-per-rule-group': $._config.limits.max_rules_per_rule_group, + 'ruler.max-rule-groups-per-tenant': $._config.limits.max_rule_groups_per_tenant, }, ruler_container:: From fdd325f913bf31a0fa55a3a4accccf714d0557f1 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 18 Nov 2020 12:30:48 +0000 Subject: [PATCH 102/192] Add for the last user --- operations/mimir/config.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 127d047995d..17519ae78dc 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -385,6 +385,7 @@ ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M + // 1050 rules max_rules_per_rule_group: 15, max_rule_groups_per_tenant: 70, }, From d72dae507720f50376c8a35fae41b0a98bfa85ec Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 20 Nov 2020 09:45:40 +0100 Subject: [PATCH 103/192] Enabled compactor sharding Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 65770f08139..f6bec5e641a 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -134,6 +134,12 @@ 'compactor.data-dir': '/data', 'compactor.compaction-interval': '30m', 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, + + 
// Enable sharding. + 'compactor.sharding-enabled': true, + 'compactor.ring.store': 'consul', + 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'compactor.ring.prefix': '', }, compactor_ports:: $.util.defaultPorts, From 2c012085a32fb4f62f36605cab271df628e4e84d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 20 Nov 2020 11:19:49 +0100 Subject: [PATCH 104/192] Rollback PR 213 Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 24 ------------------------ operations/mimir/ruler.libsonnet | 4 ---- 2 files changed, 28 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 17519ae78dc..3cd50cd7c29 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -293,10 +293,6 @@ ingestion_rate: 10000, ingestion_burst_size: 200000, - - // 300 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 20, }, small_user:: { @@ -311,10 +307,6 @@ ingestion_rate: 100000, ingestion_burst_size: 1000000, - - // 450 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 30, }, medium_user:: { @@ -329,10 +321,6 @@ ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M - - // 600 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 40, }, big_user:: { @@ -347,10 +335,6 @@ ingestion_rate: 700000, // 700K ingestion_burst_size: 7000000, // 7M - - // 750 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 50, }, super_user:: { @@ -365,10 +349,6 @@ ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M - - // 900 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 60, }, // This user class has limits increased by +50% compared to the previous one. 
@@ -384,10 +364,6 @@ ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M - - // 1050 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 70, }, }, diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 7228c470208..5e2ae18cae7 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -18,10 +18,6 @@ // Ring Configs 'ruler.enable-sharding': true, 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - - // Limits - 'ruler.max-rules-per-rule-group': $._config.limits.max_rules_per_rule_group, - 'ruler.max-rule-groups-per-tenant': $._config.limits.max_rule_groups_per_tenant, }, ruler_container:: From 5c3c71ac900d998cd88e0d7ffbc833fed4647041 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 20 Nov 2020 11:48:43 +0100 Subject: [PATCH 105/192] Re-introduce ruler limits Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 24 ++++++++++++++++++++++++ operations/mimir/ruler.libsonnet | 4 ++++ 2 files changed, 28 insertions(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 3cd50cd7c29..17519ae78dc 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -293,6 +293,10 @@ ingestion_rate: 10000, ingestion_burst_size: 200000, + + // 300 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 20, }, small_user:: { @@ -307,6 +311,10 @@ ingestion_rate: 100000, ingestion_burst_size: 1000000, + + // 450 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 30, }, medium_user:: { @@ -321,6 +329,10 @@ ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M + + // 600 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 40, }, big_user:: { @@ -335,6 +347,10 @@ ingestion_rate: 700000, // 700K ingestion_burst_size: 7000000, // 7M + + // 750 rules + max_rules_per_rule_group: 15, + 
max_rule_groups_per_tenant: 50, }, super_user:: { @@ -349,6 +365,10 @@ ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M + + // 900 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 60, }, // This user class has limits increased by +50% compared to the previous one. @@ -364,6 +384,10 @@ ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M + + // 1050 rules + max_rules_per_rule_group: 15, + max_rule_groups_per_tenant: 70, }, }, diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 5e2ae18cae7..7228c470208 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -18,6 +18,10 @@ // Ring Configs 'ruler.enable-sharding': true, 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + + // Limits + 'ruler.max-rules-per-rule-group': $._config.limits.max_rules_per_rule_group, + 'ruler.max-rule-groups-per-tenant': $._config.limits.max_rule_groups_per_tenant, }, ruler_container:: From e7af91a1934af82bf31aa799b5e6df47b5e0f5d7 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 20 Nov 2020 13:13:49 +0000 Subject: [PATCH 106/192] [fixup] ruler limits config key name Ruler limits have a prefix of `ruler_` on the config key name. This makes the key match and then uses them as the value for the flags. 
--- operations/mimir/config.libsonnet | 24 ++++++++++++------------ operations/mimir/ruler.libsonnet | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 17519ae78dc..d65e497e87d 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -295,8 +295,8 @@ ingestion_burst_size: 200000, // 300 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 20, + ruler_max_rules_per_rule_group: 15, + ruler_max_rule_groups_per_tenant: 20, }, small_user:: { @@ -313,8 +313,8 @@ ingestion_burst_size: 1000000, // 450 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 30, + ruler_max_rules_per_rule_group: 15, + ruler_max_rule_groups_per_tenant: 30, }, medium_user:: { @@ -331,8 +331,8 @@ ingestion_burst_size: 3500000, // 3.5M // 600 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 40, + ruler_max_rules_per_rule_group: 15, + ruler_max_rule_groups_per_tenant: 40, }, big_user:: { @@ -349,8 +349,8 @@ ingestion_burst_size: 7000000, // 7M // 750 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 50, + ruler_max_rules_per_rule_group: 15, + ruler_max_rule_groups_per_tenant: 50, }, super_user:: { @@ -367,8 +367,8 @@ ingestion_burst_size: 15000000, // 15M // 900 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 60, + ruler_max_rules_per_rule_group: 15, + ruler_max_rule_groups_per_tenant: 60, }, // This user class has limits increased by +50% compared to the previous one. 
@@ -386,8 +386,8 @@ ingestion_burst_size: 22500000, // 22.5M // 1050 rules - max_rules_per_rule_group: 15, - max_rule_groups_per_tenant: 70, + ruler_max_rules_per_rule_group: 15, + ruler_max_rule_groups_per_tenant: 70, }, }, diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 7228c470208..8a6902cc3eb 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -20,8 +20,8 @@ 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, // Limits - 'ruler.max-rules-per-rule-group': $._config.limits.max_rules_per_rule_group, - 'ruler.max-rule-groups-per-tenant': $._config.limits.max_rule_groups_per_tenant, + 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, + 'ruler.max-rule-groups-per-tenant': $._config.limits.ruler_max_rule_groups_per_tenant, }, ruler_container:: From 51d8e00efcbf7b5af3b5a267a8f844078ba2aeb8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Dec 2020 12:48:31 +0100 Subject: [PATCH 107/192] Removed postings-compression-enabled Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 1 - 1 file changed, 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 1850b5e55d9..98b2ebd97ef 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -36,7 +36,6 @@ 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', - 'blocks-storage.bucket-store.index-cache.postings-compression-enabled': 'true', } else {} ) + ( if $._config.memcached_chunks_enabled then { From 3019e2323541d83eb61cd1fa4699a72995e0a1e0 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Dec 2020 13:09:23 +0100 Subject: [PATCH 108/192] Fine-tuned gRPC keepalive pings settings 
Signed-off-by: Marco Pracucci --- operations/mimir/alertmanager.libsonnet | 1 + operations/mimir/config.libsonnet | 5 ++ operations/mimir/distributor.libsonnet | 1 + operations/mimir/ingester.libsonnet | 1 + operations/mimir/querier.libsonnet | 1 + operations/mimir/query-frontend.libsonnet | 91 ++++++++++++----------- operations/mimir/ruler.libsonnet | 1 + operations/mimir/tsdb.libsonnet | 2 + 8 files changed, 60 insertions(+), 43 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 08212b6538e..76c069da35d 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -17,6 +17,7 @@ else [], alertmanager_args:: + $._config.grpcConfig + { target: 'alertmanager', 'log.level': 'debug', diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index d65e497e87d..ae8aae2d233 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -139,6 +139,11 @@ } else {}, + grpcConfig:: { + 'server.grpc.keepalive.min-time-between-pings': '10s', + 'server.grpc.keepalive.ping-without-stream-allowed': true, + }, + storageConfig: $._config.client_configs.aws + $._config.client_configs.cassandra + diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index d18fb88a6cb..69147ff50a0 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -3,6 +3,7 @@ local containerPort = $.core.v1.containerPort, distributor_args:: + $._config.grpcConfig + $._config.ringConfig + $._config.distributorConfig + { diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 785542a658c..7916b8f9896 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -1,5 +1,6 @@ { ingester_args:: + $._config.grpcConfig + $._config.ringConfig + $._config.storeConfig + $._config.storageConfig + diff --git 
a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 83d6384515f..fceee38a681 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -2,6 +2,7 @@ local container = $.core.v1.container, querier_args:: + $._config.grpcConfig + $._config.ringConfig + $._config.storeConfig + $._config.storageConfig + diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index d73581828bb..e262713ecb4 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -1,49 +1,54 @@ { local container = $.core.v1.container, - query_frontend_args:: { - target: 'query-frontend', - - // Need log.level=debug so all queries are logged, needed for analyse.py. - 'log.level': 'debug', - - // Increase HTTP server response write timeout, as we were seeing some - // queries that return a lot of data timeing out. - 'server.http-write-timeout': '1m', - - // Split long queries up into multiple day-long queries. - 'querier.split-queries-by-interval': '24h', - - // Cache query results. - 'querier.align-querier-with-step': true, - 'querier.cache-results': true, - 'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace, - 'frontend.memcached.service': 'memcached-client', - 'frontend.memcached.timeout': '500ms', - - // So that exporters like cloudwatch can still send in data and be un-cached. - 'frontend.max-cache-freshness': '10m', - - // Compress HTTP responses; improves latency for very big results and slow - // connections. - 'querier.compress-http-responses': true, - - // So it can recieve big responses from the querier. - 'server.grpc-max-recv-msg-size-bytes': 100 << 20, - - // Limit queries to 500 days, allow this to be override per-user. 
- 'store.max-query-length': '12000h', // 500 Days - 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', - } + if $._config.queryFrontend.sharded_queries_enabled then { - 'querier.parallelise-shardable-queries': 'true', - - // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. - // basically base * shard_factor * query_split_factor / num_frontends where - 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas), - - 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], - } + $._config.storageConfig - else {}, + query_frontend_args:: + $._config.ringConfig + + { + target: 'query-frontend', + + // Need log.level=debug so all queries are logged, needed for analyse.py. + 'log.level': 'debug', + + // Increase HTTP server response write timeout, as we were seeing some + // queries that return a lot of data timeing out. + 'server.http-write-timeout': '1m', + + // Split long queries up into multiple day-long queries. + 'querier.split-queries-by-interval': '24h', + + // Cache query results. + 'querier.align-querier-with-step': true, + 'querier.cache-results': true, + 'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace, + 'frontend.memcached.service': 'memcached-client', + 'frontend.memcached.timeout': '500ms', + + // So that exporters like cloudwatch can still send in data and be un-cached. + 'frontend.max-cache-freshness': '10m', + + // Compress HTTP responses; improves latency for very big results and slow + // connections. + 'querier.compress-http-responses': true, + + // So it can receive big responses from the querier. + 'server.grpc-max-recv-msg-size-bytes': 100 << 20, + + // Limit queries to 500 days, allow this to be override per-user. 
+ 'store.max-query-length': '12000h', // 500 Days + 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + } + ( + if $._config.queryFrontend.sharded_queries_enabled then + { + 'querier.parallelise-shardable-queries': 'true', + + // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. + // basically base * shard_factor * query_split_factor / num_frontends where + 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas), + + 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], + } + $._config.storageConfig + else {} + ), query_frontend_container:: container.new('query-frontend', $._images.query_frontend) + diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 8a6902cc3eb..c342586f83e 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -2,6 +2,7 @@ local container = $.core.v1.container, ruler_args:: + $._config.grpcConfig + $._config.ringConfig + $._config.storeConfig + $._config.storageConfig + diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 98b2ebd97ef..054a30e59b2 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -123,6 +123,7 @@ pvc.mixin.metadata.withName('compactor-data'), compactor_args:: + $._config.grpcConfig + $._config.storageConfig + $._config.blocksStorageConfig + { @@ -178,6 +179,7 @@ pvc.mixin.metadata.withName('store-gateway-data'), store_gateway_args:: + $._config.grpcConfig + $._config.storageConfig + $._config.blocksStorageConfig + { From aeb58635bd4baf2ca0a3f0cd0a3ec70926429267 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Dec 2020 14:21:34 +0100 Subject: [PATCH 109/192] Fixed gRPC settings Signed-off-by: Marco Pracucci --- 
operations/mimir/ingester.libsonnet | 20 +++++++++++--------- operations/mimir/query-frontend.libsonnet | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 7916b8f9896..d27a78a56b4 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -41,15 +41,17 @@ else {} ), - ingester_statefulset_args:: { - 'ingester.wal-enabled': true, - 'ingester.checkpoint-enabled': true, - 'ingester.recover-from-wal': true, - 'ingester.wal-dir': $._config.ingester.wal_dir, - 'ingester.checkpoint-duration': '15m', - '-log.level': 'info', - 'ingester.tokens-file-path': $._config.ingester.wal_dir + '/tokens', - }, + ingester_statefulset_args:: + $._config.grpcConfig + + { + 'ingester.wal-enabled': true, + 'ingester.checkpoint-enabled': true, + 'ingester.recover-from-wal': true, + 'ingester.wal-dir': $._config.ingester.wal_dir, + 'ingester.checkpoint-duration': '15m', + '-log.level': 'info', + 'ingester.tokens-file-path': $._config.ingester.wal_dir + '/tokens', + }, ingester_ports:: $.util.defaultPorts, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index e262713ecb4..b306b23d069 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -2,7 +2,7 @@ local container = $.core.v1.container, query_frontend_args:: - $._config.ringConfig + + $._config.grpcConfig + { target: 'query-frontend', From 9a3d0acbece20616a0668224a1f7d4c774e36d97 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 10:39:26 +0100 Subject: [PATCH 110/192] Release 1.6.0 Signed-off-by: Marco Pracucci --- operations/mimir/images.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 64b2f6e388a..9714b3022fd 100644 --- a/operations/mimir/images.libsonnet +++ 
b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.5.0', + cortex: 'cortexproject/cortex:v1.6.0', alertmanager: self.cortex, distributor: self.cortex, From dffba444e53e71604041d18e681a516723ec345b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 10:35:19 +0100 Subject: [PATCH 111/192] Add option to configure unregister ingesters on shutdown Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 6 ++++++ operations/mimir/ingester.libsonnet | 6 ++++++ operations/mimir/querier.libsonnet | 8 +++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index ae8aae2d233..765ad216a7c 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -13,6 +13,12 @@ aws_region: error 'must specify AWS region', s3_bucket_name: error 'must specify S3 bucket name', + // If disabled, ingesters are not unregistered on shutdown and left in the ring with + // the LEAVING state. Disabling it prevents series resharding during ingesters rollouts, + // but requires to manually forget ingesters on scale down and that ingester ID is preserved + // during rollouts. 
+ unregister_ingesters_on_shutdown: true, + // schema is used to generate the storage schema yaml file used by // the Cortex chunks storage: // - More information: https://github.com/cortexproject/cortex/pull/1072 diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index d27a78a56b4..a6b9ca483f6 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -39,6 +39,12 @@ 'store.index-cache-write.memcached.service': 'memcached-client', } else {} + ) + ( + if !$._config.unregister_ingesters_on_shutdown then + { + 'ingester.unregister-on-shutdown': false, + } + else {} ), ingester_statefulset_args:: diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index fceee38a681..c488231ac01 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -27,7 +27,13 @@ 'querier.second-store-engine': $._config.querier_second_storage_engine, 'log.level': 'debug', - }, + } + ( + if !$._config.unregister_ingesters_on_shutdown then + { + 'distributor.extend-writes': false, + } + else {} + ), querier_ports:: $.util.defaultPorts, From 30f9d64daea0c641d611dd4ee75a08583bd8b483 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 10:52:31 +0100 Subject: [PATCH 112/192] Fixed config Signed-off-by: Marco Pracucci --- operations/mimir/distributor.libsonnet | 8 +++++++- operations/mimir/querier.libsonnet | 8 +------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 69147ff50a0..64f9c46daa0 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -36,7 +36,13 @@ // The ingestion rate global limit requires the distributors to form a ring. 
'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'distributor.ring.prefix': '', - }, + } + ( + if !$._config.unregister_ingesters_on_shutdown then + { + 'distributor.extend-writes': false, + } + else {} + ), distributor_ports:: $.util.defaultPorts, diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index c488231ac01..fceee38a681 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -27,13 +27,7 @@ 'querier.second-store-engine': $._config.querier_second_storage_engine, 'log.level': 'debug', - } + ( - if !$._config.unregister_ingesters_on_shutdown then - { - 'distributor.extend-writes': false, - } - else {} - ), + }, querier_ports:: $.util.defaultPorts, From b2dd6d5b568ca88528802e856a323e66f39f2e00 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 10:53:20 +0100 Subject: [PATCH 113/192] Improved comment Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 765ad216a7c..2dde68c7537 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -13,8 +13,8 @@ aws_region: error 'must specify AWS region', s3_bucket_name: error 'must specify S3 bucket name', - // If disabled, ingesters are not unregistered on shutdown and left in the ring with - // the LEAVING state. Disabling it prevents series resharding during ingesters rollouts, + // If false, ingesters are not unregistered on shutdown and left in the ring with + // the LEAVING state. Setting to false prevents series resharding during ingesters rollouts, // but requires to manually forget ingesters on scale down and that ingester ID is preserved // during rollouts. 
unregister_ingesters_on_shutdown: true, From 061ed16b7fb918aac9f63f3285784177302c4525 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 12:04:56 +0100 Subject: [PATCH 114/192] Updated doc Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 2dde68c7537..03f5c8d984c 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -15,8 +15,9 @@ // If false, ingesters are not unregistered on shutdown and left in the ring with // the LEAVING state. Setting to false prevents series resharding during ingesters rollouts, - // but requires to manually forget ingesters on scale down and that ingester ID is preserved - // during rollouts. + // but requires to: + // 1. Either manually forget ingesters on scale down or invoke the /shutdown endpoint + // 2. Ensure ingester ID is preserved during rollouts unregister_ingesters_on_shutdown: true, // schema is used to generate the storage schema yaml file used by From 680189a90ae1951449143d949c15b3acec1676f6 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 12:10:05 +0100 Subject: [PATCH 115/192] Removed ifs Signed-off-by: Marco Pracucci --- operations/mimir/distributor.libsonnet | 10 ++++------ operations/mimir/ingester.libsonnet | 7 +------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 64f9c46daa0..caa9441f83c 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -36,12 +36,10 @@ // The ingestion rate global limit requires the distributors to form a ring. 
'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'distributor.ring.prefix': '', - } + ( - if !$._config.unregister_ingesters_on_shutdown then - { - 'distributor.extend-writes': false, - } - else {} + + // Do not extend the replication set on unhealthy ingester when "unregister on shutdown" + // is disabled. + 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, ), distributor_ports:: $.util.defaultPorts, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index a6b9ca483f6..161fe38abf7 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -15,6 +15,7 @@ 'ingester.max-transfer-retries': 60, // Each retry is backed off by 5s, so 5mins for new ingester to come up. 'ingester.heartbeat-period': '15s', 'ingester.max-stale-chunk-idle': '5m', + 'ingester.unregister-on-shutdown': $._config.unregister_ingesters_on_shutdown, // Chunk building/flushing config. 'ingester.chunk-encoding': 3, // Bigchunk encoding @@ -39,12 +40,6 @@ 'store.index-cache-write.memcached.service': 'memcached-client', } else {} - ) + ( - if !$._config.unregister_ingesters_on_shutdown then - { - 'ingester.unregister-on-shutdown': false, - } - else {} ), ingester_statefulset_args:: From eb981b551bc144ae5d2e3cb08752efb27e3b9695 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 12:43:40 +0100 Subject: [PATCH 116/192] Updated comment Signed-off-by: Marco Pracucci --- operations/mimir/distributor.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index caa9441f83c..d808cb11a0f 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -37,8 +37,8 @@ 'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'distributor.ring.prefix': '', - // Do not extend the replication 
set on unhealthy ingester when "unregister on shutdown" - // is disabled. + // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" + // is set to false. 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, ), From 003ca9e430456e8ef5648ba20a79b9cfd9206c68 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 13:20:01 +0100 Subject: [PATCH 117/192] Fixed syntax error Signed-off-by: Marco Pracucci --- operations/mimir/distributor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index d808cb11a0f..8d0e33caeb8 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -40,7 +40,7 @@ // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" // is set to false. 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, - ), + }, distributor_ports:: $.util.defaultPorts, From 2e64bf63187c662ab0de2ee46a9312f20f2d921d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 8 Jan 2021 13:54:41 +0100 Subject: [PATCH 118/192] Remove misleading comment (https://github.com/grafana/cortex-jsonnet/pull/243) Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 03f5c8d984c..5550077fe84 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -287,8 +287,7 @@ // === Per-tenant usage limits. === // - // These are the defaults. Distributor limits will be 5x (#replicas) higher, - // ingester limits are 6s (#replicas) / 3x (#replication factor) higher. + // These are the defaults. 
limits: $._config.overrides.extra_small_user, overrides: { From cca8c2e4921a7dd497ccb94f16e60943238057ce Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Mon, 11 Jan 2021 19:25:47 +0100 Subject: [PATCH 119/192] Add option to customise the configmap name Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 4 +++- operations/mimir/distributor.libsonnet | 2 +- operations/mimir/flusher-job-blocks.libsonnet | 2 +- operations/mimir/flusher-job.libsonnet | 2 +- operations/mimir/ingester.libsonnet | 4 ++-- operations/mimir/overrides-exporter.libsonnet | 2 +- operations/mimir/querier.libsonnet | 2 +- operations/mimir/query-frontend.libsonnet | 2 +- operations/mimir/ruler.libsonnet | 2 +- operations/mimir/tsdb.libsonnet | 4 ++-- 10 files changed, 14 insertions(+), 12 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 5550077fe84..6a20ddfe806 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -290,6 +290,8 @@ // These are the defaults. 
limits: $._config.overrides.extra_small_user, + overrides_configmap: 'overrides', + overrides: { extra_small_user:: { max_series_per_user: 0, // Disabled in favour of the max global limit @@ -415,7 +417,7 @@ local configMap = $.core.v1.configMap, overrides_config: - configMap.new('overrides') + + configMap.new($._config.overrides_configmap) + configMap.withData({ 'overrides.yaml': $.util.manifestYaml( { diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 8d0e33caeb8..19f66d8f84f 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -60,7 +60,7 @@ distributor_deployment: deployment.new('distributor', 3, [$.distributor_container], $.distributor_deployment_labels) + $.util.antiAffinity + - $.util.configVolumeMount('overrides', '/etc/cortex'), + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), local service = $.core.v1.service, diff --git a/operations/mimir/flusher-job-blocks.libsonnet b/operations/mimir/flusher-job-blocks.libsonnet index 7a99b005333..1e6266caf72 100644 --- a/operations/mimir/flusher-job-blocks.libsonnet +++ b/operations/mimir/flusher-job-blocks.libsonnet @@ -44,6 +44,6 @@ job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.podPriority('high'), } diff --git a/operations/mimir/flusher-job.libsonnet b/operations/mimir/flusher-job.libsonnet index 78eadeb7c5a..4d9a576240c 100644 --- a/operations/mimir/flusher-job.libsonnet +++ b/operations/mimir/flusher-job.libsonnet @@ -46,6 +46,6 @@ job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + 
job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.podPriority('high'), } diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 161fe38abf7..a0b6ac7dcb8 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -106,7 +106,7 @@ statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + $.statefulset_storage_config_mixin + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.podPriority('high') + $.util.antiAffinityStatefulSet else null, @@ -117,7 +117,7 @@ if $._config.ingester_deployment_without_wal then deployment.new(name, 3, [$.ingester_container], $.ingester_deployment_labels) + $.util.antiAffinity + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + deployment.mixin.metadata.withLabels({ name: name }) + deployment.mixin.spec.withMinReadySeconds(60) + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + diff --git a/operations/mimir/overrides-exporter.libsonnet b/operations/mimir/overrides-exporter.libsonnet index 671d5d26f75..d8eb411ad27 100644 --- a/operations/mimir/overrides-exporter.libsonnet +++ b/operations/mimir/overrides-exporter.libsonnet @@ -58,7 +58,7 @@ local deployment = $.apps.v1.deployment, overrides_exporter_deployment: deployment.new(name, 1, [$.overrides_exporter_container], { name: name }) + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.configVolumeMount('overrides-presets', '/etc/cortex_presets') + deployment.mixin.metadata.withLabels({ name: name }), diff --git 
a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index fceee38a681..0f92b8039e8 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -56,7 +56,7 @@ querier_deployment: deployment.new('querier', $._config.querier.replicas, [$.querier_container], $.querier_deployment_labels) + $.util.antiAffinity + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.storage_config_mixin, local service = $.core.v1.service, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index b306b23d069..3386a312159 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -70,7 +70,7 @@ query_frontend_deployment: deployment.new('query-frontend', $._config.queryFrontend.replicas, [$.query_frontend_container]) + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.antiAffinity + // inject storage schema in order to know what/how to shard if $._config.queryFrontend.sharded_queries_enabled then diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index c342586f83e..81412899b16 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -45,7 +45,7 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + $.util.antiAffinity + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.storage_config_mixin else {}, diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 054a30e59b2..eb43f8851cc 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -99,7 +99,7 @@ // For this reason, we grant an high 
termination period (80 minutes). statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.podPriority('high') + $.util.antiAffinity + // Parallelly scale up/down ingester instances instead of starting them @@ -218,7 +218,7 @@ // rolled out one by one (the next pod will be rolled out once the previous is // ready). statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + - $.util.configVolumeMount('overrides', '/etc/cortex'), + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), store_gateway_service: $.util.serviceFor($.store_gateway_statefulset), From b327bd60e7851e909cbdf90c0bad6ba1b7d0d2d4 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 19 Jan 2021 17:49:57 +0100 Subject: [PATCH 120/192] Fix for real Signed-off-by: Marco Pracucci --- operations/mimir/ruler.libsonnet | 3 +++ 1 file changed, 3 insertions(+) diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 81412899b16..a394b88a8b7 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -23,6 +23,9 @@ // Limits 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, 'ruler.max-rule-groups-per-tenant': $._config.limits.ruler_max_rule_groups_per_tenant, + + // Storage + 'querier.second-store-engine': $._config.querier_second_storage_engine, }, ruler_container:: From 18a162aabfc33f414fa1d68c395935bee37393a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 20 Jan 2021 16:33:44 +0100 Subject: [PATCH 121/192] Added bucket index flag, and enable bucket index by default. 
(https://github.com/grafana/cortex-jsonnet/pull/254) --- operations/mimir/tsdb.libsonnet | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index eb43f8851cc..92ecbf9b74e 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -24,6 +24,12 @@ // Allow to fine tune compactor. cortex_compactor_max_concurrency: 1, + // While this is the default value, we want to pass the same to the -blocks-storage.bucket-store.sync-interval + cortex_compactor_cleanup_interval: '15m', + + // Enable use of bucket index by querier, ruler and store-gateway. + // Bucket index is generated by compactor from Cortex 1.7, there is no flag required to enable this on compactor. + cortex_bucket_index_enabled: false, }, blocks_chunks_caching_config:: @@ -59,8 +65,15 @@ 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', } else {}, - querier_args+:: $.blocks_metadata_caching_config, - ruler_args+:: $.blocks_metadata_caching_config, + bucket_index_config:: if $._config.cortex_bucket_index_enabled then { + 'blocks-storage.bucket-store.bucket-index.enabled': true, + + // Bucket index is updated by compactor on each cleanup cycle. + 'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval, + } else {}, + + querier_args+:: $.blocks_metadata_caching_config + $.bucket_index_config, + ruler_args+:: $.blocks_metadata_caching_config + $.bucket_index_config, // The ingesters should persist TSDB blocks and WAL on a persistent // volume in order to be crash resilient. @@ -134,6 +147,7 @@ 'compactor.data-dir': '/data', 'compactor.compaction-interval': '30m', 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, + 'compactor.cleanup-interval': $._config.cortex_compactor_cleanup_interval, // Enable sharding. 
'compactor.sharding-enabled': true, @@ -189,7 +203,10 @@ // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', - } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config, + } + + $.blocks_chunks_caching_config + + $.blocks_metadata_caching_config + + $.bucket_index_config, store_gateway_ports:: $.util.defaultPorts, From 758d4f81fa7d7877e4b15bd42f6a9a9e89d649a3 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 5 Feb 2021 09:55:59 +0100 Subject: [PATCH 122/192] Cleanup blocks storage config Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 6 ++---- operations/mimir/tsdb.libsonnet | 10 ++++++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 6a20ddfe806..65498121bea 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -159,12 +159,10 @@ genericBlocksStorageConfig:: { 'store.engine': $._config.storage_engine, // May still be chunks - 'blocks-storage.tsdb.dir': '/data/tsdb', + }, + queryBlocksStorageConfig:: { 'blocks-storage.bucket-store.sync-dir': '/data/tsdb', 'blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', - 'blocks-storage.tsdb.block-ranges-period': '2h', - 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
- 'blocks-storage.tsdb.ship-interval': '1m', 'store-gateway.sharding-enabled': true, 'store-gateway.sharding-ring.store': 'consul', diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 92ecbf9b74e..56ff249d86b 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -72,8 +72,8 @@ 'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval, } else {}, - querier_args+:: $.blocks_metadata_caching_config + $.bucket_index_config, - ruler_args+:: $.blocks_metadata_caching_config + $.bucket_index_config, + querier_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, + ruler_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, // The ingesters should persist TSDB blocks and WAL on a persistent // volume in order to be crash resilient. @@ -87,6 +87,11 @@ ingester_deployment: {}, ingester_args+:: { + 'blocks-storage.tsdb.dir': '/data/tsdb', + 'blocks-storage.tsdb.block-ranges-period': '2h', + 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
+ 'blocks-storage.tsdb.ship-interval': '1m', + // Disable TSDB blocks transfer because of persistent volumes 'ingester.max-transfer-retries': 0, 'ingester.join-after': '0s', @@ -196,6 +201,7 @@ $._config.grpcConfig + $._config.storageConfig + $._config.blocksStorageConfig + + $._config.queryBlocksStorageConfig + { target: 'store-gateway', 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', From 6f12977fcec153995f9c57a56c3f5e1e24b04f8c Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Mon, 8 Feb 2021 15:26:25 -0500 Subject: [PATCH 123/192] feat: allow for Alertmanager to configure multiple storage backends Signed-off-by: Jacob Lisi --- operations/mimir/alertmanager.libsonnet | 4 +--- operations/mimir/config.libsonnet | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 76c069da35d..b1fa5551869 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -18,14 +18,12 @@ alertmanager_args:: $._config.grpcConfig + + $._config.alertmanagerClientConfig + { target: 'alertmanager', 'log.level': 'debug', - 'experimental.alertmanager.enable-api': 'true', - 'alertmanager.storage.type': 'gcs', 'alertmanager.storage.path': '/data', - 'alertmanager.storage.gcs.bucketname': '%(cluster)s-cortex-%(namespace)s' % $._config, 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, } + if hasFallbackConfig then { 'alertmanager.configs.fallback': '/configs/alertmanager_fallback_config.yaml', diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 65498121bea..cffe32bd537 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -283,6 +283,29 @@ fallback_config: {}, }, + alertmanager_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3, local)', + alertmanager_s3_bucket_name: 
$._config.s3_bucket_name, + alertmanager_gcs_bucket_name: error 'must specify a GCS bucket name', + + alertmanagerClientConfig: + { + 'alertmanager.storage.type': $._config.alertmanager_client_type, + } + + { + configdb: { + configs_api_url: 'config.%s.svc.cluster.local' % $._config.namespace, + }, + gcs: { + 'alertmanager.storage.gcs.bucketname': $._config.alertmanager_gcs_bucket_name, + }, + s3: { + 'alertmanager.storage.s3.url': 'https://%s/%s' % [$._config.aws_region, $._config.alertmanager_s3_bucket_name], + }, + 'local': { + 'alertmanager.storage.local.directory': $._config.alertmanager_local_directory, + }, + }[$._config.alertmanager_client_type], + // === Per-tenant usage limits. === // // These are the defaults. From e961968dd6001303c5ea8413f03f1a43e4290493 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 23 Feb 2021 10:56:12 -0500 Subject: [PATCH 124/192] Update cortex/config.libsonnet Co-authored-by: gotjosh --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index cffe32bd537..e8e55f4f9e1 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -287,7 +287,7 @@ alertmanager_s3_bucket_name: $._config.s3_bucket_name, alertmanager_gcs_bucket_name: error 'must specify a GCS bucket name', - alertmanagerClientConfig: + alertmanagerStorageClientConfig: { 'alertmanager.storage.type': $._config.alertmanager_client_type, } + From 8268f43960420438c3a382aa30ddbc4e858da9ca Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 23 Feb 2021 10:56:16 -0500 Subject: [PATCH 125/192] Update cortex/alertmanager.libsonnet Co-authored-by: gotjosh --- operations/mimir/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index b1fa5551869..af2c0ec0003 100644 --- a/operations/mimir/alertmanager.libsonnet +++ 
b/operations/mimir/alertmanager.libsonnet @@ -18,7 +18,7 @@ alertmanager_args:: $._config.grpcConfig + - $._config.alertmanagerClientConfig + + $._config. alertmanagerStorageClientConfig + { target: 'alertmanager', 'log.level': 'debug', From b2de4ffeff1192d1c83b401ac7417efee2982067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 24 Feb 2021 16:45:46 +0100 Subject: [PATCH 126/192] Release 1.7.0. (https://github.com/grafana/cortex-jsonnet/pull/260) * Release 1.7.0. --- operations/mimir/images.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 9714b3022fd..77088f06a37 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.6.0', + cortex: 'cortexproject/cortex:v1.7.0', alertmanager: self.cortex, distributor: self.cortex, @@ -19,7 +19,7 @@ store_gateway: self.cortex, cortex_tools: 'grafana/cortex-tools:v0.4.0', - query_tee: 'quay.io/cortexproject/query-tee:master-5d7b05c3', - testExporter: 'cortexproject/test-exporter:master-be013707', + query_tee: 'quay.io/cortexproject/query-tee:v1.7.0', + testExporter: 'cortexproject/test-exporter:v1.7.0', }, } From 72aeada61bb56591dfc462b9fb7d8358d3307eff Mon Sep 17 00:00:00 2001 From: Alex Martin Date: Tue, 2 Mar 2021 12:49:52 -0600 Subject: [PATCH 127/192] cortex: config: Fix error message for alertmanager_client_type. 
--- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index e8e55f4f9e1..57e34c48b25 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -283,7 +283,7 @@ fallback_config: {}, }, - alertmanager_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3, local)', + alertmanager_client_type: error 'you must specify a storage backend type for the alertmanager (azure, configdb, gcs, s3, local)', alertmanager_s3_bucket_name: $._config.s3_bucket_name, alertmanager_gcs_bucket_name: error 'must specify a GCS bucket name', From 00dee47592c2eb67be6b7b34d0bba6db7713a415 Mon Sep 17 00:00:00 2001 From: Alex Martin Date: Tue, 2 Mar 2021 13:01:47 -0600 Subject: [PATCH 128/192] cortex: alertmanager: Remove space in dot notation. --- operations/mimir/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index af2c0ec0003..618d2f11a00 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -18,7 +18,7 @@ alertmanager_args:: $._config.grpcConfig + - $._config. 
alertmanagerStorageClientConfig + + $._config.alertmanagerStorageClientConfig + { target: 'alertmanager', 'log.level': 'debug', From e99bd2c44c7cf61723840247f2a2cdff1eafbeee Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Wed, 3 Mar 2021 15:30:19 +0100 Subject: [PATCH 129/192] Up metadata connection limits --- operations/mimir/memcached.libsonnet | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet index a7600c33719..e303f51f9b0 100644 --- a/operations/mimir/memcached.libsonnet +++ b/operations/mimir/memcached.libsonnet @@ -54,12 +54,9 @@ memcached { // Save memory by more tightly provisioning memcached chunks. memory_limit_mb: 6 * 1024, overprovision_factor: 1.05, + connection_limit: 4096, local container = $.core.v1.container, - - // Raise connection limits now our clusters are bigger. - memcached_container+:: - container.withArgsMixin(['-c 4096']), } else {}, @@ -68,6 +65,7 @@ memcached { $.memcached { name: 'memcached-metadata', max_item_size: '%dm' % [$._config.memcached_metadata_max_item_size_mb], + connection_limit: 4096, // Metadata cache doesn't need much memory. memory_limit_mb: 512, From b5b73a46fdcd7ca0ef4cfb2e46afc19b8dae1289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 17 Mar 2021 11:04:26 +0100 Subject: [PATCH 130/192] Add flag to enable streaming of chunks. 
(https://github.com/grafana/cortex-jsonnet/pull/276) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- operations/mimir/config.libsonnet | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 57e34c48b25..14fc54f9b62 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -433,6 +433,9 @@ enable_pod_priorities: true, alertmanager_enabled: false, + + // Enables streaming of chunks from ingesters using blocks. + ingester_stream_chunks_when_using_blocks: true, }, local configMap = $.core.v1.configMap, @@ -441,11 +444,9 @@ configMap.new($._config.overrides_configmap) + configMap.withData({ 'overrides.yaml': $.util.manifestYaml( - { - overrides: $._config.overrides, - } + if std.length($._config.multi_kv_config) > 0 then { - multi_kv_config: $._config.multi_kv_config, - } else {} + { overrides: $._config.overrides } + + (if std.length($._config.multi_kv_config) > 0 then { multi_kv_config: $._config.multi_kv_config } else {}) + + (if $._config.ingester_stream_chunks_when_using_blocks then { ingester_stream_chunks_when_using_blocks: true } else {}) ), }), From 892ecb2e70bc935b13e08696782293fe20b22c45 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 18 Mar 2021 12:58:12 +0000 Subject: [PATCH 131/192] Add recording rules to calculate Cortex scaling - Update dashboard so it only shows under provisioned services and why - Add sizing rules based on limits. - Add some docs to the dashboard. 
Signed-off-by: Tom Wilkie --- operations/mimir/ingester.libsonnet | 2 +- operations/mimir/query-frontend.libsonnet | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index a0b6ac7dcb8..e11bee0ce6f 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -43,7 +43,7 @@ ), ingester_statefulset_args:: - $._config.grpcConfig + + $._config.grpcConfig { 'ingester.wal-enabled': true, 'ingester.checkpoint-enabled': true, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 3386a312159..d64e205b7ca 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -2,7 +2,7 @@ local container = $.core.v1.container, query_frontend_args:: - $._config.grpcConfig + + $._config.grpcConfig { target: 'query-frontend', @@ -38,17 +38,17 @@ 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', } + ( if $._config.queryFrontend.sharded_queries_enabled then - { - 'querier.parallelise-shardable-queries': 'true', + { + 'querier.parallelise-shardable-queries': 'true', - // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. - // basically base * shard_factor * query_split_factor / num_frontends where - 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas), + // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. 
+ // basically base * shard_factor * query_split_factor / num_frontends where + 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas), - 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], - } + $._config.storageConfig - else {} - ), + 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], + } + $._config.storageConfig + else {} + ), query_frontend_container:: container.new('query-frontend', $._images.query_frontend) + From 37d2c67ad5253c2984fcf48ac17939d11d01c57e Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Thu, 25 Mar 2021 09:53:09 -0400 Subject: [PATCH 132/192] chore: update lib to use new API paths Signed-off-by: Jacob Lisi --- operations/mimir/test-exporter.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/test-exporter.libsonnet b/operations/mimir/test-exporter.libsonnet index 535686b88a3..9d69abee682 100644 --- a/operations/mimir/test-exporter.libsonnet +++ b/operations/mimir/test-exporter.libsonnet @@ -4,7 +4,7 @@ test_exporter_args:: { 'user-id': $._config.test_exporter_user_id, - 'prometheus-address': 'http://query-frontend.%(namespace)s.svc.cluster.local/api/prom' % $._config, + 'prometheus-address': 'http://query-frontend.%(namespace)s.svc.cluster.local/prometheus' % $._config, 'test-query-start': $._config.test_exporter_start_time, 'extra-selectors': 'job="%(namespace)s/test-exporter"' % $._config, 'test-query-min-size': '1m', From 1a7571b1ec9a588d1655d9997c1c403d473f03a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 25 Mar 2021 15:33:06 +0100 Subject: [PATCH 133/192] Create 1.8.0 release. (https://github.com/grafana/cortex-jsonnet/pull/282) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create 1.8.0 release. 
Signed-off-by: Peter Štibraný * Update image tags. Signed-off-by: Peter Štibraný --- operations/mimir/images.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 77088f06a37..7546fb6f4ef 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.7.0', + cortex: 'cortexproject/cortex:v1.8.0', alertmanager: self.cortex, distributor: self.cortex, @@ -19,7 +19,7 @@ store_gateway: self.cortex, cortex_tools: 'grafana/cortex-tools:v0.4.0', - query_tee: 'quay.io/cortexproject/query-tee:v1.7.0', - testExporter: 'cortexproject/test-exporter:v1.7.0', + query_tee: 'quay.io/cortexproject/query-tee:v1.8.0', + testExporter: 'cortexproject/test-exporter:v1.8.0', }, } From 505c4a3c73f37f3b41fa5583248a4ac903bc9f40 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 31 Mar 2021 15:21:26 +0200 Subject: [PATCH 134/192] Do not use deprecated Alertmanager cluster flags Signed-off-by: Marco Pracucci --- operations/mimir/alertmanager.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 618d2f11a00..832ef34d9d6 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -61,8 +61,8 @@ container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + if isHA then - ['--cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port] + - ['--cluster.peer=%s' % peer for peer in peers] + ['--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port] + + ['--alertmanager.cluster.peers=%s' % peer for peer in peers] else [], ) + container.withVolumeMountsMixin( From 642a4ff3921f0f3843ba1bcb84a853befe05005c Mon Sep 17 00:00:00 2001 From: Jack 
Baldry Date: Wed, 31 Mar 2021 16:08:00 +0100 Subject: [PATCH 135/192] fix: Update ksonnet-util vendor lock The previous version `c19a92e586a6752f11745b47f309b13f02ef7147` is incompatible with the library in its current form. For example in `tsdb.libsonnet` L81, we use `pvc.new('ingester-pvc')` but at the locked version, in `ksonnet-util/kausal.libsonnet` the `pvc.new` function takes no arguments. Signed-off-by: Jack Baldry --- operations/mimir/jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/jsonnetfile.lock.json b/operations/mimir/jsonnetfile.lock.json index 0c0e542e836..999706bb761 100644 --- a/operations/mimir/jsonnetfile.lock.json +++ b/operations/mimir/jsonnetfile.lock.json @@ -38,8 +38,8 @@ "subdir": "ksonnet-util" } }, - "version": "c19a92e586a6752f11745b47f309b13f02ef7147", - "sum": "LKsTTBcH8TXX5ANgRUu5I7Y1tf5le4nANFV3/W53I+c=" + "version": "8fa7669cc7b1b1822eb0220f2eda9c6aaa5c5119", + "sum": "/l/RofjusGrnNpJMD0ST+jDgtARyjvBP5vC7kEjPoQI=" }, { "source": { From 3a13fd94dfe4e8d17c58ab20195680c9938454a0 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Apr 2021 09:48:30 +0200 Subject: [PATCH 136/192] Add function to customize compactor statefulset Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 56ff249d86b..8fdc78fa945 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -173,13 +173,13 @@ $.util.readinessProbe + $.jaeger_mixin, - compactor_statefulset: - statefulSet.new('compactor', 1, [$.compactor_container], compactor_data_pvc) + - statefulSet.mixin.spec.withServiceName('compactor') + + newCompactorStatefulSet(name, container):: + statefulSet.new(name, 1, [container], compactor_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + 
statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: 'compactor' }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: 'compactor' }) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: 'compactor' }) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + @@ -189,6 +189,9 @@ // ready). statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), + compactor_statefulset: + $.newCompactorStatefulSet('compactor', $.compactor_container), + // The store-gateway runs a statefulset. local store_gateway_data_pvc = pvc.new() + From a1a1b827825ffc8b32c577fe484e0dd0f437ffd9 Mon Sep 17 00:00:00 2001 From: Victor Tsang Hi Date: Fri, 9 Apr 2021 08:03:13 -0400 Subject: [PATCH 137/192] Add querier_service_ignored_labels (https://github.com/grafana/cortex-jsonnet/pull/291) Co-authored-by: Victor Tsang Hi --- operations/mimir/querier.libsonnet | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 0f92b8039e8..5dc3c834aa6 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -61,6 +61,8 @@ local service = $.core.v1.service, + querier_service_ignored_labels:: [], + querier_service: - $.util.serviceFor($.querier_deployment), + $.util.serviceFor($.querier_deployment, $.querier_service_ignored_labels), } From a2c66cd3176430af32be224f3213bf07893b29d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 22 Apr 2021 11:18:06 +0200 Subject: [PATCH 138/192] Introduce ingester instance limits to 
configuration, and add alerts. (https://github.com/grafana/cortex-jsonnet/pull/296) * Introduce ingester instance limits to configuration, and add alerts. * CHANGELOG.md * Address (internal) review feedback. --- operations/mimir/config.libsonnet | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 14fc54f9b62..c4173bd062f 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -436,6 +436,15 @@ // Enables streaming of chunks from ingesters using blocks. ingester_stream_chunks_when_using_blocks: true, + + // Ingester limits are put directly into runtime config, if not null. Available limits: + // ingester_instance_limits: { + // max_inflight_push_requests: 0, // Max inflight push requests per ingester. 0 = no limit. + // max_ingestion_rate: 0, // Max ingestion rate (samples/second) per ingester. 0 = no limit. + // max_series: 0, // Max number of series per ingester. 0 = no limit. + // max_tenants: 0, // Max number of tenants per ingester. 0 = no limit. + // }, + ingester_instance_limits: null, }, local configMap = $.core.v1.configMap, @@ -447,6 +456,7 @@ { overrides: $._config.overrides } + (if std.length($._config.multi_kv_config) > 0 then { multi_kv_config: $._config.multi_kv_config } else {}) + (if $._config.ingester_stream_chunks_when_using_blocks then { ingester_stream_chunks_when_using_blocks: true } else {}) + + (if $._config.ingester_instance_limits != null then { ingester_limits: $._config.ingester_instance_limits } else {}), ), }), From 8384ea859f97ec1239f4d50979fdacf09c362c8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 22 Apr 2021 11:18:18 +0200 Subject: [PATCH 139/192] Add `query-scheduler.libsonnet` (https://github.com/grafana/cortex-jsonnet/pull/295) * Add query-scheduler.libsonnet. * CHANGELOG.md * Use flag to enable query-scheduler. * Fix image. 
--- operations/mimir/config.libsonnet | 3 ++ operations/mimir/cortex.libsonnet | 1 + operations/mimir/images.libsonnet | 1 + operations/mimir/query-scheduler.libsonnet | 52 ++++++++++++++++++++++ 4 files changed, 57 insertions(+) create mode 100644 operations/mimir/query-scheduler.libsonnet diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index c4173bd062f..8e455f17ef8 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -434,6 +434,9 @@ alertmanager_enabled: false, + // Enables query-scheduler component, and reconfigures querier and query-frontend to use it. + query_scheduler_enabled: false, + // Enables streaming of chunks from ingesters using blocks. ingester_stream_chunks_when_using_blocks: true, diff --git a/operations/mimir/cortex.libsonnet b/operations/mimir/cortex.libsonnet index 5341b7c0d50..b8716d19cc5 100644 --- a/operations/mimir/cortex.libsonnet +++ b/operations/mimir/cortex.libsonnet @@ -13,6 +13,7 @@ (import 'table-manager.libsonnet') + (import 'ruler.libsonnet') + (import 'alertmanager.libsonnet') + +(import 'query-scheduler.libsonnet') + // Supporting services (import 'etcd.libsonnet') + diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 7546fb6f4ef..a80baded0e0 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -17,6 +17,7 @@ flusher: self.cortex, ruler: self.cortex, store_gateway: self.cortex, + query_scheduler: self.cortex, cortex_tools: 'grafana/cortex-tools:v0.4.0', query_tee: 'quay.io/cortexproject/query-tee:v1.8.0', diff --git a/operations/mimir/query-scheduler.libsonnet b/operations/mimir/query-scheduler.libsonnet new file mode 100644 index 00000000000..531bb1c3057 --- /dev/null +++ b/operations/mimir/query-scheduler.libsonnet @@ -0,0 +1,52 @@ +// Query-scheduler is optional service. 
When query-scheduler.libsonnet is added to Cortex, querier and frontend +// are reconfigured to use query-scheduler service. +{ + local container = $.core.v1.container, + local deployment = $.apps.v1.deployment, + local service = $.core.v1.service, + + query_scheduler_args+:: + $._config.grpcConfig + { + target: 'query-scheduler', + 'log.level': 'debug', + 'query-scheduler.max-outstanding-requests-per-tenant': 100, + }, + + query_scheduler_container:: + container.new('query-scheduler', $._images.query_scheduler) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.query_scheduler_args)) + + $.jaeger_mixin + + $.util.readinessProbe + + $.util.resourcesRequests('2', '1Gi') + + $.util.resourcesLimits(null, '2Gi'), + + + query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else + deployment.new('query-scheduler', 2, [$.query_scheduler_container]) + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.antiAffinity, + + query_scheduler_service: if !$._config.query_scheduler_enabled then {} else + $.util.serviceFor($.query_scheduler_deployment), + + // Headless to make sure resolution gets IP address of target pods, and not service IP. + query_scheduler_discovery_service: if !$._config.query_scheduler_enabled then {} else + $.util.serviceFor($.query_scheduler_deployment) + + service.mixin.spec.withPublishNotReadyAddresses(true) + + service.mixin.spec.withClusterIp('None') + + service.mixin.metadata.withName('query-scheduler-discovery'), + + // Reconfigure querier and query-frontend to use scheduler. + querier_args+:: if !$._config.query_scheduler_enabled then {} else { + 'querier.worker-match-max-concurrent': 'true', + 'querier.worker-parallelism': null, // Disabled since we set worker-match-max-concurrent. 
+ 'querier.frontend-address': null, + 'querier.scheduler-address': 'query-scheduler-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, + }, + + query_frontend_args+:: if !$._config.query_scheduler_enabled then {} else { + 'frontend.scheduler-address': 'query-scheduler-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, + }, +} From 4283cd461760b12c1c9a51751b0df1724f6d4a94 Mon Sep 17 00:00:00 2001 From: Nick Pillitteri Date: Tue, 4 May 2021 14:00:56 -0400 Subject: [PATCH 140/192] Replace use of querier.compress-http-responses removed in Cortex 1.9 Signed-off-by: Nick Pillitteri --- operations/mimir/query-frontend.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index d64e205b7ca..d3dedb0ead8 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -26,9 +26,9 @@ // So that exporters like cloudwatch can still send in data and be un-cached. 'frontend.max-cache-freshness': '10m', - // Compress HTTP responses; improves latency for very big results and slow + // Use GZIP compression for API responses; improves latency for very big results and slow // connections. - 'querier.compress-http-responses': true, + 'api.response-compression-enabled': true, // So it can receive big responses from the querier. 
'server.grpc-max-recv-msg-size-bytes': 100 << 20, From 0a924bc7234c4db10e2fbd9f728fba9e6aac1378 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 12 May 2021 11:42:45 +0200 Subject: [PATCH 141/192] Enable index-header lazy loading in store-gateway Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 8fdc78fa945..b70902f841f 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -212,6 +212,10 @@ // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', + + // Block index-headers are pre-downloaded but lazy mmaped and loaded at query time. + 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', + 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config + From f7b54f2f5d6a2bf7ac17dd54933e1f050debb6e4 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 12 May 2021 11:50:52 +0200 Subject: [PATCH 142/192] Do not use deprecated/removed flag -limits.per-user-override-config Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 2 +- operations/mimir/distributor.libsonnet | 2 +- operations/mimir/ingester.libsonnet | 2 +- operations/mimir/query-frontend.libsonnet | 2 +- operations/mimir/tsdb.libsonnet | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 8e455f17ef8..15b5da3b216 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -198,7 +198,7 @@ // Shared between the Ruler and Querier queryConfig: { - 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + 'runtime-config.file': '/etc/cortex/overrides.yaml', // Limit the size 
of the rows we read from the index. 'store.cardinality-limit': 1e6, diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 19f66d8f84f..02c8767b51f 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -11,7 +11,7 @@ 'validation.reject-old-samples': true, 'validation.reject-old-samples.max-age': '12h', - 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + 'runtime-config.file': '/etc/cortex/overrides.yaml', 'distributor.remote-timeout': '20s', 'distributor.ha-tracker.enable': true, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index e11bee0ce6f..ba5c7e01dbe 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -30,7 +30,7 @@ 'ingester.max-global-series-per-metric': $._config.limits.max_global_series_per_metric, 'ingester.max-series-per-query': $._config.limits.max_series_per_query, 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, - 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + 'runtime-config.file': '/etc/cortex/overrides.yaml', 'server.grpc-max-concurrent-streams': 100000, } + ( if $._config.memcached_index_writes_enabled then diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index d3dedb0ead8..be1127ad961 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -35,7 +35,7 @@ // Limit queries to 500 days, allow this to be override per-user. 
'store.max-query-length': '12000h', // 500 Days - 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + 'runtime-config.file': '/etc/cortex/overrides.yaml', } + ( if $._config.queryFrontend.sharded_queries_enabled then { diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index b70902f841f..dbf005a738e 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -207,7 +207,7 @@ $._config.queryBlocksStorageConfig + { target: 'store-gateway', - 'limits.per-user-override-config': '/etc/cortex/overrides.yaml', + 'runtime-config.file': '/etc/cortex/overrides.yaml', // Persist ring tokens so that when the store-gateway will be restarted // it will pick the same tokens From 3db36ef0dd40451ba0f960dc344f485136b75062 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 14 May 2021 19:07:17 +0200 Subject: [PATCH 143/192] Use new ruler storage config and enable API compression Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 9 +++++---- operations/mimir/ruler.libsonnet | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 15b5da3b216..21913b75427 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -260,20 +260,21 @@ rulerClientConfig: { - 'ruler.storage.type': $._config.ruler_client_type, + 'ruler-storage.backend': $._config.ruler_client_type, } + { configdb: { configs_api_url: 'config.%s.svc.cluster.local' % $._config.namespace, }, gcs: { - 'ruler.storage.gcs.bucketname': $._config.ruler_gcs_bucket_name, + 'ruler-storage.gcs.bucket-name': $._config.ruler_gcs_bucket_name, }, s3: { - 'ruler.storage.s3.url': 'https://%s/%s' % [$._config.aws_region, $._config.ruler_s3_bucket_name], + 'ruler-storage.s3.region': $._config.aws_region, + 'ruler-storage.s3.bucket-name': $._config.ruler_s3_bucket_name, }, 'local': { - 'ruler.storage.local.directory': 
$._config.ruler_local_directory, + 'ruler-storage.local.directory': $._config.ruler_local_directory, }, }[$._config.ruler_client_type], diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index a394b88a8b7..6171da4fe46 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -15,6 +15,7 @@ // Alertmanager configs 'ruler.alertmanager-url': 'http://alertmanager.%s.svc.cluster.local/alertmanager' % $._config.namespace, 'experimental.ruler.enable-api': true, + 'api.response-compression-enabled': true, // Ring Configs 'ruler.enable-sharding': true, From 594070b49c57f24817d0b703e1fcce053ed08f4d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 17 May 2021 12:42:38 +0200 Subject: [PATCH 144/192] Changed alertmanager config to use the new storage config Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 21913b75427..b53b54abcb3 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -253,9 +253,8 @@ }, ruler_enabled: false, - ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, configdb, gcs, s3, local)', - // TODO: Generic client generating functions would be nice. 
- ruler_s3_bucket_name: $._config.s3_bucket_name, + ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, gcs, s3, local)', + ruler_s3_bucket_name: error 'you must specify the ruler S3 bucket name', ruler_gcs_bucket_name: error 'must specify a GCS bucket name', rulerClientConfig: @@ -263,9 +262,6 @@ 'ruler-storage.backend': $._config.ruler_client_type, } + { - configdb: { - configs_api_url: 'config.%s.svc.cluster.local' % $._config.namespace, - }, gcs: { 'ruler-storage.gcs.bucket-name': $._config.ruler_gcs_bucket_name, }, @@ -284,26 +280,24 @@ fallback_config: {}, }, - alertmanager_client_type: error 'you must specify a storage backend type for the alertmanager (azure, configdb, gcs, s3, local)', - alertmanager_s3_bucket_name: $._config.s3_bucket_name, + alertmanager_client_type: error 'you must specify a storage backend type for the alertmanager (azure, gcs, s3, local)', + alertmanager_s3_bucket_name: error 'you must specify the alertmanager S3 bucket name', alertmanager_gcs_bucket_name: error 'must specify a GCS bucket name', alertmanagerStorageClientConfig: { - 'alertmanager.storage.type': $._config.alertmanager_client_type, + 'alertmanager-storage.backend': $._config.alertmanager_client_type, } + { - configdb: { - configs_api_url: 'config.%s.svc.cluster.local' % $._config.namespace, - }, gcs: { - 'alertmanager.storage.gcs.bucketname': $._config.alertmanager_gcs_bucket_name, + 'alertmanager-storage.gcs.bucket-name': $._config.alertmanager_gcs_bucket_name, }, s3: { - 'alertmanager.storage.s3.url': 'https://%s/%s' % [$._config.aws_region, $._config.alertmanager_s3_bucket_name], + 'alertmanager-storage.s3.region': $._config.aws_region, + 'alertmanager-storage.s3.bucket-name': $._config.alertmanager_s3_bucket_name, }, 'local': { - 'alertmanager.storage.local.directory': $._config.alertmanager_local_directory, + 'alertmanager-storage.local.path': $._config.alertmanager_local_directory, }, }[$._config.alertmanager_client_type], From 
4257612bf05cd2de9a72deab9f53bc5c15bea577 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Tue, 18 May 2021 13:04:34 +0200 Subject: [PATCH 145/192] Cut release 1.9.0 Signed-off-by: Goutham Veeramachaneni --- operations/mimir/images.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index a80baded0e0..7d873831afa 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.8.0', + cortex: 'cortexproject/cortex:v1.9.0', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, cortex_tools: 'grafana/cortex-tools:v0.4.0', - query_tee: 'quay.io/cortexproject/query-tee:v1.8.0', - testExporter: 'cortexproject/test-exporter:v1.8.0', + query_tee: 'quay.io/cortexproject/query-tee:v1.9.0', + testExporter: 'cortexproject/test-exporter:v1.9.0', }, } From 93cadfabbce74195ca935cb5a88ea184bbab3ea6 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 27 May 2021 14:51:23 +0200 Subject: [PATCH 146/192] Mount overrides configmap to alertmanager too Signed-off-by: Marco Pracucci --- operations/mimir/alertmanager.libsonnet | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 832ef34d9d6..6c5e82ccb1b 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -22,6 +22,7 @@ { target: 'alertmanager', 'log.level': 'debug', + 'runtime-config.file': '/etc/cortex/overrides.yaml', 'experimental.alertmanager.enable-api': 'true', 'alertmanager.storage.path': '/data', 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, @@ -87,6 +88,7 @@ statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + 
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + statefulSet.mixin.spec.template.spec.withVolumesMixin( if hasFallbackConfig then [volume.fromConfigMap('alertmanager-fallback-config', 'alertmanager-fallback-config')] From 48d5ef19a9785326279bdc96fa6d089b8c6039d5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 27 May 2021 15:31:09 +0200 Subject: [PATCH 147/192] Upgrade memcached Signed-off-by: Marco Pracucci --- operations/mimir/images.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet index 7d873831afa..87a9dc61ace 100644 --- a/operations/mimir/images.libsonnet +++ b/operations/mimir/images.libsonnet @@ -1,7 +1,7 @@ { _images+:: { // Various third-party images. - memcached: 'memcached:1.5.17-alpine', + memcached: 'memcached:1.6.9-alpine', memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. From f3d45f84bf5cf0e2ccf736a3feaf234bb745a988 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 8 Jun 2021 09:48:34 +0200 Subject: [PATCH 148/192] Increase default store-gateway memory request and limit Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index dbf005a738e..6c394a0794c 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -216,6 +216,8 @@ // Block index-headers are pre-downloaded but lazy mmaped and loaded at query time. 
'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', + + 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config + @@ -228,8 +230,8 @@ container.withPorts($.store_gateway_ports) + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + - $.util.resourcesRequests('1', '6Gi') + - $.util.resourcesLimits(null, '6Gi') + + k.util.resourcesRequests('1', '12Gi') + + k.util.resourcesLimits(null, '18Gi') + $.util.readinessProbe + $.jaeger_mixin, From 137bff246de7d84822dbddf12f9d784c780d3cc7 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 8 Jun 2021 09:51:53 +0200 Subject: [PATCH 149/192] Fix Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 6c394a0794c..06ddc20ab79 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -230,8 +230,8 @@ container.withPorts($.store_gateway_ports) + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + - k.util.resourcesRequests('1', '12Gi') + - k.util.resourcesLimits(null, '18Gi') + + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '18Gi') + $.util.readinessProbe + $.jaeger_mixin, From d00e8b42f76995a6429b841738c2c6e54583e858 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Mon, 14 Jun 2021 09:28:24 +0200 Subject: [PATCH 150/192] Set -server.grpc-max-*-msg-size-bytes for ruler and ingester. 
(https://github.com/grafana/cortex-jsonnet/pull/326) --- operations/mimir/ingester.libsonnet | 2 ++ operations/mimir/ruler.libsonnet | 2 ++ 2 files changed, 4 insertions(+) diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index ba5c7e01dbe..34b4d9872e2 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -32,6 +32,8 @@ 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, 'runtime-config.file': '/etc/cortex/overrides.yaml', 'server.grpc-max-concurrent-streams': 100000, + 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, + 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, } + ( if $._config.memcached_index_writes_enabled then { diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 6171da4fe46..73029a20360 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -24,6 +24,8 @@ // Limits 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, 'ruler.max-rule-groups-per-tenant': $._config.limits.ruler_max_rule_groups_per_tenant, + 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, + 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, // Storage 'querier.second-store-engine': $._config.querier_second_storage_engine, From 95b2e615366e25d0e8b0a003bc87828aed9e6571 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 16 Jun 2021 15:16:22 +0200 Subject: [PATCH 151/192] Fixed --alertmanager.cluster.peers Signed-off-by: Marco Pracucci --- operations/mimir/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 6c5e82ccb1b..5156cc539d3 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -63,7 +63,7 @@ $.util.mapToFlags($.alertmanager_args) + if isHA then 
['--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port] + - ['--alertmanager.cluster.peers=%s' % peer for peer in peers] + ['--alertmanager.cluster.peers=%s' % std.join(',', peers)] else [], ) + container.withVolumeMountsMixin( From 9ee775c9d85a886db1888a632a9b70349c023411 Mon Sep 17 00:00:00 2001 From: Hamish Date: Fri, 2 Jul 2021 16:44:13 +1200 Subject: [PATCH 152/192] Set empty alertmanager listen address with 1 replica Alertmanager tries to start clustering unless the flag is explicitly set as an empty string https://github.com/prometheus/alertmanager#turn-off-high-availability --- operations/mimir/alertmanager.libsonnet | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 5156cc539d3..9566b0e9355 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -26,9 +26,16 @@ 'experimental.alertmanager.enable-api': 'true', 'alertmanager.storage.path': '/data', 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, - } + if hasFallbackConfig then { - 'alertmanager.configs.fallback': '/configs/alertmanager_fallback_config.yaml', - } else {}, + } + + (if hasFallbackConfig then { + 'alertmanager.configs.fallback': '/configs/alertmanager_fallback_config.yaml', + } else {}) + + (if isHA then { + 'alertmanager.cluster.listen-address': '[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port, + 'alertmanager.cluster.peers': std.join(',', peers), + } else { + 'alertmanager.cluster.listen-address': '', + }), alertmanager_fallback_config_map: if hasFallbackConfig then @@ -59,13 +66,7 @@ else [], ) + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + - container.withArgsMixin( - $.util.mapToFlags($.alertmanager_args) + - if isHA then - ['--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % 
$._config.alertmanager.gossip_port] + - ['--alertmanager.cluster.peers=%s' % std.join(',', peers)] - else [], - ) + + container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + container.withVolumeMountsMixin( [volumeMount.new('alertmanager-data', '/data')] + if hasFallbackConfig then From e1285bd2c5486da775d9134c897bb4d6587d44d4 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 22 Jul 2021 15:16:41 +0200 Subject: [PATCH 153/192] Add option to disable anti-affinity in newIngesterStatefulSet() Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 06ddc20ab79..5c69bf004fc 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -101,7 +101,7 @@ 'ingester.tokens-file-path': '/data/tokens', }, - newIngesterStatefulSet(name, container):: + newIngesterStatefulSet(name, container, with_anti_affinity=true):: statefulSet.new(name, 3, [ container + $.core.v1.container.withVolumeMountsMixin([ volumeMount.new('ingester-data', '/data'), @@ -119,12 +119,12 @@ statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.podPriority('high') + - $.util.antiAffinity + // Parallelly scale up/down ingester instances instead of starting them // one by one. This does NOT affect rolling updates: they will continue to be // rolled out one by one (the next pod will be rolled out once the previous is // ready). 
- statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + (if with_anti_affinity then $.util.antiAffinity else {}), ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container), From fc6ce7c64631511de824fb0f656b56324dabbfa5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 22 Jul 2021 15:58:33 +0200 Subject: [PATCH 154/192] Fix alertmanager config change introduced in https://github.com/grafana/cortex-jsonnet/pull/344 Signed-off-by: Marco Pracucci --- operations/mimir/alertmanager.libsonnet | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index 9566b0e9355..ece40ec6acc 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -29,13 +29,7 @@ } + (if hasFallbackConfig then { 'alertmanager.configs.fallback': '/configs/alertmanager_fallback_config.yaml', - } else {}) + - (if isHA then { - 'alertmanager.cluster.listen-address': '[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port, - 'alertmanager.cluster.peers': std.join(',', peers), - } else { - 'alertmanager.cluster.listen-address': '', - }), + } else {}), alertmanager_fallback_config_map: if hasFallbackConfig then @@ -66,7 +60,16 @@ else [], ) + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + - container.withArgsMixin($.util.mapToFlags($.alertmanager_args)) + + container.withArgsMixin( + $.util.mapToFlags($.alertmanager_args) + + ( + if isHA then + ['--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port] + + ['--alertmanager.cluster.peers=%s' % std.join(',', peers)] + else + ['-alertmanager.cluster.listen-address=""'] + ) + ) + container.withVolumeMountsMixin( [volumeMount.new('alertmanager-data', '/data')] + if hasFallbackConfig then From 9f114e75b229f8ee78f2dfc8c5ebd6d7bdc39b9d Mon 
Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Wed, 28 Jul 2021 10:54:11 +0200 Subject: [PATCH 155/192] Create another tier with 300K active series The other tiers have a 3x jump except when we go from 100K to 1Mil. I think we should have a 3x jump for the first tier too. Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index b53b54abcb3..57fb7a2db30 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -328,6 +328,25 @@ ruler_max_rule_groups_per_tenant: 20, }, + medium_small_user:: { + max_series_per_user: 0, // Disabled in favour of the max global limit + max_series_per_metric: 0, // Disabled in favour of the max global limit + + // Our limit should be 100k, but we need some room of about ~50% to take rollouts into account + max_global_series_per_user: 300000, + max_global_series_per_metric: 30000, + + max_series_per_query: 100000, + max_samples_per_query: 1000000, + + ingestion_rate: 30000, + ingestion_burst_size: 300000, + + // 375 rules + ruler_max_rules_per_rule_group: 15, + ruler_max_rule_groups_per_tenant: 25, + }, + small_user:: { max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit From c08f7f64b8aa5ac8317da060f41610649e3c9e9d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 13 Aug 2021 08:54:16 +0200 Subject: [PATCH 156/192] Improve config settings based on recent learnings Signed-off-by: Marco Pracucci --- operations/mimir/ingester.libsonnet | 2 +- operations/mimir/querier.libsonnet | 4 ++++ operations/mimir/ruler.libsonnet | 4 ++++ operations/mimir/tsdb.libsonnet | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index 34b4d9872e2..e0753d843ee 100644 --- 
a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -31,7 +31,7 @@ 'ingester.max-series-per-query': $._config.limits.max_series_per_query, 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, 'runtime-config.file': '/etc/cortex/overrides.yaml', - 'server.grpc-max-concurrent-streams': 100000, + 'server.grpc-max-concurrent-streams': 10000, 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, } + ( diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 5dc3c834aa6..574358d8b0a 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -26,6 +26,10 @@ 'querier.second-store-engine': $._config.querier_second_storage_engine, + // We request high memory but the Go heap is typically very low (< 100MB) and this causes + // the GC to trigger continuously. Setting a ballast of 256MB reduces GC. + 'mem-ballast-size-bytes': 1 << 28, // 256M + 'log.level': 'debug', }, diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 73029a20360..dfb5727e9e7 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -29,6 +29,10 @@ // Storage 'querier.second-store-engine': $._config.querier_second_storage_engine, + + // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" + // is set to false. + 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, }, ruler_container:: diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 5c69bf004fc..1c77abd99e3 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -115,7 +115,7 @@ statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + // When the ingester needs to flush blocks to the storage, it may take quite a lot of time. 
// For this reason, we grant an high termination period (80 minutes). - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.podPriority('high') + From 9e7813082543e550e270cf9f445fee5644b8dc7b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 23 Aug 2021 15:51:32 +0200 Subject: [PATCH 157/192] Added functions to create query-frontend and querier deployments Signed-off-by: Marco Pracucci --- operations/mimir/querier.libsonnet | 7 +++++-- operations/mimir/query-frontend.libsonnet | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 574358d8b0a..664c759bebe 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -57,12 +57,15 @@ querier_deployment_labels: {}, - querier_deployment: - deployment.new('querier', $._config.querier.replicas, [$.querier_container], $.querier_deployment_labels) + + newQuerierDeployment(name, container):: + deployment.new(name, $._config.querier.replicas, [container], $.querier_deployment_labels) + $.util.antiAffinity + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.storage_config_mixin, + querier_deployment: + self.newQuerierDeployment('querier', $.querier_container), + local service = $.core.v1.service, querier_service_ignored_labels:: [], diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index be1127ad961..add0788ae7d 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -68,8 +68,8 @@ local deployment = $.apps.v1.deployment, - query_frontend_deployment: - deployment.new('query-frontend', $._config.queryFrontend.replicas, 
[$.query_frontend_container]) + + newQueryFrontendDeployment(name, container):: + deployment.new(name, $._config.queryFrontend.replicas, [container]) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.antiAffinity + // inject storage schema in order to know what/how to shard @@ -77,6 +77,8 @@ $.storage_config_mixin else {}, + query_frontend_deployment: self.newQueryFrontendDeployment('query-frontend', $.query_frontend_container), + local service = $.core.v1.service, query_frontend_service: From 5b73ff7c18710fa3b8bcb47560256c32cf42d342 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 23 Aug 2021 17:32:48 +0200 Subject: [PATCH 158/192] Added function to create query-scheduler deployment Signed-off-by: Marco Pracucci --- operations/mimir/query-scheduler.libsonnet | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/operations/mimir/query-scheduler.libsonnet b/operations/mimir/query-scheduler.libsonnet index 531bb1c3057..130325e2d44 100644 --- a/operations/mimir/query-scheduler.libsonnet +++ b/operations/mimir/query-scheduler.libsonnet @@ -22,12 +22,14 @@ $.util.resourcesRequests('2', '1Gi') + $.util.resourcesLimits(null, '2Gi'), - - query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else - deployment.new('query-scheduler', 2, [$.query_scheduler_container]) + + newQuerySchedulerDeployment(name, container):: + deployment.new(name, 2, [container]) + $.util.configVolumeMount('overrides', '/etc/cortex') + $.util.antiAffinity, + query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else + self.newQuerySchedulerDeployment('query-scheduler', $.query_scheduler_container), + query_scheduler_service: if !$._config.query_scheduler_enabled then {} else $.util.serviceFor($.query_scheduler_deployment), From 41cff4b554ce3e836d530f703f754b7c832d2174 Mon Sep 17 00:00:00 2001 From: Javier Palomo Date: Mon, 1 Mar 2021 18:53:07 +0100 Subject: [PATCH 159/192] chore: upgrade to latest 
etcd-operator Brings: https://github.com/grafana/jsonnet-libs/pull/480 --- operations/mimir/jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/mimir/jsonnetfile.lock.json b/operations/mimir/jsonnetfile.lock.json index 999706bb761..3751cc20928 100644 --- a/operations/mimir/jsonnetfile.lock.json +++ b/operations/mimir/jsonnetfile.lock.json @@ -18,8 +18,8 @@ "subdir": "etcd-operator" } }, - "version": "c19a92e586a6752f11745b47f309b13f02ef7147", - "sum": "RbSlOsk0EBAMOfMOKPBdD0joHN6UKZqeP3zy9LjBQTE=" + "version": "815b0364886cc7bdf6bde2cdcd424bb8cef842b8", + "sum": "dnKsZ5FkKBtCycNVVSYa1AMNjCLofO4VGFrmzoz4344=" }, { "source": { From 7553b4beeec147a322b69b555f03d70503d48b10 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 25 Aug 2021 17:08:02 +0100 Subject: [PATCH 160/192] Alertmanager: Allow storage configuration to support Azure The alertmanager configuration did not have support for Azure. Let's add it. --- operations/mimir/config.libsonnet | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 57fb7a2db30..8a3507349d1 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -282,13 +282,20 @@ alertmanager_client_type: error 'you must specify a storage backend type for the alertmanager (azure, gcs, s3, local)', alertmanager_s3_bucket_name: error 'you must specify the alertmanager S3 bucket name', - alertmanager_gcs_bucket_name: error 'must specify a GCS bucket name', + alertmanager_gcs_bucket_name: error 'you must specify a GCS bucket name', + alertmanager_azure_container_name: error 'you must specify an Azure container name', + alertmanagerStorageClientConfig: { 'alertmanager-storage.backend': $._config.alertmanager_client_type, } + { + azure: { + 'alertmanager-storage.azure.account-key': $._config.alertmanager_azure_account_key, + 'alertmanager-storage.azure.account-name': 
$._config.alertmanager_azure_account_name, + 'alertmanager-storage.azure.container-name': $._config.alertmanager_azure_container_name, + }, gcs: { 'alertmanager-storage.gcs.bucket-name': $._config.alertmanager_gcs_bucket_name, }, From 608e9ea6cb014ff254a7c6390bfae2824219b352 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 25 Aug 2021 17:13:26 +0100 Subject: [PATCH 161/192] remove new line --- operations/mimir/config.libsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 8a3507349d1..488e11fe1e1 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -285,13 +285,12 @@ alertmanager_gcs_bucket_name: error 'you must specify a GCS bucket name', alertmanager_azure_container_name: error 'you must specify an Azure container name', - alertmanagerStorageClientConfig: { 'alertmanager-storage.backend': $._config.alertmanager_client_type, } + { - azure: { + azure: { 'alertmanager-storage.azure.account-key': $._config.alertmanager_azure_account_key, 'alertmanager-storage.azure.account-name': $._config.alertmanager_azure_account_name, 'alertmanager-storage.azure.container-name': $._config.alertmanager_azure_container_name, From 7c5d37032d5d2947364014efecb64c7dc504d108 Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Fri, 27 Aug 2021 11:02:36 +0200 Subject: [PATCH 162/192] Fix comment on medium_small_user config It says it should be 100k + 50%, but that's what extra_small_user is. Here we have 300k, which is 200k + 50%. 
Signed-off-by: Oleg Zaytsev --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 488e11fe1e1..962b9caa489 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -338,7 +338,7 @@ max_series_per_user: 0, // Disabled in favour of the max global limit max_series_per_metric: 0, // Disabled in favour of the max global limit - // Our limit should be 100k, but we need some room of about ~50% to take rollouts into account + // Our limit should be 200k, but we need some room of about ~50% to take rollouts into account max_global_series_per_user: 300000, max_global_series_per_metric: 30000, From 2e8b8e55c796c1ca2df82a1b35df74716c00de7d Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Fri, 27 Aug 2021 11:10:47 +0200 Subject: [PATCH 163/192] Remove wrong comment Signed-off-by: Oleg Zaytsev --- operations/mimir/config.libsonnet | 1 - 1 file changed, 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 962b9caa489..f4ad8310acb 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -338,7 +338,6 @@ max_series_per_user: 0, // Disabled in favour of the max global limit max_series_per_metric: 0, // Disabled in favour of the max global limit - // Our limit should be 200k, but we need some room of about ~50% to take rollouts into account max_global_series_per_user: 300000, max_global_series_per_metric: 30000, From 60c5a8b197cfcad6de27dc024851d24b1ed80116 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Thu, 2 Sep 2021 13:01:00 +0200 Subject: [PATCH 164/192] Add overrides to compactor Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 3 +++ operations/mimir/tsdb.libsonnet | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet 
b/operations/mimir/config.libsonnet index f4ad8310acb..3c4cf5453b6 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -332,6 +332,9 @@ // 300 rules ruler_max_rules_per_rule_group: 15, ruler_max_rule_groups_per_tenant: 20, + + // No retention for now. + compactor_blocks_retention_period: 0, }, medium_small_user:: { diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 1c77abd99e3..2a91f8b4d3b 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -159,6 +159,10 @@ 'compactor.ring.store': 'consul', 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'compactor.ring.prefix': '', + + // Limits config. + 'runtime-config.file': '/etc/cortex/overrides.yaml', + 'compactor.blocks-retention-period': $._config.limits.compactor_blocks_retention_period, }, compactor_ports:: $.util.defaultPorts, @@ -187,7 +191,8 @@ // one by one. This does NOT affect rolling updates: they will continue to be // rolled out one by one (the next pod will be rolled out once the previous is // ready). 
- statefulSet.mixin.spec.withPodManagementPolicy('Parallel'), + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), compactor_statefulset: $.newCompactorStatefulSet('compactor', $.compactor_container), From 9dfcdd0597f311a5496cc96d9f2453257442da34 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Mon, 13 Sep 2021 10:59:29 +0200 Subject: [PATCH 165/192] Split limits config into a variable we can reuse Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 23 +++++++++++++++++++++++ operations/mimir/distributor.libsonnet | 3 +-- operations/mimir/ingester.libsonnet | 7 +------ operations/mimir/ruler.libsonnet | 2 -- operations/mimir/tsdb.libsonnet | 2 +- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 3c4cf5453b6..2c27e9dc2ba 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -312,6 +312,29 @@ // These are the defaults. limits: $._config.overrides.extra_small_user, + // These are all the flags for the default limits. 
+ distributorLimitsConfig: { + 'distributor.ingestion-rate-limit': $._config.limits.ingestion_rate, + 'distributor.ingestion-burst-size': $._config.limits.ingestion_burst_size, + }, + ingesterLimitsConfig: { + 'ingester.max-series-per-user': $._config.limits.max_series_per_user, + 'ingester.max-series-per-metric': $._config.limits.max_series_per_metric, + 'ingester.max-global-series-per-user': $._config.limits.max_global_series_per_user, + 'ingester.max-global-series-per-metric': $._config.limits.max_global_series_per_metric, + 'ingester.max-series-per-query': $._config.limits.max_series_per_query, + 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, + }, + rulerLimitsConfig: { + 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, + 'ruler.max-rule-groups-per-tenant': $._config.limits.ruler_max_rule_groups_per_tenant, + }, + compactorLimitsConfig: { + 'compactor.blocks-retention-period': $._config.limits.compactor_blocks_retention_period, + }, + + limitsConfig: self.distributorLimitsConfig + self.ingesterLimitsConfig + self.rulerLimitsConfig + self.compactorLimitsConfig, + overrides_configmap: 'overrides', overrides: { diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 02c8767b51f..c9df411ce98 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -6,6 +6,7 @@ $._config.grpcConfig + $._config.ringConfig + $._config.distributorConfig + + $._config.distributorLimitsConfig + { target: 'distributor', @@ -30,8 +31,6 @@ 'server.grpc.keepalive.max-connection-idle': '1m', 'distributor.ingestion-rate-limit-strategy': 'global', - 'distributor.ingestion-rate-limit': $._config.limits.ingestion_rate, - 'distributor.ingestion-burst-size': $._config.limits.ingestion_burst_size, // The ingestion rate global limit requires the distributors to form a ring. 
'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet index e0753d843ee..3078db366bd 100644 --- a/operations/mimir/ingester.libsonnet +++ b/operations/mimir/ingester.libsonnet @@ -6,6 +6,7 @@ $._config.storageConfig + $._config.blocksStorageConfig + $._config.distributorConfig + // This adds the distributor ring flags to the ingester. + $._config.ingesterLimitsConfig + { target: 'ingester', @@ -24,12 +25,6 @@ // Limits config. 'ingester.max-chunk-idle': $._config.max_chunk_idle, - 'ingester.max-series-per-user': $._config.limits.max_series_per_user, - 'ingester.max-series-per-metric': $._config.limits.max_series_per_metric, - 'ingester.max-global-series-per-user': $._config.limits.max_global_series_per_user, - 'ingester.max-global-series-per-metric': $._config.limits.max_global_series_per_metric, - 'ingester.max-series-per-query': $._config.limits.max_series_per_query, - 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, 'runtime-config.file': '/etc/cortex/overrides.yaml', 'server.grpc-max-concurrent-streams': 10000, 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index dfb5727e9e7..c503defe08f 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -22,8 +22,6 @@ 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, // Limits - 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, - 'ruler.max-rule-groups-per-tenant': $._config.limits.ruler_max_rule_groups_per_tenant, 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 2a91f8b4d3b..1ce3c028563 100644 --- a/operations/mimir/tsdb.libsonnet +++ 
b/operations/mimir/tsdb.libsonnet @@ -144,6 +144,7 @@ $._config.grpcConfig + $._config.storageConfig + $._config.blocksStorageConfig + + $._config.compactorLimitsConfig + { target: 'compactor', @@ -162,7 +163,6 @@ // Limits config. 'runtime-config.file': '/etc/cortex/overrides.yaml', - 'compactor.blocks-retention-period': $._config.limits.compactor_blocks_retention_period, }, compactor_ports:: $.util.defaultPorts, From 2f255acb14417e9fc9d2d46c7bcf05b58cfc85ba Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Mon, 13 Sep 2021 15:17:00 +0200 Subject: [PATCH 166/192] Review feedback Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 1 + operations/mimir/distributor.libsonnet | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 2c27e9dc2ba..6369513cd63 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -314,6 +314,7 @@ // These are all the flags for the default limits. distributorLimitsConfig: { + 'distributor.ingestion-rate-limit-strategy': 'global', 'distributor.ingestion-rate-limit': $._config.limits.ingestion_rate, 'distributor.ingestion-burst-size': $._config.limits.ingestion_burst_size, }, diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index c9df411ce98..436c7fc8020 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -30,8 +30,6 @@ 'server.grpc.keepalive.max-connection-age-grace': '5m', 'server.grpc.keepalive.max-connection-idle': '1m', - 'distributor.ingestion-rate-limit-strategy': 'global', - // The ingestion rate global limit requires the distributors to form a ring. 
'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'distributor.ring.prefix': '', From ce5f18bf298cd71d5c690c63737fd0fa2184cef5 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Mon, 13 Sep 2021 15:31:10 +0200 Subject: [PATCH 167/192] Fix missing ruler limits Damn, missed this in https://github.com/grafana/cortex-jsonnet/pull/391 Signed-off-by: Goutham Veeramachaneni --- operations/mimir/ruler.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index c503defe08f..6fbed6bc9b1 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -10,6 +10,7 @@ $._config.queryConfig + $._config.distributorConfig + $._config.rulerClientConfig + + $._config.rulerLimitsConfig + { target: 'ruler', // Alertmanager configs From 67f6503a43544e0b2d4d8086d7660c058af44f0b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 15 Sep 2021 13:36:42 +0200 Subject: [PATCH 168/192] Alertmanager: Add sharding configuration. --- operations/mimir/alertmanager.libsonnet | 85 ++++++++++++++++--------- operations/mimir/config.libsonnet | 4 ++ 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet index ece40ec6acc..480112d3820 100644 --- a/operations/mimir/alertmanager.libsonnet +++ b/operations/mimir/alertmanager.libsonnet @@ -7,18 +7,62 @@ local service = $.core.v1.service, local configMap = $.core.v1.configMap, - local isHA = $._config.alertmanager.replicas > 1, + // The Alertmanager has three operational modes. + local haType = if $._config.alertmanager.sharding_enabled then + 'sharding' + else if $._config.alertmanager.replicas > 1 then + 'gossip_multi_replica' + else + 'gossip_single_replica', + // mode represents which operational mode the alertmanager runs in. + // ports: array of container ports used for gossiping. 
+ // args: arguments that are eventually converted to flags on the container + // flags: arguments directly added to the container. For legacy reasons, we need to use -- as a prefix for some flags. + // service: the service definition + local mode = { + sharding: { + ports: [], + args: { + 'alertmanager.sharding-enabled': true, + 'alertmanager.sharding-ring.store': $._config.alertmanager.ring_store, + 'alertmanager.sharding-ring.consul.hostname': $._config.alertmanager.ring_hostname, + 'alertmanager.sharding-ring.replication-factor': $._config.alertmanager.ring_replication_factor, + }, + flags: [], + service: + $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.spec.withClusterIp('None'), + }, + gossip_multi_replica: { + ports: [ + $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager.gossip_port), + $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager.gossip_port), + ], + args: {}, + flags: [ + '--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port, + '--alertmanager.cluster.peers=%s' % std.join(',', peers), + ], + service: + $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.spec.withClusterIp('None'), + }, + gossip_single_replica: { + ports: [], + args: {}, + flags: ['--alertmanager.cluster.listen-address=""'], + service: $.util.serviceFor($.alertmanager_statefulset), + }, + }[haType], local hasFallbackConfig = std.length($._config.alertmanager.fallback_config) > 0, - local peers = if isHA then - [ - 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] - for i in std.range(0, $._config.alertmanager.replicas - 1) - ] - else [], - + local peers = [ + 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] + for i in std.range(0, $._config.alertmanager.replicas - 1) + ], alertmanager_args:: $._config.grpcConfig + 
$._config.alertmanagerStorageClientConfig + + mode.args + { target: 'alertmanager', 'log.level': 'debug', @@ -51,24 +95,11 @@ alertmanager_container:: if $._config.alertmanager_enabled then container.new('alertmanager', $._images.alertmanager) + - container.withPorts( - $.util.defaultPorts + - if isHA then [ - $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager.gossip_port), - $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager.gossip_port), - ] - else [], - ) + + container.withPorts($.util.defaultPorts + mode.ports) + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + - ( - if isHA then - ['--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port] + - ['--alertmanager.cluster.peers=%s' % std.join(',', peers)] - else - ['-alertmanager.cluster.listen-address=""'] - ) + mode.flags ) + container.withVolumeMountsMixin( [volumeMount.new('alertmanager-data', '/data')] + @@ -101,11 +132,5 @@ else {}, alertmanager_service: - if $._config.alertmanager_enabled then - if isHA then - $.util.serviceFor($.alertmanager_statefulset) + - service.mixin.spec.withClusterIp('None') - else - $.util.serviceFor($.alertmanager_statefulset) - else {}, + if $._config.alertmanager_enabled then mode.service else {}, } diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 6369513cd63..12f3576195d 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -276,8 +276,12 @@ alertmanager: { replicas: 3, + sharding_enabled: false, gossip_port: 9094, fallback_config: {}, + ring_store: 'consul', + ring_hostname: 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + ring_replication_factor: $._config.replication_factor, }, alertmanager_client_type: error 'you must specify a storage backend type for the alertmanager (azure, gcs, s3, local)', From 
f098a10fbf883b35a7d7abaa6563e5df7823066d Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Wed, 15 Sep 2021 14:50:22 +0200 Subject: [PATCH 169/192] Fix `compactor_blocks_retention_period` type in `extra_small_user` (https://github.com/grafana/cortex-jsonnet/pull/395) * Fix `compactor_blocks_retention_period` type in `extra_small_user` The actual type of `compactor_blocks_retention_period` is `model.Duration`, which comes from the Prometheus `common` package. The problem is that `model.Duration` has a custom JSON unmarshaller which treats the incoming value as a string. https://github.com/prometheus/common/blob/main/model/time.go#L276 So setting it as an integer won't work when unmarshalling with JSON. NOTE: This won't be an issue for YamlUnmarshal, as it always treats the value as a string (even if you write it as an integer) https://github.com/prometheus/common/blob/main/model/time.go#L307 * update CHANGELOG --- operations/mimir/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 6369513cd63..f52b38d954f 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -358,7 +358,7 @@ ruler_max_rule_groups_per_tenant: 20, // No retention for now. - compactor_blocks_retention_period: 0, + compactor_blocks_retention_period: '0', }, medium_small_user:: { From 00d8fdea54792a521506857ee187967cb9ec4af0 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Wed, 15 Sep 2021 15:43:18 +0200 Subject: [PATCH 170/192] Update rule limits to be inline with customer expectations We built the initial rules on guesswork and now we're updating them based on what the customers are asking for. Further, the ruler can be horizontally scaled and we're happy letting our users have more rules!
Signed-off-by: Goutham Veeramachaneni --- operations/mimir/config.libsonnet | 42 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index f52b38d954f..4746b301bae 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -353,9 +353,9 @@ ingestion_rate: 10000, ingestion_burst_size: 200000, - // 300 rules - ruler_max_rules_per_rule_group: 15, - ruler_max_rule_groups_per_tenant: 20, + // 700 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 35, // No retention for now. compactor_blocks_retention_period: '0', @@ -374,9 +374,9 @@ ingestion_rate: 30000, ingestion_burst_size: 300000, - // 375 rules - ruler_max_rules_per_rule_group: 15, - ruler_max_rule_groups_per_tenant: 25, + // 1000 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 50, }, small_user:: { @@ -392,9 +392,9 @@ ingestion_rate: 100000, ingestion_burst_size: 1000000, - // 450 rules - ruler_max_rules_per_rule_group: 15, - ruler_max_rule_groups_per_tenant: 30, + // 1400 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 70, }, medium_user:: { @@ -410,9 +410,9 @@ ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M - // 600 rules - ruler_max_rules_per_rule_group: 15, - ruler_max_rule_groups_per_tenant: 40, + // 1800 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 90, }, big_user:: { @@ -428,9 +428,9 @@ ingestion_rate: 700000, // 700K ingestion_burst_size: 7000000, // 7M - // 750 rules - ruler_max_rules_per_rule_group: 15, - ruler_max_rule_groups_per_tenant: 50, + // 2200 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 110, }, super_user:: { @@ -446,9 +446,9 @@ ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M - // 900 rules - ruler_max_rules_per_rule_group: 15, - 
ruler_max_rule_groups_per_tenant: 60, + // 2600 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 130, }, // This user class has limits increased by +50% compared to the previous one. @@ -465,9 +465,9 @@ ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M - // 1050 rules - ruler_max_rules_per_rule_group: 15, - ruler_max_rule_groups_per_tenant: 70, + // 3000 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 150, }, }, From a46a22cc7a4bd46a685d90ecdcbdfbe8e3fd7293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 16 Sep 2021 17:42:56 +0200 Subject: [PATCH 171/192] Remove max_samples_per_query limit. (https://github.com/grafana/cortex-jsonnet/pull/397) * Remove max_samples_per_query limit. * Fixed CHANGELOG.md --- operations/mimir/config.libsonnet | 8 -------- 1 file changed, 8 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index a86f86c0da0..5187b841b27 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -328,7 +328,6 @@ 'ingester.max-global-series-per-user': $._config.limits.max_global_series_per_user, 'ingester.max-global-series-per-metric': $._config.limits.max_global_series_per_metric, 'ingester.max-series-per-query': $._config.limits.max_series_per_query, - 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, }, rulerLimitsConfig: { 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, @@ -352,7 +351,6 @@ max_global_series_per_metric: 20000, max_series_per_query: 100000, - max_samples_per_query: 1000000, ingestion_rate: 10000, ingestion_burst_size: 200000, @@ -373,7 +371,6 @@ max_global_series_per_metric: 30000, max_series_per_query: 100000, - max_samples_per_query: 1000000, ingestion_rate: 30000, ingestion_burst_size: 300000, @@ -391,7 +388,6 @@ max_global_series_per_metric: 100000, max_series_per_query: 100000, - 
max_samples_per_query: 1000000, ingestion_rate: 100000, ingestion_burst_size: 1000000, @@ -409,7 +405,6 @@ max_global_series_per_metric: 300000, // 300K max_series_per_query: 100000, - max_samples_per_query: 1000000, ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M @@ -424,7 +419,6 @@ max_series_per_user: 0, // Disabled in favour of the max global limit max_series_per_query: 100000, - max_samples_per_query: 1000000, max_global_series_per_user: 6000000, // 6M max_global_series_per_metric: 600000, // 600K @@ -445,7 +439,6 @@ max_global_series_per_metric: 1200000, // 1.2M max_series_per_query: 100000, - max_samples_per_query: 1000000, ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M @@ -464,7 +457,6 @@ max_global_series_per_metric: 1600000, // 1.6M max_series_per_query: 100000, - max_samples_per_query: 1000000, ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M From f7cd61c4557449e4a0a6186b3e5ab654e319f11a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 20 Sep 2021 14:43:27 +0200 Subject: [PATCH 172/192] Removed chunks storage query sharding config support Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 19 ++------------ operations/mimir/querier.libsonnet | 8 ++---- operations/mimir/query-frontend.libsonnet | 31 +++-------------------- 3 files changed, 8 insertions(+), 50 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 5187b841b27..2342f151736 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -35,28 +35,13 @@ test_exporter_start_time: error 'must specify test exporter start time', test_exporter_user_id: error 'must specify test exporter used id', - // The expectation is that if sharding is enabled, we can force more (smaller) - // queries on the queriers. However this can't be extended too far because most queries - // concern recent (ingester) data, which isn't sharded. 
Therefore, we must strike a balance - // which allows us to process more sharded queries in parallel when requested, but not overload - // queriers during normal queries. querier: { - replicas: if $._config.queryFrontend.sharded_queries_enabled then 12 else 6, - concurrency: if $._config.queryFrontend.sharded_queries_enabled then 16 else 8, + replicas: 6, + concurrency: 8, }, queryFrontend: { replicas: 2, - shard_factor: 16, // v10 schema shard factor - sharded_queries_enabled: false, - // Queries can technically be sharded an arbitrary number of times. Thus query_split_factor is used - // as a coefficient to multiply the frontend tenant queues by. The idea is that this - // yields a bit of headroom so tenant queues aren't underprovisioned. Therefore the split factor - // should be represent the highest reasonable split factor for a query. If too low, a long query - // (i.e. 30d) with a high split factor (i.e. 5) would result in - // (day_splits * shard_factor * split_factor) or 30 * 16 * 5 = 2400 sharded queries, which may be - // more than the max queue size and thus would always error. 
- query_split_factor:: 3, }, jaeger_agent_host: null, diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 664c759bebe..a727383b9cc 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -46,12 +46,8 @@ $.jaeger_mixin + $.util.readinessProbe + container.withEnvMap($.querier_env_map) + - if $._config.queryFrontend.sharded_queries_enabled then - $.util.resourcesRequests('3', '12Gi') + - $.util.resourcesLimits(null, '24Gi') - else - $.util.resourcesRequests('1', '12Gi') + - $.util.resourcesLimits(null, '24Gi'), + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '24Gi'), local deployment = $.apps.v1.deployment, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index add0788ae7d..9c5f99d82d3 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -36,19 +36,7 @@ // Limit queries to 500 days, allow this to be override per-user. 'store.max-query-length': '12000h', // 500 Days 'runtime-config.file': '/etc/cortex/overrides.yaml', - } + ( - if $._config.queryFrontend.sharded_queries_enabled then - { - 'querier.parallelise-shardable-queries': 'true', - - // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate. 
- // basically base * shard_factor * query_split_factor / num_frontends where - 'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas), - - 'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'], - } + $._config.storageConfig - else {} - ), + }, query_frontend_container:: container.new('query-frontend', $._images.query_frontend) + @@ -56,26 +44,15 @@ container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + $.jaeger_mixin + $.util.readinessProbe + - if $._config.queryFrontend.sharded_queries_enabled then - $.util.resourcesRequests('2', '2Gi') + - $.util.resourcesLimits(null, '6Gi') + - container.withEnvMap({ - JAEGER_REPORTER_MAX_QUEUE_SIZE: '5000', - }) - else - $.util.resourcesRequests('2', '600Mi') + - $.util.resourcesLimits(null, '1200Mi'), + $.util.resourcesRequests('2', '600Mi') + + $.util.resourcesLimits(null, '1200Mi'), local deployment = $.apps.v1.deployment, newQueryFrontendDeployment(name, container):: deployment.new(name, $._config.queryFrontend.replicas, [container]) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.antiAffinity + - // inject storage schema in order to know what/how to shard - if $._config.queryFrontend.sharded_queries_enabled then - $.storage_config_mixin - else {}, + $.util.antiAffinity, query_frontend_deployment: self.newQueryFrontendDeployment('query-frontend', $.query_frontend_container), From 6739bd4de6427bce3d5b51ceb14fb1f717d58ce7 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 20 Sep 2021 14:59:00 +0200 Subject: [PATCH 173/192] Add queryEngineConfig Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 7 ++++++- operations/mimir/querier.libsonnet | 1 + operations/mimir/ruler.libsonnet | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet 
b/operations/mimir/config.libsonnet index 5187b841b27..fdb8bbb0abf 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -196,7 +196,7 @@ ) else {} ), - // Shared between the Ruler and Querier + // Querier component config (shared between the ruler and querier). queryConfig: { 'runtime-config.file': '/etc/cortex/overrides.yaml', @@ -239,6 +239,11 @@ else {} ), + // PromQL query engine config (shared between all services running PromQL engine, like the ruler and querier). + queryEngineConfig: { + // Keep it even if empty, to allow downstream projects to easily configure it. + }, + ringConfig: { 'consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'ring.prefix': '', diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 664c759bebe..2ec8d868577 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -8,6 +8,7 @@ $._config.storageConfig + $._config.blocksStorageConfig + $._config.queryConfig + + $._config.queryEngineConfig + $._config.distributorConfig + { target: 'querier', diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 6fbed6bc9b1..1e98282023d 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -8,6 +8,7 @@ $._config.storageConfig + $._config.blocksStorageConfig + $._config.queryConfig + + $._config.queryEngineConfig + $._config.distributorConfig + $._config.rulerClientConfig + $._config.rulerLimitsConfig + From d8a64ea20e0fd61978bf0306d7f8dd3c0240f2e5 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 2 Nov 2021 10:00:08 +0100 Subject: [PATCH 174/192] tsdb: Add multi concurrency and max idle connections store gateway params Signed-off-by: Arve Knudsen --- operations/mimir/tsdb.libsonnet | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 1ce3c028563..eb76fa2effd 100644 --- 
a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -222,7 +222,17 @@ 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', + local indexMaxConcurrency = 100, + local chunksMaxConcurrency = 100, + local metaMaxConcurrency = 100, 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': indexMaxConcurrency, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': chunksMaxConcurrency, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': metaMaxConcurrency, + 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': indexMaxConcurrency, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': chunksMaxConcurrency, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': metaMaxConcurrency, + } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config + From fa8552c4255bf1f95090987040604c88bb494fe9 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 2 Nov 2021 10:37:59 +0100 Subject: [PATCH 175/192] Update cortex/tsdb.libsonnet Co-authored-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index eb76fa2effd..508821794d9 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -229,9 +229,9 @@ 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': indexMaxConcurrency, 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': chunksMaxConcurrency, 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': metaMaxConcurrency, - 
'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': indexMaxConcurrency, - 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': chunksMaxConcurrency, - 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': metaMaxConcurrency, + 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], } + $.blocks_chunks_caching_config + From 163e6a797b5d153dfef5d29e17ef85d640070cf5 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 2 Nov 2021 10:39:23 +0100 Subject: [PATCH 176/192] Fix formatting Signed-off-by: Arve Knudsen --- operations/mimir/tsdb.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 508821794d9..cb489421653 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -229,9 +229,9 @@ 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': indexMaxConcurrency, 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': chunksMaxConcurrency, 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': metaMaxConcurrency, - 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': 
$.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], } + $.blocks_chunks_caching_config + From 75ae394b4a0d1be75dcac8259ef1f9ebfa59e2ee Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 2 Nov 2021 13:19:45 +0100 Subject: [PATCH 177/192] tsdb: Use literal numbers instead of variables Signed-off-by: Arve Knudsen --- operations/mimir/tsdb.libsonnet | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index cb489421653..491adb02c5c 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -222,17 +222,21 @@ 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', - local indexMaxConcurrency = 100, - local chunksMaxConcurrency = 100, - local metaMaxConcurrency = 100, 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, - 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': indexMaxConcurrency, - 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': chunksMaxConcurrency, - 
'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': metaMaxConcurrency, + + // We should keep a number of idle connections equal to the max "get" concurrency, + // in order to avoid re-opening connections continuously (this would be slower + // and fill up the conntrack table too). + // + // The downside of this approach is that we'll end up with an higher number of + // active connections to memcached, so we have to make sure connections limit + // set in memcached is high enough. + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], - } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config + From 9129be28250afca8d0f08afe5a5e6a1920108c77 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 9 Nov 2021 16:46:53 +0100 Subject: [PATCH 178/192] cortex: Make ruler object storage support generic Signed-off-by: Arve Knudsen --- operations/mimir/config.libsonnet | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 8dc473aad33..8d12ded8ff6 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -244,8 +244,7 @@ ruler_enabled: false, ruler_client_type: error 'you must 
specify a storage backend type for the ruler (azure, gcs, s3, local)', - ruler_s3_bucket_name: error 'you must specify the ruler S3 bucket name', - ruler_gcs_bucket_name: error 'must specify a GCS bucket name', + ruler_storage_bucket_name: error 'must specify the ruler storage bucket name', rulerClientConfig: { @@ -253,11 +252,15 @@ } + { gcs: { - 'ruler-storage.gcs.bucket-name': $._config.ruler_gcs_bucket_name, + 'ruler-storage.gcs.bucket-name': $._config.ruler_storage_bucket_name, }, s3: { 'ruler-storage.s3.region': $._config.aws_region, - 'ruler-storage.s3.bucket-name': $._config.ruler_s3_bucket_name, + 'ruler-storage.s3.bucket-name': $._config.ruler_storage_bucket_name, + }, + azure: { + // TODO: Is this the correct flag?? + 'ruler-storage.gcs.bucket-name': $._config.ruler_storage_bucket_name, }, 'local': { 'ruler-storage.local.directory': $._config.ruler_local_directory, From e3ad38200049169c593ce408b6d761be07cb4cea Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 9 Nov 2021 17:31:46 +0100 Subject: [PATCH 179/192] Remove ruler-storage.gcs.bucket-name for Azure Signed-off-by: Arve Knudsen --- operations/mimir/config.libsonnet | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 8d12ded8ff6..1761bed9004 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -258,10 +258,7 @@ 'ruler-storage.s3.region': $._config.aws_region, 'ruler-storage.s3.bucket-name': $._config.ruler_storage_bucket_name, }, - azure: { - // TODO: Is this the correct flag?? 
- 'ruler-storage.gcs.bucket-name': $._config.ruler_storage_bucket_name, - }, + azure: {}, 'local': { 'ruler-storage.local.directory': $._config.ruler_local_directory, }, From 4c565cef081e383454a820a394bdfca8119661cd Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 9 Nov 2021 18:09:08 +0100 Subject: [PATCH 180/192] cortex: Define Azure ruler args Signed-off-by: Arve Knudsen --- operations/mimir/config.libsonnet | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 1761bed9004..10020981ddf 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -258,7 +258,11 @@ 'ruler-storage.s3.region': $._config.aws_region, 'ruler-storage.s3.bucket-name': $._config.ruler_storage_bucket_name, }, - azure: {}, + azure: { + 'ruler-storage.azure.container-name': '%(cluster)s-%(namespace)s-ruler' % $._config, + 'ruler-storage.azure.account-name': '$(BLOCKS_STORAGE_AZURE_ACCOUNT_NAME)', + 'ruler-storage.azure.account-key': '$(BLOCKS_STORAGE_AZURE_ACCOUNT_KEY)', + }, 'local': { 'ruler-storage.local.directory': $._config.ruler_local_directory, }, From db807aaa94d687ee8dc95eadcbd9124992dbc773 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Wed, 10 Nov 2021 16:21:45 +0100 Subject: [PATCH 181/192] Parameterize Signed-off-by: Arve Knudsen --- operations/mimir/config.libsonnet | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 10020981ddf..5ce9bff7818 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -245,6 +245,8 @@ ruler_enabled: false, ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, gcs, s3, local)', ruler_storage_bucket_name: error 'must specify the ruler storage bucket name', + ruler_storage_azure_account_name: error 'must specify the ruler storage Azure account name', + 
ruler_storage_azure_account_key: error 'must specify the ruler storage Azure account key', rulerClientConfig: { @@ -259,9 +261,9 @@ 'ruler-storage.s3.bucket-name': $._config.ruler_storage_bucket_name, }, azure: { - 'ruler-storage.azure.container-name': '%(cluster)s-%(namespace)s-ruler' % $._config, - 'ruler-storage.azure.account-name': '$(BLOCKS_STORAGE_AZURE_ACCOUNT_NAME)', - 'ruler-storage.azure.account-key': '$(BLOCKS_STORAGE_AZURE_ACCOUNT_KEY)', + 'ruler-storage.azure.container-name': $._config.ruler_storage_bucket_name, + 'ruler-storage.azure.account-name': $._config.ruler_storage_azure_account_name, + 'ruler-storage.azure.account-key': $._config.ruler_storage_azure_account_key, }, 'local': { 'ruler-storage.local.directory': $._config.ruler_local_directory, From 13f31331f6b9f71968fe581241ff6a33f6308ee1 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Mon, 15 Nov 2021 14:34:15 +0100 Subject: [PATCH 182/192] Further document ingester_stream_chunks_when_using_blocks parameter Signed-off-by: Arve Knudsen --- operations/mimir/config.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 5ce9bff7818..a5cd7d2216b 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -476,6 +476,7 @@ query_scheduler_enabled: false, // Enables streaming of chunks from ingesters using blocks. + // Changing it will not cause new rollout of ingesters, as it gets passed to them via runtime-config. ingester_stream_chunks_when_using_blocks: true, // Ingester limits are put directly into runtime config, if not null. 
Available limits: From 944570cdb77276ce6e0c1669ca41f00c936d0f35 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 17 Nov 2021 15:50:09 +0100 Subject: [PATCH 183/192] Add options to disable anti-affinity Signed-off-by: Marco Pracucci --- operations/mimir/config.libsonnet | 6 ++++++ operations/mimir/distributor.libsonnet | 2 +- operations/mimir/querier.libsonnet | 2 +- operations/mimir/query-frontend.libsonnet | 2 +- operations/mimir/ruler.libsonnet | 2 +- 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index a5cd7d2216b..a1bff15454c 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -20,6 +20,12 @@ // 2. Ensure ingester ID is preserved during rollouts unregister_ingesters_on_shutdown: true, + // Controls whether multiple pods for the same service can be scheduled on the same node. + cortex_distributor_allow_multiple_replicas_on_same_node: false, + cortex_ruler_allow_multiple_replicas_on_same_node: false, + cortex_querier_allow_multiple_replicas_on_same_node: false, + cortex_query_frontend_allow_multiple_replicas_on_same_node: false, + // schema is used to generate the storage schema yaml file used by // the Cortex chunks storage: // - More information: https://github.com/cortexproject/cortex/pull/1072 diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index 436c7fc8020..ae1c9ffd900 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -56,7 +56,7 @@ distributor_deployment: deployment.new('distributor', 3, [$.distributor_container], $.distributor_deployment_labels) + - $.util.antiAffinity + + (if $._config.cortex_distributor_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), local service = $.core.v1.service, diff --git a/operations/mimir/querier.libsonnet 
b/operations/mimir/querier.libsonnet index ce34e9e11da..0256a91f193 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -56,7 +56,7 @@ newQuerierDeployment(name, container):: deployment.new(name, $._config.querier.replicas, [container], $.querier_deployment_labels) + - $.util.antiAffinity + + (if $._config.cortex_querier_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.storage_config_mixin, diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 9c5f99d82d3..7cc5a8cfc3c 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -52,7 +52,7 @@ newQueryFrontendDeployment(name, container):: deployment.new(name, $._config.queryFrontend.replicas, [container]) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.antiAffinity, + (if $._config.cortex_query_frontend_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity), query_frontend_deployment: self.newQueryFrontendDeployment('query-frontend', $.query_frontend_container), diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet index 1e98282023d..a7df54fd54f 100644 --- a/operations/mimir/ruler.libsonnet +++ b/operations/mimir/ruler.libsonnet @@ -54,7 +54,7 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + - $.util.antiAffinity + + (if $._config.cortex_ruler_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.storage_config_mixin else {}, From e7759ad42e444055f7c977be91dfc7ecd4f784b8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 22 Nov 2021 11:03:39 +0100 
Subject: [PATCH 184/192] Upstream some config improvements Signed-off-by: Marco Pracucci --- operations/mimir/distributor.libsonnet | 4 +++- operations/mimir/memcached.libsonnet | 2 +- operations/mimir/querier.libsonnet | 2 ++ operations/mimir/query-frontend.libsonnet | 6 ++++-- operations/mimir/query-scheduler.libsonnet | 5 ++++- operations/mimir/tsdb.libsonnet | 5 +++-- 6 files changed, 17 insertions(+), 7 deletions(-) diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet index ae1c9ffd900..ea22523e6fb 100644 --- a/operations/mimir/distributor.libsonnet +++ b/operations/mimir/distributor.libsonnet @@ -57,7 +57,9 @@ distributor_deployment: deployment.new('distributor', 3, [$.distributor_container], $.distributor_deployment_labels) + (if $._config.cortex_distributor_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), local service = $.core.v1.service, diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet index e303f51f9b0..6fecb9ffbd9 100644 --- a/operations/mimir/memcached.libsonnet +++ b/operations/mimir/memcached.libsonnet @@ -65,7 +65,7 @@ memcached { $.memcached { name: 'memcached-metadata', max_item_size: '%dm' % [$._config.memcached_metadata_max_item_size_mb], - connection_limit: 4096, + connection_limit: 16384, // Metadata cache doesn't need much memory. 
memory_limit_mb: 512, diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet index 0256a91f193..eb807ee28be 100644 --- a/operations/mimir/querier.libsonnet +++ b/operations/mimir/querier.libsonnet @@ -58,6 +58,8 @@ deployment.new(name, $._config.querier.replicas, [container], $.querier_deployment_labels) + (if $._config.cortex_querier_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + $.storage_config_mixin, querier_deployment: diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet index 7cc5a8cfc3c..80f36d04736 100644 --- a/operations/mimir/query-frontend.libsonnet +++ b/operations/mimir/query-frontend.libsonnet @@ -17,7 +17,7 @@ 'querier.split-queries-by-interval': '24h', // Cache query results. 
- 'querier.align-querier-with-step': true, + 'querier.align-querier-with-step': false, 'querier.cache-results': true, 'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace, 'frontend.memcached.service': 'memcached-client', @@ -52,7 +52,9 @@ newQueryFrontendDeployment(name, container):: deployment.new(name, $._config.queryFrontend.replicas, [container]) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - (if $._config.cortex_query_frontend_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity), + (if $._config.cortex_query_frontend_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), query_frontend_deployment: self.newQueryFrontendDeployment('query-frontend', $.query_frontend_container), diff --git a/operations/mimir/query-scheduler.libsonnet b/operations/mimir/query-scheduler.libsonnet index 130325e2d44..604d258a6c5 100644 --- a/operations/mimir/query-scheduler.libsonnet +++ b/operations/mimir/query-scheduler.libsonnet @@ -25,7 +25,10 @@ newQuerySchedulerDeployment(name, container):: deployment.new(name, 2, [container]) + $.util.configVolumeMount('overrides', '/etc/cortex') + - $.util.antiAffinity, + $.util.antiAffinity + + // Do not run more query-schedulers than expected. 
+ deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else self.newQuerySchedulerDeployment('query-scheduler', $.query_scheduler_container), diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index 491adb02c5c..aa47082959e 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -172,8 +172,9 @@ container.withPorts($.compactor_ports) + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + - $.util.resourcesRequests('1', '6Gi') + - $.util.resourcesLimits($._config.cortex_compactor_max_concurrency, '6Gi') + + // Do not limit compactor CPU and request enough cores to honor configured max concurrency. + $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') + + $.util.resourcesLimits(null, '6Gi') + $.util.readinessProbe + $.jaeger_mixin, From 2bbb7ded09b86985db288ccfbc76d2dd55e0ccb5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 22 Nov 2021 11:12:13 +0100 Subject: [PATCH 185/192] Increased max connections for memcached chunks and index-queries too Signed-off-by: Marco Pracucci --- operations/mimir/memcached.libsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet index 6fecb9ffbd9..011328c33b0 100644 --- a/operations/mimir/memcached.libsonnet +++ b/operations/mimir/memcached.libsonnet @@ -34,6 +34,7 @@ memcached { $.memcached { name: 'memcached-index-queries', max_item_size: '%dm' % [$._config.memcached_index_queries_max_item_size_mb], + connection_limit: 16384, } else {}, @@ -54,7 +55,7 @@ memcached { // Save memory by more tightly provisioning memcached chunks. 
memory_limit_mb: 6 * 1024, overprovision_factor: 1.05, - connection_limit: 4096, + connection_limit: 16384, local container = $.core.v1.container, } From 2b1e0215a6f7f611ebbeb12b2e7af92772159cc0 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Mon, 22 Nov 2021 11:35:04 +0100 Subject: [PATCH 186/192] Ruler: Pass `-ruler-storage.s3.endpoint` to ruler when using S3. This argument is required, without it, the following error appears: ``` no s3 endpoint in config file ``` --- operations/mimir/config.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index a1bff15454c..7cf316b73e2 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -265,6 +265,7 @@ s3: { 'ruler-storage.s3.region': $._config.aws_region, 'ruler-storage.s3.bucket-name': $._config.ruler_storage_bucket_name, + 'ruler-storage.s3.endpoint': 's3.dualstack.%s.amazonaws.com' % $._config.aws_region, }, azure: { 'ruler-storage.azure.container-name': $._config.ruler_storage_bucket_name, From 3c9841093e5fd3903d610f55a8fc1b986f01f503 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 14 Dec 2021 16:07:40 +0100 Subject: [PATCH 187/192] Allow to create custom store-gateway StatefulSets via newStoreGatewayStatefulSet() Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index aa47082959e..b79cd9d8e63 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -255,13 +255,13 @@ $.util.readinessProbe + $.jaeger_mixin, - store_gateway_statefulset: - statefulSet.new('store-gateway', 3, [$.store_gateway_container], store_gateway_data_pvc) + - statefulSet.mixin.spec.withServiceName('store-gateway') + + newStoreGatewayStatefulSet(name, container):: + statefulSet.new(name, 3, [$.store_gateway_container],
store_gateway_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: 'store-gateway' }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: 'store-gateway' }) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) + @@ -272,6 +272,8 @@ statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + store_gateway_statefulset: self.newStoreGatewayStatefulSet('store-gateway', $.store_gateway_container), + store_gateway_service: $.util.serviceFor($.store_gateway_statefulset), From bb2c109ba145e70901956a01d81a71fc24e0732d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 14 Dec 2021 17:33:20 +0100 Subject: [PATCH 188/192] Fix newStoreGatewayStatefulSet() to use input container Signed-off-by: Marco Pracucci --- operations/mimir/tsdb.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet index b79cd9d8e63..15902099355 100644 --- a/operations/mimir/tsdb.libsonnet +++ b/operations/mimir/tsdb.libsonnet @@ -256,7 +256,7 @@ $.jaeger_mixin, newStoreGatewayStatefulSet(name, container):: - statefulSet.new(name, 3, [$.store_gateway_container], store_gateway_data_pvc) + + statefulSet.new(name, 3, [container], store_gateway_data_pvc) + statefulSet.mixin.spec.withServiceName(name) + statefulSet.mixin.metadata.withNamespace($._config.namespace) + 
statefulSet.mixin.metadata.withLabels({ name: name }) + From 5751c2e1c7e36e4f8e6d01040865e2846c9c9a6d Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Mon, 20 Dec 2021 12:37:20 +0000 Subject: [PATCH 189/192] Add CI check for jsonnet manifests --- .github/workflows/test-build-deploy.yml | 2 ++ Makefile | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index ace56ce4c76..6f60778e7c5 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -45,6 +45,8 @@ jobs: run: make BUILD_IN_CONTAINER=false check-doc - name: Check Mixin run: make BUILD_IN_CONTAINER=false check-mixin + - name: Check Jsonnet Manifests + run: make BUILD_IN_CONTAINER=false check-jsonnet-manifests - name: Check White Noise. run: make BUILD_IN_CONTAINER=false check-white-noise - name: Check License Header diff --git a/Makefile b/Makefile index 6aec0b99d54..01b99792e5e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # WARNING: do not commit to a repository! 
-include Makefile.local -.PHONY: all test test-with-race integration-tests cover clean images protos exes dist doc clean-doc check-doc push-multiarch-build-image license check-license format check-mixin check-mixin-jb check-mixin-mixtool checkin-mixin-playbook build-mixin format-mixin push-multiarch-mimir list-image-targets +.PHONY: all test test-with-race integration-tests cover clean images protos exes dist doc clean-doc check-doc push-multiarch-build-image license check-license format check-mixin check-mixin-jb check-mixin-mixtool checkin-mixin-playbook build-mixin format-mixin check-jsonnet-manifests format-jsonnet-manifests push-multiarch-mimir list-image-targets .DEFAULT_GOAL := all # Version number @@ -42,6 +42,9 @@ JSONNET_FMT := jsonnetfmt # path to the mimir/mixin MIXIN_PATH := operations/mimir-mixin +# path to the mimir jsonnet manifests +JSONNET_MANIFESTS_PATH := operations/mimir + .PHONY: image-tag image-tag: @echo $(IMAGE_TAG) @@ -396,6 +399,13 @@ build-mixin: check-mixin-jb format-mixin: @find $(MIXIN_PATH) -type f -name '*.libsonnet' -print -o -name '*.jsonnet' -print | xargs jsonnetfmt -i +check-jsonnet-manifests: format-jsonnet-manifests + @echo "Checking diff:" + @git diff --exit-code -- $(JSONNET_MANIFESTS_PATH) || (echo "Please format jsonnet manifests by running 'make format-jsonnet-manifests'" && false) + +format-jsonnet-manifests: + @find $(JSONNET_MANIFESTS_PATH) -type f -name '*.libsonnet' -print -o -name '*.jsonnet' -print | xargs jsonnetfmt -i + check-tsdb-blocks-storage-s3-docker-compose-yaml: cd development/tsdb-blocks-storage-s3 && make check From 2b6f26ae144cbf4bdf5832ed73d9e389aaf91938 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Mon, 20 Dec 2021 12:37:32 +0000 Subject: [PATCH 190/192] Remove additional git diff in check-mixin --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 01b99792e5e..4f9e2ee1d9b 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ UPTODATE := 
.uptodate # path to jsonnetfmt JSONNET_FMT := jsonnetfmt -# path to the mimir/mixin +# path to the mimir-mixin MIXIN_PATH := operations/mimir-mixin # path to the mimir jsonnet manifests @@ -372,7 +372,6 @@ check-white-noise: clean-white-noise check-mixin: format-mixin check-mixin-jb check-mixin-mixtool check-mixin-playbook @echo "Checking diff:" - git diff @git diff --exit-code -- $(MIXIN_PATH) || (echo "Please format mixin by running 'make format-mixin'" && false) @cd $(MIXIN_PATH) && \ From 82a04e3fb003a73044c720c8176c62c387166101 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 20 Dec 2021 15:06:00 +0100 Subject: [PATCH 191/192] Imported cortex-jsonnet CHANGELOG entries from 1.9.0 Signed-off-by: Marco Pracucci --- CHANGELOG.md | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18ca78109d6..316f82ec94b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -179,19 +179,120 @@ * [BUGFIX] Distributor: fix bug in query-exemplar where some results would get dropped. #583 * [BUGFIX] Azure storage: only create HTTP client once, to reduce memory utilization. #605 -Mixin: - +Mixin (changes since `grafana/cortex-jsonnet` `1.9.0`): + +* [CHANGE] Update grafana-builder dependency: use $__rate_interval in qpsPanel and latencyPanel. [#372](https://github.com/grafana/cortex-jsonnet/pull/372) +* [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. [#311](https://github.com/grafana/cortex-jsonnet/pull/311) +* [CHANGE] `CortexIngesterRestarts` alert severity changed from `critical` to `warning`. [#321](https://github.com/grafana/cortex-jsonnet/pull/321) +* [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. 
[#319](https://github.com/grafana/cortex-jsonnet/pull/319) +* [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. [#319](https://github.com/grafana/cortex-jsonnet/pull/319) +* [CHANGE] Renamed `CortexCompactorHasNotUploadedBlocksSinceStart` to `CortexCompactorHasNotUploadedBlocks`. [#334](https://github.com/grafana/cortex-jsonnet/pull/334) +* [CHANGE] Renamed `CortexCompactorRunFailed` to `CortexCompactorHasNotSuccessfullyRunCompaction`. [#334](https://github.com/grafana/cortex-jsonnet/pull/334) +* [CHANGE] Renamed `CortexInconsistentConfig` alert to `CortexInconsistentRuntimeConfig` and increased severity to `critical`. [#335](https://github.com/grafana/cortex-jsonnet/pull/335) +* [CHANGE] Increased `CortexBadRuntimeConfig` alert severity to `critical` and removed support for `cortex_overrides_last_reload_successful` metric (was removed in Cortex 1.3.0). [#335](https://github.com/grafana/cortex-jsonnet/pull/335) +* [CHANGE] Grafana 'min step' changed to 15s so dashboards show better detail. [#340](https://github.com/grafana/cortex-jsonnet/pull/340) +* [CHANGE] Replace `CortexRulerFailedEvaluations` with two new alerts: `CortexRulerTooManyFailedPushes` and `CortexRulerTooManyFailedQueries`. [#347](https://github.com/grafana/cortex-jsonnet/pull/347) +* [CHANGE] Removed `CortexCacheRequestErrors` alert. This alert was not working because the legacy Cortex cache client instrumentation doesn't track errors. [#346](https://github.com/grafana/cortex-jsonnet/pull/346) +* [CHANGE] Removed `CortexQuerierCapacityFull` alert. [#342](https://github.com/grafana/cortex-jsonnet/pull/342) +* [CHANGE] Changes blocks storage alerts to group metrics by the configured `cluster_labels` (supporting the deprecated `alert_aggregation_labels`).
[#351](https://github.com/grafana/cortex-jsonnet/pull/351) +* [CHANGE] Increased `CortexIngesterReachingSeriesLimit` critical alert threshold from 80% to 85%. [#363](https://github.com/grafana/cortex-jsonnet/pull/363) +* [CHANGE] Changed default `job_names` for query-frontend, query-scheduler and querier to match custom deployments too. [#376](https://github.com/grafana/cortex-jsonnet/pull/376) +* [CHANGE] Split `cortex_api` recording rule group into three groups. This is a workaround for large clusters where this group can become slow to evaluate. [#401](https://github.com/grafana/cortex-jsonnet/pull/401) +* [CHANGE] Increased `CortexIngesterReachingSeriesLimit` warning threshold from 70% to 80% and critical threshold from 85% to 90%. [#404](https://github.com/grafana/cortex-jsonnet/pull/404) * [CHANGE] Raised `CortexKVStoreFailure` alert severity from warning to critical. #493 * [CHANGE] Increase `CortexRolloutStuck` alert "for" duration from 15m to 30m. #493 #573 +* [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. [#317](https://github.com/grafana/cortex-jsonnet/pull/317) +* [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. [#328](https://github.com/grafana/cortex-jsonnet/pull/328) +* [ENHANCEMENT] Ruler dashboard: added object storage metrics. [#354](https://github.com/grafana/cortex-jsonnet/pull/354) +* [ENHANCEMENT] Alertmanager dashboard: added object storage metrics. [#354](https://github.com/grafana/cortex-jsonnet/pull/354) +* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. [#324](https://github.com/grafana/cortex-jsonnet/pull/324) +* [ENHANCEMENT] Dashboards: defined container functions for common resources panels: containerDiskWritesPanel, containerDiskReadsPanel, containerDiskSpaceUtilization. 
[#331](https://github.com/grafana/cortex-jsonnet/pull/331) +* [ENHANCEMENT] cortex-mixin: Added `alert_excluded_routes` config to exclude specific routes from alerts. [#338](https://github.com/grafana/cortex-jsonnet/pull/338) +* [ENHANCEMENT] Added `CortexMemcachedRequestErrors` alert. [#346](https://github.com/grafana/cortex-jsonnet/pull/346) +* [ENHANCEMENT] Ruler dashboard: added "Per route p99 latency" panel in the "Configuration API" row. [#353](https://github.com/grafana/cortex-jsonnet/pull/353) +* [ENHANCEMENT] Increased the `for` duration of the `CortexIngesterReachingSeriesLimit` warning alert to 3h. [#362](https://github.com/grafana/cortex-jsonnet/pull/362) +* [ENHANCEMENT] Added a new tier (`medium_small_user`) so we have another tier between 100K and 1Mil active series. [#364](https://github.com/grafana/cortex-jsonnet/pull/364) +* [ENHANCEMENT] Extend Alertmanager dashboard: [#313](https://github.com/grafana/cortex-jsonnet/pull/313) + * "Tenants" stat panel - shows number of discovered tenant configurations. + * "Replication" row - information about the replication of tenants/alerts/silences over instances. + * "Tenant Configuration Sync" row - information about the configuration sync procedure. + * "Sharding Initial State Sync" row - information about the initial state sync procedure when sharding is enabled. + * "Sharding Runtime State Sync" row - information about various state operations which occur when sharding is enabled (replication, fetch, merge, persist).
+* [ENHANCEMENT] Update gsutil command for `not healthy index found` playbook [#370](https://github.com/grafana/cortex-jsonnet/pull/370) +* [ENHANCEMENT] Added Alertmanager alerts and playbooks covering configuration syncs and sharding operation: [#377](https://github.com/grafana/cortex-jsonnet/pull/377) [#378](https://github.com/grafana/cortex-jsonnet/pull/378) + * `CortexAlertmanagerSyncConfigsFailing` + * `CortexAlertmanagerRingCheckFailing` + * `CortexAlertmanagerPartialStateMergeFailing` + * `CortexAlertmanagerReplicationFailing` + * `CortexAlertmanagerPersistStateFailing` + * `CortexAlertmanagerInitialSyncFailed` +* [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. [#387](https://github.com/grafana/cortex-jsonnet/pull/387) +* [ENHANCEMENT] Add `CortexRolloutStuck` alert. [#405](https://github.com/grafana/cortex-jsonnet/pull/405) +* [ENHANCEMENT] Added `CortexKVStoreFailure` alert. [#406](https://github.com/grafana/cortex-jsonnet/pull/406) +* [ENHANCEMENT] Use configured `ruler` jobname for ruler dashboard panels. [#409](https://github.com/grafana/cortex-jsonnet/pull/409) +* [ENHANCEMENT] Add ability to override `datasource` for generated dashboards. [#407](https://github.com/grafana/cortex-jsonnet/pull/407) +* [ENHANCEMENT] Use alertmanager jobname for alertmanager dashboard panels [#411](https://github.com/grafana/cortex-jsonnet/pull/411) +* [ENHANCEMENT] Added `CortexDistributorReachingInflightPushRequestLimit` alert. [#408](https://github.com/grafana/cortex-jsonnet/pull/408) * [ENHANCEMENT] Added `CortexReachingTCPConnectionsLimit` alert. #403 * [ENHANCEMENT] Added "Cortex / Writes Networking" and "Cortex / Reads Networking" dashboards. #405 * [ENHANCEMENT] Improved "Queue length" panel in "Cortex / Queries" dashboard. #408 * [ENHANCEMENT] Add `CortexDistributorReachingInflightPushRequestLimit` alert and playbook. #401 * [ENHANCEMENT] Added "Recover accidentally deleted blocks (Google Cloud specific)" playbook.
#475 * [ENHANCEMENT] Added support to multi-zone store-gateway deployments. #608 #615 +* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. [#308](https://github.com/grafana/cortex-jsonnet/pull/308) +* [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. [#335](https://github.com/grafana/cortex-jsonnet/pull/335) +* [BUGFIX] Fixed scaling dashboard to correctly work when a Cortex service deployment spans across multiple zones (a zone is expected to have the `zone-[a-z]` suffix). [#365](https://github.com/grafana/cortex-jsonnet/pull/365) +* [BUGFIX] Fixed rollout progress dashboard to correctly work when a Cortex service deployment spans across multiple zones (a zone is expected to have the `zone-[a-z]` suffix). [#366](https://github.com/grafana/cortex-jsonnet/pull/366) +* [BUGFIX] Fixed rollout progress dashboard to include query-scheduler too. [#376](https://github.com/grafana/cortex-jsonnet/pull/376) +* [BUGFIX] Upstream recording rule `node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate` renamed. [#379](https://github.com/grafana/cortex-jsonnet/pull/379) +* [BUGFIX] Fixed writes/reads/alertmanager resources dashboards to use `$._config.job_names.gateway`. [#403](https://github.com/grafana/cortex-jsonnet/pull/403) +* [BUGFIX] Span the annotation.message in alerts as YAML multiline strings. [#412](https://github.com/grafana/cortex-jsonnet/pull/412) * [BUGFIX] Fixed "Instant queries / sec" in "Cortex / Reads" dashboard. #445 * [BUGFIX] Fixed and added missing KV store panels in Writes, Reads, Ruler and Compactor dashboards. 
#448 +Jsonnet (changes since `grafana/cortex-jsonnet` `1.9.0`): + +* [CHANGE] Store gateway: set `-blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency`, + `-blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency`, + `-blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency`, + `-blocks-storage.bucket-store.index-cache.memcached.max-idle-connections`, + `-blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections`, + `-blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections` to 100 [#414](https://github.com/grafana/cortex-jsonnet/pull/414) +* [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. [#315](https://github.com/grafana/cortex-jsonnet/pull/315) +* [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. [#316](https://github.com/grafana/cortex-jsonnet/pull/316) +* [CHANGE] Store-gateway: increased memory request and limit respectively from 6GB / 6GB to 12GB / 18GB. [#322](https://github.com/grafana/cortex-jsonnet/pull/322) +* [CHANGE] Store-gateway: increased `-blocks-storage.bucket-store.max-chunk-pool-bytes` from 2GB (default) to 12GB. [#322](https://github.com/grafana/cortex-jsonnet/pull/322) +* [CHANGE] Ingester/Ruler: set `-server.grpc-max-recv-msg-size-bytes` and `-server.grpc-max-send-msg-size-bytes` to sensible default values (10MB). [#326](https://github.com/grafana/cortex-jsonnet/pull/326) +* [CHANGE] Decreased `-server.grpc-max-concurrent-streams` from 100k to 10k. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [CHANGE] Decreased blocks storage ingesters graceful termination period from 80m to 20m. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [CHANGE] Increase the rules per group and rule groups limits on different tiers.
[#396](https://github.com/grafana/cortex-jsonnet/pull/396) +* [CHANGE] Removed `max_samples_per_query` limit, since it only works with chunks and only when using `-distributor.shard-by-all-labels=false`. [#397](https://github.com/grafana/cortex-jsonnet/pull/397) +* [CHANGE] Removed chunks storage query sharding config support. The following config options have been removed: [#398](https://github.com/grafana/cortex-jsonnet/pull/398) + * `_config` > `queryFrontend` > `shard_factor` + * `_config` > `queryFrontend` > `sharded_queries_enabled` + * `_config` > `queryFrontend` > `query_split_factor` +* [CHANGE] Rename ruler_s3_bucket_name and ruler_gcs_bucket_name to ruler_storage_bucket_name: [#415](https://github.com/grafana/cortex-jsonnet/pull/415) +* [CHANGE] Fine-tuned rolling update policy for distributor, querier, query-frontend, query-scheduler. [#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [CHANGE] Increased memcached metadata/chunks/index-queries max connections from 4k to 16k. [#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [CHANGE] Disabled step alignment in query-frontend to be compliant with PromQL. [#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [CHANGE] Do not limit compactor CPU and request a number of cores equal to the configured concurrency. [#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [ENHANCEMENT] Add overrides config to compactor. This allows setting retention configs per user. [#386](https://github.com/grafana/cortex-jsonnet/pull/386) +* [ENHANCEMENT] Added 256MB memory ballast to querier. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [ENHANCEMENT] Update `etcd-operator` to latest version (see https://github.com/grafana/jsonnet-libs/pull/480). [#263](https://github.com/grafana/cortex-jsonnet/pull/263) +* [ENHANCEMENT] Add support for Azure storage in Alertmanager configuration. 
[#381](https://github.com/grafana/cortex-jsonnet/pull/381) +* [ENHANCEMENT] Add support for running Alertmanager in sharding mode. [#394](https://github.com/grafana/cortex-jsonnet/pull/394) +* [ENHANCEMENT] Allow to customize PromQL engine settings via `queryEngineConfig`. [#399](https://github.com/grafana/cortex-jsonnet/pull/399) +* [ENHANCEMENT] Define Azure object storage ruler args. [#416](https://github.com/grafana/cortex-jsonnet/pull/416) +* [ENHANCEMENT] Added the following config options to allow to schedule multiple replicas of the same service on the same node: [#418](https://github.com/grafana/cortex-jsonnet/pull/418) + * `cortex_distributor_allow_multiple_replicas_on_same_node` + * `cortex_ruler_allow_multiple_replicas_on_same_node` + * `cortex_querier_allow_multiple_replicas_on_same_node` + * `cortex_query_frontend_allow_multiple_replicas_on_same_node` +* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. [#329](https://github.com/grafana/cortex-jsonnet/pull/329) +* [BUGFIX] Fixed `-distributor.extend-writes` setting on ruler when `unregister_ingesters_on_shutdown` is disabled. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [BUGFIX] Treat `compactor_blocks_retention_period` type as string rather than int. [#395](https://github.com/grafana/cortex-jsonnet/pull/395) +* [BUGFIX] Pass `-ruler-storage.s3.endpoint` to ruler when using S3. [#421](https://github.com/grafana/cortex-jsonnet/pull/421) + ### Query-tee * [ENHANCEMENT] Added `/api/v1/query_exemplars` API endpoint support (no results comparison).
#168 From 930ec05efc87b7f6dfb95ce30e56846d4cd41350 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 20 Dec 2021 15:16:17 +0100 Subject: [PATCH 192/192] Improved CHANGELOG header Signed-off-by: Marco Pracucci --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 316f82ec94b..773bf94d892 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -179,7 +179,7 @@ * [BUGFIX] Distributor: fix bug in query-exemplar where some results would get dropped. #583 * [BUGFIX] Azure storage: only create HTTP client once, to reduce memory utilization. #605 -Mixin (changes since `grafana/cortex-jsonnet` `1.9.0`): +### Mixin (changes since `grafana/cortex-jsonnet` `1.9.0`) * [CHANGE] Update grafana-builder dependency: use $__rate_interval in qpsPanel and latencyPanel. [#372](https://github.com/grafana/cortex-jsonnet/pull/372) * [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. [#311](https://github.com/grafana/cortex-jsonnet/pull/311) @@ -250,7 +250,7 @@ Mixin (changes since `grafana/cortex-jsonnet` `1.9.0`): * [BUGFIX] Fixed "Instant queries / sec" in "Cortex / Reads" dashboard. #445 * [BUGFIX] Fixed and added missing KV store panels in Writes, Reads, Ruler and Compactor dashboards. #448 -Jsonnet (changes since `grafana/cortex-jsonnet` `1.9.0`): +### Jsonnet (changes since `grafana/cortex-jsonnet` `1.9.0`) * [CHANGE] Store gateway: set `-blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency`, `-blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency`,