diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dc3289a..fa0de19c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * [CHANGE] Add overrides config to tsdb store-gateway. #167 * [CHANGE] Ingesters now default to running as `StatefulSet` with WAL enabled. It is controlled by the config `$._config.ingester_deployment_without_wal` which is `false` by default. Setting the config to `true` will yield the old behaviour (stateless `Deployment` without WAL enabled). #72 * [CHANGE] We now allow queries that are 32 days long. For example, rate(metric[32d]). Before it was 31d. #173 +* [CHANGE] Renamed `container_name` and `pod_name` label names to `container` and `pod` respectively. This is required in order to comply with cAdvisor metrics changes shipped with Kubernetes 1.16. #179 * [ENHANCEMENT] Enable support for HA in the Cortex Alertmanager #147 * [ENHANCEMENT] Support `alertmanager.fallback_config` option in the Alertmanager. #179 * [ENHANCEMENT] Add support for S3 block storage. #181 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index acc6fa4b..ea702017 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -372,9 +372,9 @@ alert: 'CortexAllocatingTooMuchMemory', expr: ||| ( - container_memory_working_set_bytes{container_name="ingester"} + container_memory_working_set_bytes{container="ingester"} / - container_spec_memory_limit_bytes{container_name="ingester"} + container_spec_memory_limit_bytes{container="ingester"} ) > 0.5 |||, 'for': '15m', @@ -391,9 +391,9 @@ alert: 'CortexAllocatingTooMuchMemory', expr: ||| ( - container_memory_working_set_bytes{container_name="ingester"} + container_memory_working_set_bytes{container="ingester"} / - container_spec_memory_limit_bytes{container_name="ingester"} + container_spec_memory_limit_bytes{container="ingester"} ) > 0.8 |||, 'for': '15m', diff --git a/cortex-mixin/dashboards/compactor-resources.libsonnet b/cortex-mixin/dashboards/compactor-resources.libsonnet index 043a3337..02d9edd0 100644 --- a/cortex-mixin/dashboards/compactor-resources.libsonnet +++ b/cortex-mixin/dashboards/compactor-resources.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-compactor-resources.json': local filterNodeDiskByCompactor = ||| - ignoring(pod_name) group_right() (label_replace(count by(pod_name, instance, device) (container_fs_writes_bytes_total{%s,container="compactor",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="compactor",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) ||| % $.namespaceMatcher(); $.dashboard('Cortex / Compactor Resources') @@ -24,13 +24,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Network') .addPanel( $.panel('Receive Bandwidth') + - $.queryPanel('sum by(pod_name) (rate(container_network_receive_bytes_total{%s,pod_name=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod_name}}') + + $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Transmit Bandwidth') + - $.queryPanel('sum by(pod_name) (rate(container_network_transmit_bytes_total{%s,pod_name=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod_name}}') + + $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) @@ -39,13 +39,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Disk') .addPanel( $.panel('Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod_name}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod_name}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) diff --git a/cortex-mixin/dashboards/comparison.libsonnet b/cortex-mixin/dashboards/comparison.libsonnet index 2646fffb..29355df9 100644 --- a/cortex-mixin/dashboards/comparison.libsonnet +++ b/cortex-mixin/dashboards/comparison.libsonnet @@ -30,13 +30,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU per sample') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') ) .addPanel( $.panel('Memory per active series') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + { yaxes: $.yaxes('bytes') } @@ -46,13 +46,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__interval]))', 'chunks') ) .addPanel( $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"})', 'chunks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"})', 'chunks - working set') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + { yaxes: $.yaxes('bytes') } @@ -90,13 +90,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"}[$__interval]))', 'chunks') ) .addPanel( $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"})', 'chunks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"})', 'chunks - working set') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') + { yaxes: $.yaxes('bytes') } diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index aa1e68ce..00210fa5 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -128,9 +128,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerCPUUsagePanel(title, containerName):: $.panel(title) + $.queryPanel([ - 'sum by(pod_name) (rate(container_cpu_usage_seconds_total{%s,container_name="%s"}[$__interval]))' % [$.namespaceMatcher(), containerName], - 'min(container_spec_cpu_quota{%s,container_name="%s"} / container_spec_cpu_period{%s,container_name="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], - ], ['{{pod_name}}', 'limit']) + + 'sum by(pod) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__interval]))' % [$.namespaceMatcher(), containerName], + 'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], + ], ['{{pod}}', 'limit']) + { seriesOverrides: [ { @@ -144,9 +144,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerMemoryWorkingSetPanel(title, containerName):: $.panel(title) + $.queryPanel([ - 'sum by(pod_name) (container_memory_working_set_bytes{%s,container_name="%s"})' % [$.namespaceMatcher(), containerName], - 'min(container_spec_memory_limit_bytes{%s,container_name="%s"} > 0)' % [$.namespaceMatcher(), containerName], - ], ['{{pod_name}}', 'limit']) + + 'sum by(pod) (container_memory_working_set_bytes{%s,container="%s"})' % [$.namespaceMatcher(), containerName], + 'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [$.namespaceMatcher(), containerName], + ], ['{{pod}}', 'limit']) + { seriesOverrides: [ { diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet index 4f37132e..97e9b124 100644 --- a/cortex-mixin/dashboards/scaling.libsonnet +++ b/cortex-mixin/dashboards/scaling.libsonnet @@ -80,7 +80,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) / sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) |||, @@ -94,7 +94,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) / sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) |||,