From ee4e2088bf0542f35ef1929fb5b3a9ea4082bb21 Mon Sep 17 00:00:00 2001 From: Paolo Patierno Date: Tue, 14 Jul 2020 08:44:49 +0200 Subject: [PATCH] Fixed metrics path and deprecated labels for the cadvisor job (#3312) * Fixed metrics path and deprecated labels for the cadvisor job Fixed dashboards using the right label Signed-off-by: Paolo Patierno * Updated CHANGELOG with breaking changes on Kubernetes < 1.14 Signed-off-by: Paolo Patierno * Fixed comment Signed-off-by: Paolo Patierno --- CHANGELOG.md | 7 ++++++ .../grafana-dashboards/strimzi-kafka.json | 4 ++-- .../grafana-dashboards/strimzi-zookeeper.json | 4 ++-- .../prometheus-additional.yaml | 14 +++++------ .../prometheus-install/prometheus-rules.yaml | 24 +++++++++---------- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ccabce28db..7270633d2bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,13 @@ In Strimzi 0.12.0, the `v1alpha1` versions of the following resources have been In the next release, the `v1alpha1` versions of these resources will be removed. Please follow the guide for upgrading the resources: https://strimzi.io/docs/operators/latest/full/deploying.html#assembly-upgrade-resources-str. +#### Removal of deprecated cadvisor metric labels + +The `pod_name` and `container_name` labels provided on the cadvisor metrics are now just `pod` and `container` starting from Kubernetes 1.16. +We removed the old ones from the Prometheus scraping configuration/alerts and from the Kafka and ZooKeeper dashboards as well. +This means that the charts related to memory and CPU usage are not going to work on Kubernetes versions prior to 1.14. +For more information on what has changed: https://github.com/strimzi/strimzi-kafka-operator/pull/3312 + ## 0.18.0 * Add possibility to set Java System Properties for User Operator and Topic Operator via `Kafka` CR. 
diff --git a/examples/metrics/grafana-dashboards/strimzi-kafka.json b/examples/metrics/grafana-dashboards/strimzi-kafka.json index 6fd08fec8c8..bc942b7eb4a 100644 --- a/examples/metrics/grafana-dashboards/strimzi-kafka.json +++ b/examples/metrics/grafana-dashboards/strimzi-kafka.json @@ -764,7 +764,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container_name=\"kafka\"}) by (kubernetes_pod_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}) by (kubernetes_pod_name)", "format": "time_series", "hide": false, "intervalFactor": 1, @@ -852,7 +852,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container_name=\"kafka\"}[5m])) by (kubernetes_pod_name)", + "expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}[5m])) by (kubernetes_pod_name)", "format": "time_series", "hide": false, "intervalFactor": 1, diff --git a/examples/metrics/grafana-dashboards/strimzi-zookeeper.json b/examples/metrics/grafana-dashboards/strimzi-zookeeper.json index f5695c367b8..5d7c1433182 100644 --- a/examples/metrics/grafana-dashboards/strimzi-zookeeper.json +++ b/examples/metrics/grafana-dashboards/strimzi-zookeeper.json @@ -517,7 +517,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",container_name=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}) by (kubernetes_pod_name)", + "expr": 
"sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",container=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}) by (kubernetes_pod_name)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{kubernetes_pod_name}}", @@ -602,7 +602,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",container_name=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}[5m])) by (kubernetes_pod_name)", + "expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",container=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}[5m])) by (kubernetes_pod_name)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{kubernetes_pod_name}}", diff --git a/examples/metrics/prometheus-additional-properties/prometheus-additional.yaml b/examples/metrics/prometheus-additional-properties/prometheus-additional.yaml index bc51c8999fa..8547f8bb44c 100644 --- a/examples/metrics/prometheus-additional-properties/prometheus-additional.yaml +++ b/examples/metrics/prometheus-additional-properties/prometheus-additional.yaml @@ -4,7 +4,7 @@ honor_labels: true scrape_interval: 10s scrape_timeout: 10s - metrics_path: /metrics + metrics_path: /metrics/cadvisor scheme: https kubernetes_sd_configs: - role: node @@ -43,28 +43,28 @@ replacement: $1 action: replace metric_relabel_configs: - - source_labels: [pod_name] + - source_labels: [pod] separator: ; regex: (.*) target_label: kubernetes_pod_name replacement: $1 action: replace - separator: ; - regex: pod_name + regex: pod replacement: $1 action: labeldrop - - source_labels: [container_name, __name__] + - source_labels: [container, __name__] separator: ; regex: POD;container_(network).* - target_label: container_name + target_label: container replacement: $1 action: replace - - source_labels: [container_name] + - source_labels: [container] separator: ; regex: POD 
replacement: $1 action: drop - - source_labels: [container_name] + - source_labels: [container] separator: ; regex: ^$ replacement: $1 diff --git a/examples/metrics/prometheus-install/prometheus-rules.yaml b/examples/metrics/prometheus-install/prometheus-rules.yaml index 233bc863663..789cc21e030 100644 --- a/examples/metrics/prometheus-install/prometheus-rules.yaml +++ b/examples/metrics/prometheus-install/prometheus-rules.yaml @@ -66,7 +66,7 @@ spec: summary: 'Prometheus unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }}' description: 'Prometheus was unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }} for more than 3 minutes' - alert: ClusterOperatorContainerDown - expr: count((container_last_seen{container_name="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container_name="strimzi-cluster-operator"}) + expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"}) for: 1m labels: severity: major @@ -74,7 +74,7 @@ spec: summary: 'Cluster Operator down' description: 'The Cluster Operator has been down for longer than 90 seconds' - alert: KafkaBrokerContainersDown - expr: absent(container_last_seen{container_name="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"}) + expr: absent(container_last_seen{container="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"}) for: 3m labels: severity: major @@ -82,7 +82,7 @@ spec: summary: 'All `kafka` containers down or in CrashLookBackOff status' description: 'All `kafka` containers have been down or in CrashLookBackOff status for 3 minutes' - alert: KafkaTlsSidecarContainersDown - expr: absent(container_last_seen{container_name="tls-sidecar",kubernetes_pod_name=~".+-kafka-[0-9]+"}) + expr: absent(container_last_seen{container="tls-sidecar",kubernetes_pod_name=~".+-kafka-[0-9]+"}) for: 3m labels: severity: major @@ -90,7 +90,7 @@ spec: 
summary: 'All `tls-sidecar` containers in the Kafka pods are down or in CrashLookBackOff status' description: 'All `tls-sidecar` containers in the Kafka pods are down or in CrashLookBackOff status for 3 minutes' - alert: KafkaContainerRestartedInTheLast5Minutes - expr: count(count_over_time(container_last_seen{container_name="kafka"}[5m])) > 2 * count(container_last_seen{container_name="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"}) + expr: count(count_over_time(container_last_seen{container="kafka"}[5m])) > 2 * count(container_last_seen{container="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"}) for: 5m labels: severity: warning @@ -124,7 +124,7 @@ spec: summary: 'Zookeeper is running out of free disk space' description: 'There are only {{ $value }} bytes available at {{ $labels.persistentvolumeclaim }} PVC' - alert: ZookeeperContainerRestartedInTheLast5Minutes - expr: count(count_over_time(container_last_seen{container_name="zookeeper"}[5m])) > 2 * count(container_last_seen{container_name="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"}) + expr: count(count_over_time(container_last_seen{container="zookeeper"}[5m])) > 2 * count(container_last_seen{container="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"}) for: 5m labels: severity: warning @@ -132,7 +132,7 @@ spec: summary: 'One or more Zookeeper containers were restarted too often' description: 'One or more Zookeeper containers were restarted too often within the last 5 minutes. 
This alert can be ignored when the Zookeeper cluster is scaling up' - alert: ZookeeperContainersDown - expr: absent(container_last_seen{container_name="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"}) + expr: absent(container_last_seen{container="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"}) for: 3m labels: severity: major @@ -142,7 +142,7 @@ spec: - name: entityOperator rules: - alert: TopicOperatorContainerDown - expr: absent(container_last_seen{container_name="topic-operator",kubernetes_pod_name=~".+-entity-operator-.+"}) + expr: absent(container_last_seen{container="topic-operator",kubernetes_pod_name=~".+-entity-operator-.+"}) for: 3m labels: severity: major @@ -150,7 +150,7 @@ spec: summary: 'Container topic-operator in Entity Operator pod down or in CrashLookBackOff status' description: 'Container topic-operator in Entity Operator pod has been or in CrashLookBackOff status for 3 minutes' - alert: UserOperatorContainerDown - expr: absent(container_last_seen{container_name="user-operator",kubernetes_pod_name=~".+-entity-operator-.+"}) + expr: absent(container_last_seen{container="user-operator",kubernetes_pod_name=~".+-entity-operator-.+"}) for: 3m labels: severity: major @@ -158,7 +158,7 @@ spec: summary: 'Container user-operator in Entity Operator pod down or in CrashLookBackOff status' description: 'Container user-operator in Entity Operator pod have been down or in CrashLookBackOff status for 3 minutes' - alert: EntityOperatorTlsSidecarContainerDown - expr: absent(container_last_seen{container_name="tls-sidecar",kubernetes_pod_name=~".+-entity-operator-.+"}) + expr: absent(container_last_seen{container="tls-sidecar",kubernetes_pod_name=~".+-entity-operator-.+"}) for: 3m labels: severity: major @@ -168,7 +168,7 @@ spec: - name: connect rules: - alert: ConnectContainersDown - expr: absent(container_last_seen{container_name=~".+-connect",kubernetes_pod_name=~".+-connect-.+"}) + expr: 
absent(container_last_seen{container=~".+-connect",kubernetes_pod_name=~".+-connect-.+"}) for: 3m labels: severity: major @@ -178,7 +178,7 @@ spec: - name: bridge rules: - alert: BridgeContainersDown - expr: absent(container_last_seen{container_name=~".+-bridge",kubernetes_pod_name=~".+-bridge-.+"}) + expr: absent(container_last_seen{container=~".+-bridge",kubernetes_pod_name=~".+-bridge-.+"}) for: 3m labels: severity: major @@ -188,7 +188,7 @@ spec: - name: mirrorMaker rules: - alert: MirrorMakerContainerDown - expr: absent(container_last_seen{container_name=~".+-mirror-maker",kubernetes_pod_name=~".+-mirror-maker-.+"}) + expr: absent(container_last_seen{container=~".+-mirror-maker",kubernetes_pod_name=~".+-mirror-maker-.+"}) for: 3m labels: severity: major