Skip to content

Commit

Permalink
Fixed metrics path and deprecated labels for the cadvisor job (#3312)
Browse files Browse the repository at this point in the history
* Fixed metrics path and deprecated labels for the cadvisor job
Fixed dashboards using the right label

Signed-off-by: Paolo Patierno <[email protected]>

* Updated CHANGELOG with breaking changes on Kubernetes < 1.14

Signed-off-by: Paolo Patierno <[email protected]>

* Fixed comment

Signed-off-by: Paolo Patierno <[email protected]>
  • Loading branch information
ppatierno authored Jul 14, 2020
1 parent 69af6d0 commit ee4e208
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 23 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ In Strimzi 0.12.0, the `v1alpha1` versions of the following resources have been
In the next release, the `v1alpha1` versions of these resources will be removed.
Please follow the guide for upgrading the resources: https://strimzi.io/docs/operators/latest/full/deploying.html#assembly-upgrade-resources-str.

#### Removal of deprecated cadvisor metric labels

The `pod_name` and `container_name` labels provided on the cadvisor metrics are now just `pod` and `container` starting from Kubernetes 1.16.
We removed the old ones from the Prometheus scraping configuration and alerts, and from the Kafka and ZooKeeper dashboards as well.
It means that the charts related to memory and CPU usage are not going to work on Kubernetes versions earlier than 1.14.
For more information on what has changed, see: https://github.com/strimzi/strimzi-kafka-operator/pull/3312

## 0.18.0

* Add possibility to set Java System Properties for User Operator and Topic Operator via `Kafka` CR.
Expand Down
4 changes: 2 additions & 2 deletions examples/metrics/grafana-dashboards/strimzi-kafka.json
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container_name=\"kafka\"}) by (kubernetes_pod_name)",
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}) by (kubernetes_pod_name)",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
Expand Down Expand Up @@ -852,7 +852,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container_name=\"kafka\"}[5m])) by (kubernetes_pod_name)",
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}[5m])) by (kubernetes_pod_name)",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
Expand Down
4 changes: 2 additions & 2 deletions examples/metrics/grafana-dashboards/strimzi-zookeeper.json
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",container_name=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}) by (kubernetes_pod_name)",
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",container=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}) by (kubernetes_pod_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}}",
Expand Down Expand Up @@ -602,7 +602,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",container_name=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}[5m])) by (kubernetes_pod_name)",
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",container=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}[5m])) by (kubernetes_pod_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}}",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
honor_labels: true
scrape_interval: 10s
scrape_timeout: 10s
metrics_path: /metrics
metrics_path: /metrics/cadvisor
scheme: https
kubernetes_sd_configs:
- role: node
Expand Down Expand Up @@ -43,28 +43,28 @@
replacement: $1
action: replace
metric_relabel_configs:
- source_labels: [pod_name]
- source_labels: [pod]
separator: ;
regex: (.*)
target_label: kubernetes_pod_name
replacement: $1
action: replace
- separator: ;
regex: pod_name
regex: pod
replacement: $1
action: labeldrop
- source_labels: [container_name, __name__]
- source_labels: [container, __name__]
separator: ;
regex: POD;container_(network).*
target_label: container_name
target_label: container
replacement: $1
action: replace
- source_labels: [container_name]
- source_labels: [container]
separator: ;
regex: POD
replacement: $1
action: drop
- source_labels: [container_name]
- source_labels: [container]
separator: ;
regex: ^$
replacement: $1
Expand Down
24 changes: 12 additions & 12 deletions examples/metrics/prometheus-install/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,31 +66,31 @@ spec:
summary: 'Prometheus unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }}'
description: 'Prometheus was unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }} for more than 3 minutes'
- alert: ClusterOperatorContainerDown
expr: count((container_last_seen{container_name="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container_name="strimzi-cluster-operator"})
expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"})
for: 1m
labels:
severity: major
annotations:
summary: 'Cluster Operator down'
description: 'The Cluster Operator has been down for longer than 90 seconds'
- alert: KafkaBrokerContainersDown
expr: absent(container_last_seen{container_name="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
expr: absent(container_last_seen{container="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All `kafka` containers down or in CrashLookBackOff status'
description: 'All `kafka` containers have been down or in CrashLookBackOff status for 3 minutes'
- alert: KafkaTlsSidecarContainersDown
expr: absent(container_last_seen{container_name="tls-sidecar",kubernetes_pod_name=~".+-kafka-[0-9]+"})
expr: absent(container_last_seen{container="tls-sidecar",kubernetes_pod_name=~".+-kafka-[0-9]+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All `tls-sidecar` containers in the Kafka pods are down or in CrashLookBackOff status'
description: 'All `tls-sidecar` containers in the Kafka pods are down or in CrashLookBackOff status for 3 minutes'
- alert: KafkaContainerRestartedInTheLast5Minutes
expr: count(count_over_time(container_last_seen{container_name="kafka"}[5m])) > 2 * count(container_last_seen{container_name="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
expr: count(count_over_time(container_last_seen{container="kafka"}[5m])) > 2 * count(container_last_seen{container="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
for: 5m
labels:
severity: warning
Expand Down Expand Up @@ -124,15 +124,15 @@ spec:
summary: 'Zookeeper is running out of free disk space'
description: 'There are only {{ $value }} bytes available at {{ $labels.persistentvolumeclaim }} PVC'
- alert: ZookeeperContainerRestartedInTheLast5Minutes
expr: count(count_over_time(container_last_seen{container_name="zookeeper"}[5m])) > 2 * count(container_last_seen{container_name="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
expr: count(count_over_time(container_last_seen{container="zookeeper"}[5m])) > 2 * count(container_last_seen{container="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
for: 5m
labels:
severity: warning
annotations:
summary: 'One or more Zookeeper containers were restarted too often'
description: 'One or more Zookeeper containers were restarted too often within the last 5 minutes. This alert can be ignored when the Zookeeper cluster is scaling up'
- alert: ZookeeperContainersDown
expr: absent(container_last_seen{container_name="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
expr: absent(container_last_seen{container="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
for: 3m
labels:
severity: major
Expand All @@ -142,23 +142,23 @@ spec:
- name: entityOperator
rules:
- alert: TopicOperatorContainerDown
expr: absent(container_last_seen{container_name="topic-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
expr: absent(container_last_seen{container="topic-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'Container topic-operator in Entity Operator pod down or in CrashLookBackOff status'
description: 'Container topic-operator in Entity Operator pod has been or in CrashLookBackOff status for 3 minutes'
- alert: UserOperatorContainerDown
expr: absent(container_last_seen{container_name="user-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
expr: absent(container_last_seen{container="user-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'Container user-operator in Entity Operator pod down or in CrashLookBackOff status'
description: 'Container user-operator in Entity Operator pod have been down or in CrashLookBackOff status for 3 minutes'
- alert: EntityOperatorTlsSidecarContainerDown
expr: absent(container_last_seen{container_name="tls-sidecar",kubernetes_pod_name=~".+-entity-operator-.+"})
expr: absent(container_last_seen{container="tls-sidecar",kubernetes_pod_name=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
Expand All @@ -168,7 +168,7 @@ spec:
- name: connect
rules:
- alert: ConnectContainersDown
expr: absent(container_last_seen{container_name=~".+-connect",kubernetes_pod_name=~".+-connect-.+"})
expr: absent(container_last_seen{container=~".+-connect",kubernetes_pod_name=~".+-connect-.+"})
for: 3m
labels:
severity: major
Expand All @@ -178,7 +178,7 @@ spec:
- name: bridge
rules:
- alert: BridgeContainersDown
expr: absent(container_last_seen{container_name=~".+-bridge",kubernetes_pod_name=~".+-bridge-.+"})
expr: absent(container_last_seen{container=~".+-bridge",kubernetes_pod_name=~".+-bridge-.+"})
for: 3m
labels:
severity: major
Expand All @@ -188,7 +188,7 @@ spec:
- name: mirrorMaker
rules:
- alert: MirrorMakerContainerDown
expr: absent(container_last_seen{container_name=~".+-mirror-maker",kubernetes_pod_name=~".+-mirror-maker-.+"})
expr: absent(container_last_seen{container=~".+-mirror-maker",kubernetes_pod_name=~".+-mirror-maker-.+"})
for: 3m
labels:
severity: major
Expand Down

0 comments on commit ee4e208

Please sign in to comment.