Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed metrics path and deprecated labels for the cadvisor job #3312

Merged
merged 3 commits into from
Jul 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ In Strimzi 0.12.0, the `v1alpha1` versions of the following resources have been
In the next release, the `v1alpha1` versions of these resources will be removed.
Please follow the guide for upgrading the resources: https://strimzi.io/docs/operators/latest/full/deploying.html#assembly-upgrade-resources-str.

#### Removal of deprecated cadvisor metric labels

The `pod_name` and `container_name` labels provided on the cadvisor metrics are now just `pod` and `container` starting from Kubernetes 1.16.
We removed the old labels from the Prometheus scraping configuration and alerts, as well as from the Kafka and ZooKeeper dashboards.
This means that the charts related to memory and CPU usage will not work on Kubernetes versions prior to 1.14.
For more information on what is changed: https://github.com/strimzi/strimzi-kafka-operator/pull/3312

## 0.18.0

* Add possibility to set Java System Properties for User Operator and Topic Operator via `Kafka` CR.
Expand Down
4 changes: 2 additions & 2 deletions examples/metrics/grafana-dashboards/strimzi-kafka.json
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container_name=\"kafka\"}) by (kubernetes_pod_name)",
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}) by (kubernetes_pod_name)",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
Expand Down Expand Up @@ -852,7 +852,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container_name=\"kafka\"}[5m])) by (kubernetes_pod_name)",
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}[5m])) by (kubernetes_pod_name)",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
Expand Down
4 changes: 2 additions & 2 deletions examples/metrics/grafana-dashboards/strimzi-zookeeper.json
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",container_name=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}) by (kubernetes_pod_name)",
"expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",container=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}) by (kubernetes_pod_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}}",
Expand Down Expand Up @@ -602,7 +602,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",container_name=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}[5m])) by (kubernetes_pod_name)",
"expr": "sum(rate(container_cpu_user_seconds_total{namespace=\"$kubernetes_namespace\",container=\"zookeeper\",kubernetes_pod_name=~\"$strimzi_cluster_name-$zk_node\"}[5m])) by (kubernetes_pod_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}}",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
honor_labels: true
scrape_interval: 10s
scrape_timeout: 10s
metrics_path: /metrics
metrics_path: /metrics/cadvisor
scheme: https
kubernetes_sd_configs:
- role: node
Expand Down Expand Up @@ -43,28 +43,28 @@
replacement: $1
action: replace
metric_relabel_configs:
- source_labels: [pod_name]
- source_labels: [pod]
separator: ;
regex: (.*)
target_label: kubernetes_pod_name
replacement: $1
action: replace
- separator: ;
regex: pod_name
regex: pod
replacement: $1
action: labeldrop
- source_labels: [container_name, __name__]
- source_labels: [container, __name__]
separator: ;
regex: POD;container_(network).*
target_label: container_name
target_label: container
replacement: $1
action: replace
- source_labels: [container_name]
- source_labels: [container]
separator: ;
regex: POD
replacement: $1
action: drop
- source_labels: [container_name]
- source_labels: [container]
separator: ;
regex: ^$
replacement: $1
Expand Down
24 changes: 12 additions & 12 deletions examples/metrics/prometheus-install/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,31 +66,31 @@ spec:
summary: 'Prometheus unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }}'
description: 'Prometheus was unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }} for more than 3 minutes'
- alert: ClusterOperatorContainerDown
expr: count((container_last_seen{container_name="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container_name="strimzi-cluster-operator"})
expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"})
for: 1m
labels:
severity: major
annotations:
summary: 'Cluster Operator down'
description: 'The Cluster Operator has been down for longer than 90 seconds'
- alert: KafkaBrokerContainersDown
expr: absent(container_last_seen{container_name="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
expr: absent(container_last_seen{container="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All `kafka` containers down or in CrashLookBackOff status'
description: 'All `kafka` containers have been down or in CrashLookBackOff status for 3 minutes'
- alert: KafkaTlsSidecarContainersDown
expr: absent(container_last_seen{container_name="tls-sidecar",kubernetes_pod_name=~".+-kafka-[0-9]+"})
expr: absent(container_last_seen{container="tls-sidecar",kubernetes_pod_name=~".+-kafka-[0-9]+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All `tls-sidecar` containers in the Kafka pods are down or in CrashLookBackOff status'
description: 'All `tls-sidecar` containers in the Kafka pods are down or in CrashLookBackOff status for 3 minutes'
- alert: KafkaContainerRestartedInTheLast5Minutes
expr: count(count_over_time(container_last_seen{container_name="kafka"}[5m])) > 2 * count(container_last_seen{container_name="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
expr: count(count_over_time(container_last_seen{container="kafka"}[5m])) > 2 * count(container_last_seen{container="kafka",kubernetes_pod_name=~".+-kafka-[0-9]+"})
for: 5m
labels:
severity: warning
Expand Down Expand Up @@ -124,15 +124,15 @@ spec:
summary: 'Zookeeper is running out of free disk space'
description: 'There are only {{ $value }} bytes available at {{ $labels.persistentvolumeclaim }} PVC'
- alert: ZookeeperContainerRestartedInTheLast5Minutes
expr: count(count_over_time(container_last_seen{container_name="zookeeper"}[5m])) > 2 * count(container_last_seen{container_name="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
expr: count(count_over_time(container_last_seen{container="zookeeper"}[5m])) > 2 * count(container_last_seen{container="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
for: 5m
labels:
severity: warning
annotations:
summary: 'One or more Zookeeper containers were restarted too often'
description: 'One or more Zookeeper containers were restarted too often within the last 5 minutes. This alert can be ignored when the Zookeeper cluster is scaling up'
- alert: ZookeeperContainersDown
expr: absent(container_last_seen{container_name="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
expr: absent(container_last_seen{container="zookeeper",kubernetes_pod_name=~".+-zookeeper-[0-9]+"})
for: 3m
labels:
severity: major
Expand All @@ -142,23 +142,23 @@ spec:
- name: entityOperator
rules:
- alert: TopicOperatorContainerDown
expr: absent(container_last_seen{container_name="topic-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
expr: absent(container_last_seen{container="topic-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'Container topic-operator in Entity Operator pod down or in CrashLookBackOff status'
description: 'Container topic-operator in Entity Operator pod has been or in CrashLookBackOff status for 3 minutes'
- alert: UserOperatorContainerDown
expr: absent(container_last_seen{container_name="user-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
expr: absent(container_last_seen{container="user-operator",kubernetes_pod_name=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'Container user-operator in Entity Operator pod down or in CrashLookBackOff status'
description: 'Container user-operator in Entity Operator pod have been down or in CrashLookBackOff status for 3 minutes'
- alert: EntityOperatorTlsSidecarContainerDown
expr: absent(container_last_seen{container_name="tls-sidecar",kubernetes_pod_name=~".+-entity-operator-.+"})
expr: absent(container_last_seen{container="tls-sidecar",kubernetes_pod_name=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
Expand All @@ -168,7 +168,7 @@ spec:
- name: connect
rules:
- alert: ConnectContainersDown
expr: absent(container_last_seen{container_name=~".+-connect",kubernetes_pod_name=~".+-connect-.+"})
expr: absent(container_last_seen{container=~".+-connect",kubernetes_pod_name=~".+-connect-.+"})
for: 3m
labels:
severity: major
Expand All @@ -178,7 +178,7 @@ spec:
- name: bridge
rules:
- alert: BridgeContainersDown
expr: absent(container_last_seen{container_name=~".+-bridge",kubernetes_pod_name=~".+-bridge-.+"})
expr: absent(container_last_seen{container=~".+-bridge",kubernetes_pod_name=~".+-bridge-.+"})
for: 3m
labels:
severity: major
Expand All @@ -188,7 +188,7 @@ spec:
- name: mirrorMaker
rules:
- alert: MirrorMakerContainerDown
expr: absent(container_last_seen{container_name=~".+-mirror-maker",kubernetes_pod_name=~".+-mirror-maker-.+"})
expr: absent(container_last_seen{container=~".+-mirror-maker",kubernetes_pod_name=~".+-mirror-maker-.+"})
for: 3m
labels:
severity: major
Expand Down