From 178afe4a9bc682f85fc40d72609b2cfc547febca Mon Sep 17 00:00:00 2001
From: Dalton Hubble
Date: Sun, 8 Dec 2019 22:05:15 -0800
Subject: [PATCH] Reduce apiserver metrics cardinality and extraneous labels

* Stop mapping node labels to targets discovered via Kubernetes
nodes (e.g. etcd, kubelet, cadvisor). It is rarely useful to
store node labels (e.g. kubernetes.io/os=linux) on these metrics
* kube-apiserver's apiserver_request_duration_seconds_bucket metric
has a high cardinality that includes labels for the API group, verb,
scope, resource, and component for each object type, including for
each CRD. This one metric has ~10k time series in a typical cluster
(between 10-40% of total)
* Removing the apiserver request duration outright would make latency
alerts a NoOp and break a Grafana apiserver panel. Instead, drop series
that have a "group" label. Effectively, only request durations for
core Kubernetes APIs will be kept (e.g. cardinality won't grow with
each CRD added). This reduces the metric to ~2k unique series
---
 CHANGES.md                    |  1 +
 addons/prometheus/config.yaml | 25 ++++++++++++++-----------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 9ce519d63..7e36d0b63 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -28,6 +28,7 @@ Notable changes between versions.
 * Update Grafana from v6.4.4 to [v6.5.1](https://grafana.com/docs/guides/whats-new-in-v6-5/)
 * Add pod networking details in dashboards ([#593](https://github.com/poseidon/typhoon/pull/593))
 * Add node alerts and Grafana dashboard from node-exporter ([#591](https://github.com/poseidon/typhoon/pull/591))
+* Reduce Prometheus time series of high cardinality metrics ([#596](https://github.com/poseidon/typhoon/pull/596))

 ## v1.16.3

diff --git a/addons/prometheus/config.yaml b/addons/prometheus/config.yaml
index 298c4edc0..38b7472f1 100644
--- a/addons/prometheus/config.yaml
+++ b/addons/prometheus/config.yaml
@@ -65,6 +65,9 @@ data:
       - source_labels: [__name__]
         action: drop
         regex: apiserver_admission_step_admission_latencies_seconds_.*
+      - source_labels: [__name__, group]
+        regex: apiserver_request_duration_seconds_bucket;.+
+        action: drop

     # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
     # metrics from a node by scraping kubelet (127.0.0.1:10250/metrics).
@@ -81,7 +84,7 @@ data:

       relabel_configs:
       - action: labelmap
-        regex: __meta_kubernetes_node_label_(.+)
+        regex: __meta_kubernetes_node_name

     # Scrape config for Kubelet cAdvisor. Explore metrics from a node by
     # scraping kubelet (127.0.0.1:10250/metrics/cadvisor).
@@ -99,7 +102,7 @@ data:

       relabel_configs:
       - action: labelmap
-        regex: __meta_kubernetes_node_label_(.+)
+        regex: __meta_kubernetes_node_name
       metric_relabel_configs:
       - source_labels: [__name__, image]
         action: drop
@@ -115,15 +118,15 @@ data:
       - role: node
       scheme: http
       relabel_configs:
-        - source_labels: [__meta_kubernetes_node_label_node_kubernetes_io_controller]
-          action: keep
-          regex: 'true'
-        - action: labelmap
-          regex: __meta_kubernetes_node_label_(.+)
-        - source_labels: [__meta_kubernetes_node_address_InternalIP]
-          action: replace
-          target_label: __address__
-          replacement: '${1}:2381'
+      - source_labels: [__meta_kubernetes_node_label_node_kubernetes_io_controller]
+        action: keep
+        regex: 'true'
+      - action: labelmap
+        regex: __meta_kubernetes_node_name
+      - source_labels: [__meta_kubernetes_node_address_InternalIP]
+        action: replace
+        target_label: __address__
+        replacement: '${1}:2381'

     # Scrape config for service endpoints.
     #