From 4a3566a50d651a2edc03841d51d662987c13759b Mon Sep 17 00:00:00 2001 From: Chris Werner Rau Date: Tue, 9 Dec 2025 14:57:38 +0100 Subject: [PATCH] fix(base-cluster/monitoring)!: grafana-tempo-distributed would need s3 Therefore we switch to the singleBinary chart. Without this change the compactor doesn't work; meaning the traces are never cleaned up... --- charts/base-cluster/README.md.gotmpl | 22 +++++++ .../templates/monitoring/alloy-gateway.yaml | 2 +- .../monitoring/tracing/grafana-tempo.yaml | 60 +++++++++---------- charts/base-cluster/values.schema.json | 30 ++++------ charts/base-cluster/values.yaml | 11 ++-- 5 files changed, 70 insertions(+), 55 deletions(-) diff --git a/charts/base-cluster/README.md.gotmpl b/charts/base-cluster/README.md.gotmpl index 203328ca68..a866977d06 100644 --- a/charts/base-cluster/README.md.gotmpl +++ b/charts/base-cluster/README.md.gotmpl @@ -393,4 +393,26 @@ ingress: provider: nginx ``` +### 8.x.x -> 9.0.0 + +This release adds the capability to configure alertmanager to push its alerts +to multiple receivers, including email. + +To migrate to this new version you have to remove `.monitoring.prometheus.alertmanager.receivers.pagerduty.enabled` + +### 9.x.x -> 10.0.0 + +This release adds support for [k8up](https://k8up.io) as a backup provider. + +If you are using velero, you need to migrate to using `.backup.provider.velero`. + +### 10.x.x -> 11.0.0 + +This release fixes the grafana tempo deployment compaction / retention. + +For this we switched from the distributed deployment to the single binary deployment. 
+ +Because of this you need to migrate to using `.monitoring.tracing.` instead +of `.monitoring.tracing.ingester.` + {{ .Files.Get "values.md" }} diff --git a/charts/base-cluster/templates/monitoring/alloy-gateway.yaml b/charts/base-cluster/templates/monitoring/alloy-gateway.yaml index 8571eb2af3..2b022c9ced 100644 --- a/charts/base-cluster/templates/monitoring/alloy-gateway.yaml +++ b/charts/base-cluster/templates/monitoring/alloy-gateway.yaml @@ -98,7 +98,7 @@ spec: otelcol.exporter.otlp "tempo" { client { - endpoint = "grafana-tempo-distributor:4317" + endpoint = "grafana-tempo:4317" tls { insecure = true diff --git a/charts/base-cluster/templates/monitoring/tracing/grafana-tempo.yaml b/charts/base-cluster/templates/monitoring/tracing/grafana-tempo.yaml index 9715c16e5c..7264753c12 100644 --- a/charts/base-cluster/templates/monitoring/tracing/grafana-tempo.yaml +++ b/charts/base-cluster/templates/monitoring/tracing/grafana-tempo.yaml @@ -9,7 +9,7 @@ metadata: app.kubernetes.io/part-of: monitoring spec: chart: - spec: {{- include "base-cluster.helm.chartSpec" (dict "repo" "grafana" "chart" "tempo-distributed" "context" $) | nindent 6 }} + spec: {{- include "base-cluster.helm.chartSpec" (dict "repo" "grafana" "chart" "tempo" "context" $) | nindent 6 }} interval: 1h driftDetection: mode: enabled @@ -17,43 +17,43 @@ spec: - name: kube-prometheus-stack namespace: monitoring values: - {{- if .Values.global.imageRegistry }} - global: - image: - registry: {{ $.Values.global.imageRegistry }} - {{- end }} - ingester: {{- include "common.resourcesWithPreset" .Values.monitoring.tracing.ingester | nindent 6 }} - persistentVolumeClaimRetentionPolicy: - enabled: true - whenDeleted: Delete - whenScaled: Retain - replicas: &replicas 1 - config: - replication_factor: *replicas - persistence: - enabled: true - size: {{ .Values.monitoring.tracing.ingester.persistence.size }} tempo: + {{- with .Values.global.imageRegistry }} + registry: {{ . 
}} + {{- end }} + resources: {{- include "common.resources" .Values.monitoring.tracing.resources | nindent 8 }} securityContext: privileged: false seLinuxOptions: {} seccompProfile: type: RuntimeDefault - podSecurityContext: - fsGroupChangePolicy: OnRootMismatch - supplementalGroups: [] - sysctls: [] + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + reportingEnabled: false + + persistentVolumeClaimRetentionPolicy: + enabled: true + whenDeleted: Delete + whenScaled: Retain + replicas: 1 + retention: 48h + persistence: + enabled: true + size: {{ .Values.monitoring.tracing.persistence.size }} + securityContext: + fsGroupChangePolicy: OnRootMismatch + supplementalGroups: [] + sysctls: [] metricsGenerator: enabled: true - traces: - otlp: - grpc: - enabled: true - metaMonitoring: - serviceMonitor: - enabled: true - labels: - monitoring/provisioned-by: base-cluster + remoteWriteUrl: "http://kube-prometheus-stack-prometheus:9090/api/v1/write" + serviceMonitor: + enabled: true + additionalLabels: + monitoring/provisioned-by: base-cluster --- apiVersion: v1 kind: ConfigMap diff --git a/charts/base-cluster/values.schema.json b/charts/base-cluster/values.schema.json index c1d4a30f70..a197ccfdc0 100644 --- a/charts/base-cluster/values.schema.json +++ b/charts/base-cluster/values.schema.json @@ -871,30 +871,24 @@ "enabled": { "type": "boolean" }, - "ingester": { + "resourcesPreset": { + "$ref": "#/$defs/resourcesPreset" + }, + "resources": { + "$ref": "#/$defs/resourceRequirements" + }, + "persistence": { "type": "object", "properties": { - "resourcesPreset": { - "$ref": "#/$defs/resourcesPreset" - }, - "resources": { - "$ref": "#/$defs/resourceRequirements" - }, - "persistence": { - "type": "object", - "properties": { - "size": { - "$ref": "#/$defs/quantity" - } - }, - "additionalProperties": false + "size": { + "$ref": "#/$defs/quantity" } }, "additionalProperties": false } - } - }, - "additionalProperties": false + }, + "additionalProperties": false + } 
}, "additionalProperties": false }, diff --git a/charts/base-cluster/values.yaml b/charts/base-cluster/values.yaml index 784084baa9..99dbfb1de7 100644 --- a/charts/base-cluster/values.yaml +++ b/charts/base-cluster/values.yaml @@ -111,7 +111,7 @@ global: charts: loki: 6.46.0 alloy: 1.5.0 - tempo-distributed: 1.57.0 + tempo: 1.24.1 condition: "{{ and .Values.monitoring.prometheus.enabled (or .Values.monitoring.loki.enabled .Values.monitoring.tracing.enabled) }}" external-dns: url: https://kubernetes-sigs.github.io/external-dns @@ -347,11 +347,10 @@ monitoring: effect: NoSchedule tracing: enabled: false - ingester: - resourcesPreset: small - resources: {} - persistence: - size: 10Gi + resourcesPreset: small + resources: {} + persistence: + size: 10Gi descheduler: enabled: true