diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cf701b1b1..4ee3e48bcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Note: This CHANGELOG is only for the monitoring team to track all monitoring related changes. Please see OpenShift release notes for official changes. +## 4.21 + +- [#2553](https://github.com/openshift/cluster-monitoring-operator/pull/2553) Expose an `additionalLabelsAllowList` field in CMO's KubeStateMetrics configuration to allow users to specify which additional label metrics from Kubernetes objects should be exposed by KSM. + ## 4.20 - [#2595](https://github.com/openshift/cluster-monitoring-operator/pull/2595) Multi-tenant support for KSM's CRS feature-set downstream. diff --git a/Documentation/api.md b/Documentation/api.md index 4f8a1d2ab1..8f00eb4506 100644 --- a/Documentation/api.md +++ b/Documentation/api.md @@ -174,6 +174,7 @@ The `KubeStateMetricsConfig` resource defines settings for the `kube-state-metri | resources | *[v1.ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#resourcerequirements-v1-core) | Defines resource requests and limits for the KubeStateMetrics container. | | tolerations | [][v1.Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#toleration-v1-core) | Defines tolerations for the pods. | | topologySpreadConstraints | []v1.TopologySpreadConstraint | Defines a pod's topology spread constraints. | +| additionalLabelsAllowList | *string | Defines label-metrics' allow list for resources in addition to the default one. Currently, this is only supported for `jobs` and `cronjobs`, due to cardinality concerns. This follows the format: resource1=[label1,label2,labelN...],...,resourceN=[...], which is then validated and appended to the default labels' allow list. 
| [Back to TOC](#table-of-contents) diff --git a/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc b/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc index 1872516996..987f187cac 100644 --- a/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc +++ b/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc @@ -26,6 +26,8 @@ Appears in: link:clustermonitoringconfiguration.adoc[ClusterMonitoringConfigurat |topologySpreadConstraints|[]v1.TopologySpreadConstraint|Defines a pod's topology spread constraints. +|additionalLabelsAllowList|*string|Defines label-metrics' allow list for resources in addition to the default one. Currently, this is only supported for `jobs` and `cronjobs`, due to cardinality concerns. This follows the format: resource1=[label1,label2,labelN...],...,resourceN=[...], which is then validated and appended to the default labels' allow list. + |=== link:../index.adoc[Back to TOC] diff --git a/pkg/manifests/manifests.go b/pkg/manifests/manifests.go index b28c87a956..78e54ba4f8 100644 --- a/pkg/manifests/manifests.go +++ b/pkg/manifests/manifests.go @@ -48,6 +48,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" auditv1 "k8s.io/apiserver/pkg/apis/audit/v1" + "k8s.io/klog/v2" apiregistrationv1 "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1" "k8s.io/utils/ptr" k8syaml "sigs.k8s.io/yaml" @@ -760,6 +761,34 @@ func (f *Factory) KubeStateMetricsDeployment() (*appsv1.Deployment, error) { if f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.Resources != nil { d.Spec.Template.Spec.Containers[i].Resources = *f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.Resources } + additionalAllowList := f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.AdditionalLabelsAllowList + if additionalAllowList != nil && *additionalAllowList != "" { + err = validateLabelsAllowListFormat(*additionalAllowList) + if err != nil { + return nil, 
// validateLabelsAllowListFormat reports whether value is shaped like a
// kube-state-metrics labels allow list:
//
//	resource1=[label1,label2,labelN...],...,resourceN=[...]
//
// The check is purely positional: for every delimiter ('=', '[', ']', ',')
// it inspects the characters immediately before and after it. Characters
// that are not delimiters are never inspected, so an empty string or a
// value without any delimiter is accepted.
//
// It returns nil when no delimiter violates the rules below, and a
// descriptive error otherwise.
func validateLabelsAllowListFormat(value string) error {
	errLabelsAllowListFormat := errors.New("invalid format, should be: resource1=[label1,label2,labelN...],...,resourceN=[...]")

	// Taken from text/scanner EOF constant.
	const eof = -1

	// Ranging over a string yields byte offsets, which do not line up with
	// rune positions once the value contains multi-byte characters. Convert
	// once and index by rune so previous/next always refer to the true
	// neighbours (and so we do not re-convert on every iteration).
	runes := []rune(value)
	last := len(runes) - 1

	for i, v := range runes {
		// The first character has no predecessor; it is treated as its own
		// predecessor, which makes a leading ',' trip the "previous == v"
		// rule below.
		previous := v
		if i > 0 {
			previous = runes[i-1]
		}
		next := rune(eof)
		if i < last {
			next = runes[i+1]
		}

		switch v {
		case '=':
			// A resource name must precede '=', and '[' must follow it.
			if previous == ',' || next != '[' {
				return errLabelsAllowListFormat
			}
		case '[':
			// A label group may only be opened right after '='.
			if previous != '=' {
				return errLabelsAllowListFormat
			}
		case ']':
			// After a label group only a comma or the end of input is allowed.
			if next != eof && next != ',' {
				return errLabelsAllowListFormat
			}
		case ',':
			// Reject leading, doubled and trailing commas, and a comma
			// directly before the closing bracket.
			if previous == v || next == eof || next == ']' {
				return errLabelsAllowListFormat
			}
		}
	}
	return nil
}