perf: Add flag to disable costly metrics controllers #2354
Changes from 3 commits
Commits: 58fe9b8, 8fccfc3, b3bc8de, 5d4436c
```diff
@@ -85,9 +85,6 @@ func NewControllers(
 		informer.NewNodePoolController(kubeClient, cloudProvider, cluster),
 		informer.NewNodeClaimController(kubeClient, cloudProvider, cluster),
 		termination.NewController(clock, kubeClient, cloudProvider, terminator.NewTerminator(clock, kubeClient, evictionQueue, recorder), recorder),
-		metricspod.NewController(kubeClient, cluster),
-		metricsnodepool.NewController(kubeClient, cloudProvider),
-		metricsnode.NewController(cluster),
 		nodepoolreadiness.NewController(kubeClient, cloudProvider),
 		nodepoolregistrationhealth.NewController(kubeClient, cloudProvider),
 		nodepoolcounter.NewController(kubeClient, cloudProvider, cluster),
```
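The three metrics controllers removed here are not dropped outright; the next hunk re-registers them behind the new `SimplifiedMetrics` option. As a self-contained sketch of that gating pattern (the flag wiring and controller names below are hypothetical illustrations, not the PR's actual options code):

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	// Hypothetical stand-in for the PR's SimplifiedMetrics option, which the
	// real code reads via options.FromContext(ctx).SimplifiedMetrics.
	simplified := flag.Bool("simplified-metrics", false,
		"skip registering the costly metrics and status controllers")
	flag.Parse()

	// Controllers that always run.
	controllers := []string{"informer.nodepool", "informer.nodeclaim", "termination"}

	// Observability controllers are appended only when the flag is off.
	if !*simplified {
		controllers = append(controllers, "metrics.pod", "metrics.nodepool", "metrics.node")
	}
	fmt.Println("registered:", controllers)
}
```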
```diff
@@ -99,25 +96,32 @@ func NewControllers(
 		nodeclaimdisruption.NewController(clock, kubeClient, cloudProvider),
 		nodeclaimhydration.NewController(kubeClient, cloudProvider),
 		nodehydration.NewController(kubeClient, cloudProvider),
-		status.NewController[*v1.NodeClaim](
-			kubeClient,
-			mgr.GetEventRecorderFor("karpenter"),
-			status.EmitDeprecatedMetrics,
-			status.WithHistogramBuckets(prometheus.ExponentialBuckets(0.5, 2, 15)), // 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
-			status.WithLabels(append(lo.Map(cloudProvider.GetSupportedNodeClasses(), func(obj status.Object, _ int) string { return v1.NodeClassLabelKey(object.GVK(obj).GroupKind()) }), v1.NodePoolLabelKey)...),
-		),
-		status.NewController[*v1.NodePool](
-			kubeClient,
-			mgr.GetEventRecorderFor("karpenter"),
-			status.EmitDeprecatedMetrics,
-			status.WithHistogramBuckets(prometheus.ExponentialBuckets(0.5, 2, 15)), // 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
-		),
-		status.NewGenericObjectController[*corev1.Node](
-			kubeClient,
-			mgr.GetEventRecorderFor("karpenter"),
-			status.WithHistogramBuckets(prometheus.ExponentialBuckets(0.5, 2, 15)), // 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
-			status.WithLabels(append(lo.Map(cloudProvider.GetSupportedNodeClasses(), func(obj status.Object, _ int) string { return v1.NodeClassLabelKey(object.GVK(obj).GroupKind()) }), v1.NodePoolLabelKey, v1.NodeInitializedLabelKey)...),
-		),
 	}
 
+	if !options.FromContext(ctx).SimplifiedMetrics {
+		controllers = append(controllers,
+			metricspod.NewController(kubeClient, cluster),
+			metricsnodepool.NewController(kubeClient, cloudProvider),
+			metricsnode.NewController(cluster),
+			status.NewController[*v1.NodeClaim](
+				kubeClient,
+				mgr.GetEventRecorderFor("karpenter"),
+				status.EmitDeprecatedMetrics,
+				status.WithHistogramBuckets(prometheus.ExponentialBuckets(0.5, 2, 15)), // 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
+				status.WithLabels(append(lo.Map(cloudProvider.GetSupportedNodeClasses(), func(obj status.Object, _ int) string { return v1.NodeClassLabelKey(object.GVK(obj).GroupKind()) }), v1.NodePoolLabelKey)...),
+			),
+			status.NewController[*v1.NodePool](
+				kubeClient,
+				mgr.GetEventRecorderFor("karpenter"),
+				status.EmitDeprecatedMetrics,
+				status.WithHistogramBuckets(prometheus.ExponentialBuckets(0.5, 2, 15)), // 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
+			),
+			status.NewGenericObjectController[*corev1.Node](
+				kubeClient,
+				mgr.GetEventRecorderFor("karpenter"),
+				status.WithHistogramBuckets(prometheus.ExponentialBuckets(0.5, 2, 15)), // 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
+				status.WithLabels(append(lo.Map(cloudProvider.GetSupportedNodeClasses(), func(obj status.Object, _ int) string { return v1.NodeClassLabelKey(object.GVK(obj).GroupKind()) }), v1.NodePoolLabelKey, v1.NodeInitializedLabelKey)...)),
+		)
+	}
+
 	// The cloud provider must define status conditions for the node repair controller to use to detect unhealthy nodes
```

Reviewer (on the `status.NewController[*v1.NodeClaim]` block): Are these status controllers only responsible for metrics? Would anything else go missing by disabling them? For example, it looks like they also issue transition events...

Author (Contributor): They do also emit events for finalizers and status condition changes, but I think that's all they emit.
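Two details in the hunk above are worth unpacking: `prometheus.ExponentialBuckets(0.5, 2, 15)` generates the 15 histogram bounds enumerated in the inline comments, and the `status.WithLabels(append(lo.Map(...), ...)...)` line builds the allowed label set by mapping each supported node class to its label key and appending the fixed keys. A minimal sketch of both, where the node class kinds and label key strings are illustrative placeholders rather than Karpenter's real values:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/samber/lo"
)

func main() {
	// ExponentialBuckets(start, factor, count) returns `count` bucket bounds,
	// each `factor` times the previous, starting at `start`.
	fmt.Println(prometheus.ExponentialBuckets(0.5, 2, 15))
	// [0.5 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192]

	// Same shape as the WithLabels construction: map node class kinds to label
	// keys, then append the fixed NodePool label key.
	kinds := []string{"EC2NodeClass", "KWOKNodeClass"} // placeholder kinds
	labels := append(lo.Map(kinds, func(kind string, _ int) string {
		return "example.sh/" + strings.ToLower(kind) // placeholder for v1.NodeClassLabelKey
	}), "karpenter.sh/nodepool") // v1.NodePoolLabelKey
	fmt.Println(labels)
}
```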
Reviewer: I'm wondering if it would make sense to include the NodePool counter controller in this set? I could go either way: since it just propagates information from cluster state it should be relatively cheap, but the cost does scale with the number of NodePools, and it is a "cluster state observability" controller. Did it show up at all in your performance review?

Author: We were using a relatively small number of NodePools (~12) and didn't see a significant performance impact. One thing I want to explore is how the volume of metrics changes as we scale the number of nodes in the cluster.

Reviewer: It seems fine to include it for the time being; we can re-evaluate if we see a significant performance impact.
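On the author's point about metric volume, one way to see which metric families grow with cluster size is to gather a client_golang registry and count series per family. A minimal sketch; the example metric name and labels are invented for illustration:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	// Invented example metric: one time series per nodepool label value.
	pods := promauto.NewGaugeVec(prometheus.GaugeOpts{
		Name: "example_karpenter_pods",
		Help: "illustrative gauge with one series per nodepool",
	}, []string{"nodepool"})
	pods.WithLabelValues("default").Set(10)
	pods.WithLabelValues("gpu").Set(2)

	// Gather the default registry (which also includes the Go runtime and
	// process collectors) and count series per metric family.
	families, err := prometheus.DefaultGatherer.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		fmt.Printf("%s: %d series\n", mf.GetName(), len(mf.GetMetric()))
	}
}
```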