diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md index 62c235c27458..36951ee18773 100644 --- a/cluster-autoscaler/FAQ.md +++ b/cluster-autoscaler/FAQ.md @@ -1104,6 +1104,7 @@ The following startup parameters are supported for cluster autoscaler: | `scale-down-unready-enabled` | Should CA scale down unready nodes of the cluster | true | | `scale-down-unready-time` | How long an unready node should be unneeded before it is eligible for scale down | 20m0s | | `scale-down-utilization-threshold` | The maximum value between the sum of cpu requests and sum of memory requests of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down | 0.5 | +| `scale-from-unschedulable` | Specifies that the CA should ignore a node's .spec.unschedulable field in node templates when considering to scale a node group. | false | | `scale-up-from-zero` | Should CA scale up when there are 0 ready nodes. | true | | `scan-interval` | How often cluster is reevaluated for scale up or down | 10s | | `scheduler-config-file` | scheduler-config allows changing configuration of in-tree scheduler plugins acting on PreFilter and Filter extension points | | diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index a144580989cd..d910447eb12e 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -230,6 +230,9 @@ type AutoscalingOptions struct { BalancingLabels []string // AWSUseStaticInstanceList tells if AWS cloud provider use static instance type list or dynamically fetch from remote APIs. AWSUseStaticInstanceList bool + // ScaleFromUnschedulable tells the autoscaler to ignore a node's .spec.unschedulable field when creating a node template. + // Specifically, this will cause the autoscaler to set the node template's .spec.unschedulable field to false. 
+ ScaleFromUnschedulable bool // GCEOptions contain autoscaling options specific to GCE cloud provider. GCEOptions GCEOptions // KubeClientOpts specify options for kube client diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go index 0f7209ebbb1b..71b328b32117 100644 --- a/cluster-autoscaler/config/flags/flags.go +++ b/cluster-autoscaler/config/flags/flags.go @@ -167,6 +167,7 @@ var ( balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar") balancingLabelsFlag = multiStringFlag("balancing-label", "Specifies a label to use for comparing if two node groups are similar, rather than the built in heuristics. Setting this flag disables all other comparison logic, and cannot be combined with --balancing-ignore-label.") awsUseStaticInstanceList = flag.Bool("aws-use-static-instance-list", false, "Should CA fetch instance types in runtime or use a static list. 
AWS only") + scaleFromUnschedulable = flag.Bool("scale-from-unschedulable", false, "Specifies that the CA should ignore a node's .spec.unschedulable field in node templates when considering to scale a node group.") // GCE specific flags concurrentGceRefreshes = flag.Int("gce-concurrent-refreshes", 1, "Maximum number of concurrent refreshes per cloud object type.") @@ -351,6 +352,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { }, NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout, AWSUseStaticInstanceList: *awsUseStaticInstanceList, + ScaleFromUnschedulable: *scaleFromUnschedulable, GCEOptions: config.GCEOptions{ ConcurrentRefreshes: *concurrentGceRefreshes, MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime, diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 86d4ab9826a4..9b089a319c71 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -348,7 +348,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr return typedErr.AddPrefix("failed to initialize RemainingPdbTracker: ") } - nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, readyNodes, daemonsets, a.taintConfig, currentTime) + nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, allNodes, daemonsets, a.taintConfig, currentTime) if autoscalerError != nil { klog.Errorf("Failed to get node infos for groups: %v", autoscalerError) return autoscalerError.AddPrefix("failed to build node infos for node groups: ") diff --git a/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go b/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go index 4b297372e7b1..304a05039e39 100644 --- a/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go +++ 
b/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go @@ -78,6 +78,18 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos result := make(map[string]*framework.NodeInfo) seenGroups := make(map[string]bool) + // Partition nodes into good and bad template candidates. The bad candidates are processed + // at the end of this function as a last resort for a node info template. + goodCandidates := make([]*apiv1.Node, 0) + badCandidates := make([]*apiv1.Node, 0) + for _, node := range nodes { + if isNodeGoodTemplateCandidate(node, now) { + goodCandidates = append(goodCandidates, node) + } else { + badCandidates = append(badCandidates, node) + } + } + // processNode returns information whether the nodeTemplate was generated and if there was an error. processNode := func(node *apiv1.Node) (bool, string, caerror.AutoscalerError) { nodeGroup, err := autoscalingCtx.CloudProvider.NodeGroupForNode(node) @@ -103,11 +115,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos return false, "", nil } - for _, node := range nodes { - // Broken nodes might have some stuff missing. Skipping. - if !isNodeGoodTemplateCandidate(node, now) { - continue - } + for _, node := range goodCandidates { added, id, typedErr := processNode(node) if typedErr != nil { return map[string]*framework.NodeInfo{}, typedErr @@ -156,11 +164,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos } // Last resort - unready/unschedulable nodes. 
- for _, node := range nodes { - // Allowing broken nodes - if isNodeGoodTemplateCandidate(node, now) { - continue - } + for _, node := range badCandidates { added, _, typedErr := processNode(node) if typedErr != nil { return map[string]*framework.NodeInfo{}, typedErr diff --git a/cluster-autoscaler/simulator/node_info_utils.go b/cluster-autoscaler/simulator/node_info_utils.go index ffa8f33be53a..40251c333b9a 100644 --- a/cluster-autoscaler/simulator/node_info_utils.go +++ b/cluster-autoscaler/simulator/node_info_utils.go @@ -115,6 +115,12 @@ func createSanitizedNode(node *apiv1.Node, newName string, taintConfig *taints.T } newNode.Labels[apiv1.LabelHostname] = newName + if taintConfig != nil { + if taintConfig.ShouldScaleFromUnschedulable() { + newNode.Spec.Unschedulable = false + } + } + if taintConfig != nil { newNode.Spec.Taints = taints.SanitizeTaints(newNode.Spec.Taints, *taintConfig) } diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go index acae356089d0..70881bd559af 100644 --- a/cluster-autoscaler/utils/taints/taints.go +++ b/cluster-autoscaler/utils/taints/taints.go @@ -97,6 +97,11 @@ type TaintConfig struct { startupTaintPrefixes []string statusTaintPrefixes []string explicitlyReportedTaints TaintKeySet + // scaleFromUnschedulable tells the CA whether to ignore a node's + // .spec.unschedulable field. It lives in this struct for convenience, + // because it is used in the same kinds of places that check for taints + // to ignore. 
+ scaleFromUnschedulable bool } // NewTaintConfig returns the taint config extracted from options @@ -128,6 +133,7 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig { startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix}, statusTaintPrefixes: []string{StatusTaintPrefix}, explicitlyReportedTaints: explicitlyReportedTaints, + scaleFromUnschedulable: opts.ScaleFromUnschedulable, } } @@ -147,6 +153,11 @@ func (tc TaintConfig) IsStatusTaint(taint string) bool { return matchesAnyPrefix(tc.statusTaintPrefixes, taint) } +// ShouldScaleFromUnschedulable returns whether a node's .spec.unschedulable field should be ignored. +func (tc TaintConfig) ShouldScaleFromUnschedulable() bool { + return tc.scaleFromUnschedulable +} + func (tc TaintConfig) isExplicitlyReportedTaint(taint string) bool { _, ok := tc.explicitlyReportedTaints[taint] return ok