diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index 5f5b3c2a0418..82c106953064 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -354,6 +354,8 @@ type AutoscalingOptions struct {
     CapacitybufferControllerEnabled bool
     // CapacitybufferPodInjectionEnabled tells if CA should injects fake pods for capacity buffers that are ready for provisioning
     CapacitybufferPodInjectionEnabled bool
+    // MaxNodeSkipEvalTimeTrackerEnabled enables/disables tracking of the maximum time a node has been skipped during ScaleDown evaluation.
+    MaxNodeSkipEvalTimeTrackerEnabled bool
 }
 
 // KubeClientOptions specify options for kube client
diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go
index 40a141b2f5fe..8445239a548f 100644
--- a/cluster-autoscaler/config/flags/flags.go
+++ b/cluster-autoscaler/config/flags/flags.go
@@ -232,6 +232,7 @@ var (
     nodeDeletionCandidateTTL = flag.Duration("node-deletion-candidate-ttl", time.Duration(0), "Maximum time a node can be marked as removable before the marking becomes stale. This sets the TTL of Cluster-Autoscaler's state if the Cluste-Autoscaler deployment becomes inactive")
     capacitybufferControllerEnabled = flag.Bool("capacity-buffer-controller-enabled", false, "Whether to enable the default controller for capacity buffers or not")
     capacitybufferPodInjectionEnabled = flag.Bool("capacity-buffer-pod-injection-enabled", false, "Whether to enable pod list processor that processes ready capacity buffers and injects fake pods accordingly")
+    maxNodeSkipEvalTimeTrackerEnabled = flag.Bool("max-node-skip-eval-time-tracker-enabled", false, "Whether to enable tracking of the maximum time a node has been skipped during ScaleDown")
 
     // Deprecated flags
     ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group (Deprecated, use startup-taints instead)")
@@ -422,6 +423,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
     NodeDeletionCandidateTTL: *nodeDeletionCandidateTTL,
     CapacitybufferControllerEnabled: *capacitybufferControllerEnabled,
     CapacitybufferPodInjectionEnabled: *capacitybufferPodInjectionEnabled,
+    MaxNodeSkipEvalTimeTrackerEnabled: *maxNodeSkipEvalTimeTrackerEnabled,
     }
 }
diff --git a/cluster-autoscaler/core/scaledown/nodeevaltracker/max_node_skip_eval_time.go b/cluster-autoscaler/core/scaledown/nodeevaltracker/max_node_skip_eval_time.go
new file mode 100644
index 000000000000..c38c052f7acc
--- /dev/null
+++ b/cluster-autoscaler/core/scaledown/nodeevaltracker/max_node_skip_eval_time.go
@@ -0,0 +1,70 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodeevaltracker
+
+import (
+    "time"
+
+    "k8s.io/autoscaler/cluster-autoscaler/metrics"
+)
+
+// MaxNodeSkipEvalTime tracks the longest time a node has gone unevaluated (i.e. been skipped) during ScaleDown.
+type MaxNodeSkipEvalTime struct {
+    // lastEvalTime is the time of the previous pass over currentlyUnneededNodeNames.
+    lastEvalTime time.Time
+    // nodeNamesWithTimeStamps maps node names to the time of their last successful evaluation.
+    nodeNamesWithTimeStamps map[string]time.Time
+}
+
+// NewMaxNodeSkipEvalTime returns a MaxNodeSkipEvalTime with lastEvalTime set to currentTime.
+func NewMaxNodeSkipEvalTime(currentTime time.Time) *MaxNodeSkipEvalTime {
+    return &MaxNodeSkipEvalTime{lastEvalTime: currentTime}
+}
+
+// get returns the time of the last evaluation of a node.
+func (l *MaxNodeSkipEvalTime) get(nodeName string) time.Time {
+    if timestamp, ok := l.nodeNamesWithTimeStamps[nodeName]; ok {
+        return timestamp
+    }
+    return l.lastEvalTime
+}
+
+// getMin returns the minimum timestamp in nodeNamesWithTimeStamps, or the time of the last evaluation if the map is empty.
+func (l *MaxNodeSkipEvalTime) getMin() time.Time {
+    minimumTime := l.lastEvalTime
+    for _, val := range l.nodeNamesWithTimeStamps {
+        if minimumTime.After(val) {
+            minimumTime = val
+        }
+    }
+    return minimumTime
+}
+
+// Update replaces the tracked set with nodeNames, keeping the earliest known timestamp for nodes
+// that were already tracked, and returns the longest time any of them has gone without evaluation.
+func (l *MaxNodeSkipEvalTime) Update(nodeNames []string, currentTime time.Time) time.Duration {
+    newNodes := make(map[string]time.Time)
+    for _, nodeName := range nodeNames {
+        newNodes[nodeName] = l.get(nodeName)
+    }
+    l.nodeNamesWithTimeStamps = newNodes
+    l.lastEvalTime = currentTime
+    minimumTime := l.getMin()
+    longestDuration := currentTime.Sub(minimumTime)
+    metrics.ObserveMaxNodeSkipEvalDurationSeconds(longestDuration)
+    return longestDuration
+}
diff --git a/cluster-autoscaler/core/scaledown/nodeevaltracker/max_node_skip_eval_time_test.go b/cluster-autoscaler/core/scaledown/nodeevaltracker/max_node_skip_eval_time_test.go
new file mode 100644
index 000000000000..02ea32d96aad
--- /dev/null
+++ b/cluster-autoscaler/core/scaledown/nodeevaltracker/max_node_skip_eval_time_test.go
@@ -0,0 +1,77 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodeevaltracker
+
+import (
+    "testing"
+    "time"
+
+    "github.com/stretchr/testify/assert"
+)
+
+func TestMaxNodeSkipEvalTime(t *testing.T) {
+    type testCase struct {
+        name string
+        unprocessedNodes [][]string
+        wantMaxSkipEvalTimeSeconds []int
+    }
+    start := time.Now()
+    testCases := []testCase{
+        {
+            name: "Only one node is skipped in one iteration",
+            unprocessedNodes: [][]string{{}, {"n1"}, {}, {}},
+            wantMaxSkipEvalTimeSeconds: []int{0, 1, 0, 0},
+        },
+        {
+            name: "No nodes are skipped in the first iteration",
+            unprocessedNodes: [][]string{{}, {"n1", "n2"}, {"n2", "n3"}, {}},
+            wantMaxSkipEvalTimeSeconds: []int{0, 1, 2, 0},
+        },
+        {
+            name: "Some nodes are skipped in the first iteration",
+            unprocessedNodes: [][]string{{"n1", "n2"}, {"n1", "n2"}, {"n2", "n3"}, {}},
+            wantMaxSkipEvalTimeSeconds: []int{1, 2, 3, 0},
+        },
+        {
+            name: "Overlapping node sets are skipped in different iterations",
+            unprocessedNodes: [][]string{{}, {"n1", "n2"}, {"n1"}, {"n2"}, {}},
+            wantMaxSkipEvalTimeSeconds: []int{0, 1, 2, 1, 0},
+        },
+        {
+            name: "Disjoint node sets are skipped in each iteration",
+            unprocessedNodes: [][]string{{"n1"}, {"n2"}, {"n3"}, {"n4"}, {}},
+            wantMaxSkipEvalTimeSeconds: []int{1, 1, 1, 1, 0},
+        },
+        {
+            name: "No nodes are skipped in any iteration",
+            unprocessedNodes: [][]string{{}, {}, {}},
+            wantMaxSkipEvalTimeSeconds: []int{0, 0, 0},
+        },
+    }
+    for _, tc := range testCases {
+        t.Run(tc.name, func(t *testing.T) {
+            t.Parallel()
+            timestamp := start
+            maxNodeSkipEvalTime := NewMaxNodeSkipEvalTime(start)
+            for i := 0; i < len(tc.unprocessedNodes); i++ {
+                timestamp = timestamp.Add(1 * time.Second)
+                assert.Equal(t, time.Duration(tc.wantMaxSkipEvalTimeSeconds[i])*time.Second, maxNodeSkipEvalTime.Update(tc.unprocessedNodes[i], timestamp))
+                assert.Equal(t, len(tc.unprocessedNodes[i]), len(maxNodeSkipEvalTime.nodeNamesWithTimeStamps))
+            }
+        })
+    }
+}
diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index 6767d9ddfb4b..6897337014df 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -26,6 +26,7 @@ import (
     ca_context "k8s.io/autoscaler/cluster-autoscaler/context"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
+    "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/nodeevaltracker"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unneeded"
@@ -76,6 +77,7 @@ type Planner struct {
     cc controllerReplicasCalculator
     scaleDownSetProcessor nodes.ScaleDownSetProcessor
     scaleDownContext *nodes.ScaleDownContext
+    maxNodeSkipEvalTime *nodeevaltracker.MaxNodeSkipEvalTime
 }
 
 // New creates a new Planner object.
@@ -91,6 +93,11 @@ func New(autoscalingCtx *ca_context.AutoscalingContext, processors *processors.A
     unneededNodes.LoadFromExistingTaints(autoscalingCtx.ListerRegistry, time.Now(), autoscalingCtx.AutoscalingOptions.NodeDeletionCandidateTTL)
     }
 
+    var maxNodeSkipEvalTime *nodeevaltracker.MaxNodeSkipEvalTime
+    if autoscalingCtx.AutoscalingOptions.MaxNodeSkipEvalTimeTrackerEnabled {
+        maxNodeSkipEvalTime = nodeevaltracker.NewMaxNodeSkipEvalTime(time.Now())
+    }
+
     return &Planner{
         autoscalingCtx: autoscalingCtx,
         unremovableNodes: unremovable.NewNodes(),
@@ -104,6 +111,7 @@ func New(autoscalingCtx *ca_context.AutoscalingContext, processors *processors.A
         scaleDownSetProcessor: processors.ScaleDownSetProcessor,
         scaleDownContext: nodes.NewDefaultScaleDownContext(),
         minUpdateInterval: minUpdateInterval,
+        maxNodeSkipEvalTime: maxNodeSkipEvalTime,
     }
 }
 
@@ -277,13 +285,16 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
     }
     p.nodeUtilizationMap = utilizationMap
     timer := time.NewTimer(p.autoscalingCtx.ScaleDownSimulationTimeout)
+    var skippedNodes []string
     for i, node := range currentlyUnneededNodeNames {
         if timedOut(timer) {
+            skippedNodes = currentlyUnneededNodeNames[i:]
             klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
             break
         }
         if len(removableList)-atomicScaleDownNodesCount >= p.unneededNodesLimit() {
+            skippedNodes = currentlyUnneededNodeNames[i:]
             klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more. Total atomic scale down nodes: %d", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList), atomicScaleDownNodesCount)
             break
         }
@@ -306,6 +317,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
             p.unremovableNodes.AddTimeout(unremovable, unremovableTimeout)
         }
     }
+    p.handleUnprocessedNodes(skippedNodes)
     p.unneededNodes.Update(removableList, p.latestUpdate)
     if unremovableCount > 0 {
         klog.V(1).Infof("%v nodes found to be unremovable in simulation, will re-check them at %v", unremovableCount, unremovableTimeout)
@@ -372,6 +384,15 @@ func (p *Planner) unneededNodesLimit() int {
     return limit
 }
 
+// handleUnprocessedNodes tracks the longest time a node has been skipped during ScaleDown.
+func (p *Planner) handleUnprocessedNodes(unprocessedNodeNames []string) {
+    // A nil p.maxNodeSkipEvalTime means the tracker is disabled by flag, so do nothing.
+    if p.maxNodeSkipEvalTime == nil {
+        return
+    }
+    p.maxNodeSkipEvalTime.Update(unprocessedNodeNames, time.Now())
+}
+
 // getKnownOwnerRef returns ownerRef that is known by CA and CA knows the logic of how this controller recreates pods.
 func getKnownOwnerRef(ownerRefs []metav1.OwnerReference) *metav1.OwnerReference {
     for _, ownerRef := range ownerRefs {
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 6739ebb4d2b7..7d96e0447ba8 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -425,6 +425,14 @@ var (
             Buckets: k8smetrics.ExponentialBuckets(1, 2, 6), // 1, 2, 4, ..., 32
         }, []string{"instance_type", "cpu_count", "namespace_count"},
     )
+
+    maxNodeSkipEvalDurationSeconds = k8smetrics.NewGauge(
+        &k8smetrics.GaugeOpts{
+            Namespace: caNamespace,
+            Name: "max_node_skip_eval_duration_seconds",
+            Help: "Maximum time a node has been skipped during ScaleDown evaluation.",
+        },
+    )
 )
 
 // RegisterAll registers all metrics.
@@ -461,6 +469,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
     legacyregistry.MustRegister(nodeTaintsCount)
     legacyregistry.MustRegister(inconsistentInstancesMigsCount)
     legacyregistry.MustRegister(binpackingHeterogeneity)
+    legacyregistry.MustRegister(maxNodeSkipEvalDurationSeconds)
 
     if emitPerNodeGroupMetrics {
         legacyregistry.MustRegister(nodesGroupMinNodes)
@@ -748,3 +757,9 @@ func UpdateInconsistentInstancesMigsCount(migCount int) {
 func ObserveBinpackingHeterogeneity(instanceType, cpuCount, namespaceCount string, pegCount int) {
     binpackingHeterogeneity.WithLabelValues(instanceType, cpuCount, namespaceCount).Observe(float64(pegCount))
 }
+
+// ObserveMaxNodeSkipEvalDurationSeconds records the longest time for which a node has been skipped during ScaleDown.
+// If a node is skipped in several consecutive iterations, the duration is measured from its earliest skip.
+func ObserveMaxNodeSkipEvalDurationSeconds(duration time.Duration) {
+    maxNodeSkipEvalDurationSeconds.Set(duration.Seconds())
+}
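Usage note (not part of the patch): a minimal, illustrative sketch of how the new tracker behaves across scale-down iterations. The standalone main package, node names, and timestamps are hypothetical; only the nodeevaltracker API introduced above is assumed.

package main

import (
    "fmt"
    "time"

    "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/nodeevaltracker"
)

func main() {
    start := time.Now()
    tracker := nodeevaltracker.NewMaxNodeSkipEvalTime(start)

    // Iteration 1: n1 and n2 are skipped; the duration is measured from the
    // previous evaluation pass (here, construction time), so 10s is reported.
    fmt.Println(tracker.Update([]string{"n1", "n2"}, start.Add(10*time.Second)))

    // Iteration 2: n1 is skipped again, so its original timestamp is kept and
    // the reported maximum grows to 20s; n2 was evaluated and drops out.
    fmt.Println(tracker.Update([]string{"n1"}, start.Add(20*time.Second)))

    // Iteration 3: nothing is skipped, so the reported duration resets to 0s.
    fmt.Println(tracker.Update(nil, start.Add(30*time.Second)))
}

Each Update call also sets the new gauge via metrics.ObserveMaxNodeSkipEvalDurationSeconds, so the metric mirrors the returned duration from the most recent iteration.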