
Commit 35f6b05

Modify scale down set processor to add reasons to unremovable nodes
1 parent 04b1402 commit 35f6b05

16 files changed: +626 −136 lines

cluster-autoscaler/core/scaledown/planner/planner.go

Lines changed: 25 additions & 20 deletions
@@ -73,10 +73,10 @@ type Planner struct {
 	minUpdateInterval     time.Duration
 	eligibilityChecker    eligibilityChecker
 	nodeUtilizationMap    map[string]utilization.Info
-	actuationStatus       scaledown.ActuationStatus
 	resourceLimitsFinder  *resource.LimitsFinder
 	cc                    controllerReplicasCalculator
 	scaleDownSetProcessor nodes.ScaleDownSetProcessor
+	scaleDownContext      *nodes.ScaleDownContext
 }

 // New creates a new Planner object.
@@ -97,7 +97,11 @@ func New(context *context.AutoscalingContext, processors *processors.Autoscaling
 		resourceLimitsFinder:  resourceLimitsFinder,
 		cc:                    newControllerReplicasCalculator(context.ListerRegistry),
 		scaleDownSetProcessor: processors.ScaleDownSetProcessor,
-		minUpdateInterval:     minUpdateInterval,
+		// No need to limit the number of nodes, since it will happen later, in the actuation stage.
+		// It will make a more appropriate decision by using additional information about deletions
+		// in progress.
+		scaleDownContext:  nodes.NewDefaultScaleDownSetContext(math.MaxInt),
+		minUpdateInterval: minUpdateInterval,
 	}
 }

@@ -110,7 +114,7 @@ func (p *Planner) UpdateClusterState(podDestinations, scaleDownCandidates []*api
 		p.minUpdateInterval = updateInterval
 	}
 	p.latestUpdate = currentTime
-	p.actuationStatus = as
+	p.scaleDownContext.ActuationStatus = as
 	// Avoid persisting changes done by the simulation.
 	p.context.ClusterSnapshot.Fork()
 	defer p.context.ClusterSnapshot.Revert()
@@ -147,22 +151,17 @@ func (p *Planner) NodesToDelete(_ time.Time) (empty, needDrain []*apiv1.Node) {
 		klog.Errorf("Nothing will scale down, failed to create resource limiter: %v", err)
 		return nil, nil
 	}
-	limitsLeft := p.resourceLimitsFinder.LimitsLeft(p.context, nodes, resourceLimiter, p.latestUpdate)
-	emptyRemovable, needDrainRemovable, unremovable := p.unneededNodes.RemovableAt(p.context, p.latestUpdate, limitsLeft, resourceLimiter.GetResources(), p.actuationStatus)
-	for _, u := range unremovable {
-		p.unremovableNodes.Add(u)
-	}
-	needDrainRemovable = sortByRisk(needDrainRemovable)
-	nodesToRemove := p.scaleDownSetProcessor.GetNodesToRemove(
-		p.context,
-		// We need to pass empty nodes first, as there might be some non-empty scale
-		// downs already in progress. If we pass the empty nodes first, they will be first
-		// to get deleted, thus we decrease chances of hitting the limit on non-empty scale down.
-		append(emptyRemovable, needDrainRemovable...),
-		// No need to limit the number of nodes, since it will happen later, in the actuation stage.
-		// It will make a more appropriate decision by using additional information about deletions
-		// in progress.
-		math.MaxInt)
+	p.scaleDownContext.ResourcesLeft = p.resourceLimitsFinder.LimitsLeft(p.context, nodes, resourceLimiter, p.latestUpdate).DeepCopy()
+	p.scaleDownContext.ResourcesWithLimits = resourceLimiter.GetResources()
+	emptyRemovableNodes, needDrainRemovableNodes, unremovableNodes := p.unneededNodes.RemovableAt(p.context, *p.scaleDownContext, p.latestUpdate)
+	p.addUnremovabaleNodes(unremovableNodes)
+
+	needDrainRemovableNodes = sortByRisk(needDrainRemovableNodes)
+	candidatesToBeRemoved := append(emptyRemovableNodes, needDrainRemovableNodes...)
+
+	nodesToRemove, unremovableNodes := p.scaleDownSetProcessor.FilterUnremovableNodes(p.context, *p.scaleDownContext, candidatesToBeRemoved)
+	p.addUnremovabaleNodes(unremovableNodes)
+
 	for _, nodeToRemove := range nodesToRemove {
 		if len(nodeToRemove.PodsToReschedule) > 0 {
 			needDrain = append(needDrain, nodeToRemove.Node)
@@ -174,6 +173,12 @@ func (p *Planner) NodesToDelete(_ time.Time) (empty, needDrain []*apiv1.Node) {
 	return empty, needDrain
 }

+func (p *Planner) addUnremovabaleNodes(unremovableNodes []simulator.UnremovableNode) {
+	for _, u := range unremovableNodes {
+		p.unremovableNodes.Add(&u)
+	}
+}
+
 func allNodes(s clustersnapshot.ClusterSnapshot) ([]*apiv1.Node, error) {
 	nodeInfos, err := s.NodeInfos().List()
 	if err != nil {
@@ -212,7 +217,7 @@ func (p *Planner) NodeUtilizationMap() map[string]utilization.Info {
 // For pods that are controlled by controller known by CA, it will check whether
 // they have been recreated and will inject only not yet recreated pods.
 func (p *Planner) injectRecentlyEvictedPods() error {
-	recentlyEvictedRecreatablePods := pod_util.FilterRecreatablePods(p.actuationStatus.RecentEvictions())
+	recentlyEvictedRecreatablePods := pod_util.FilterRecreatablePods(p.scaleDownContext.ActuationStatus.RecentEvictions())
 	return p.injectPods(filterOutRecreatedPods(recentlyEvictedRecreatablePods, p.cc))
 }

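The nodes.ScaleDownContext type and the NewDefaultScaleDownSetContext constructor used above are defined in one of the other changed files that is not part of this excerpt. Judging only from the call sites in this diff (the ActuationStatus, ResourcesLeft, ResourcesWithLimits and MaxNodeCountToBeRemoved fields, and the math.MaxInt constructor argument), a plausible sketch of its shape is the following; the actual definition may differ in detail:

// Hypothetical sketch only - inferred from the call sites in this diff,
// not copied from the commit.
package nodes

import (
	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
)

// ScaleDownContext bundles the state that was previously threaded through
// RemovableAt and GetNodesToRemove as separate arguments.
type ScaleDownContext struct {
	ActuationStatus         scaledown.ActuationStatus
	ResourcesLeft           resource.Limits
	ResourcesWithLimits     []string
	MaxNodeCountToBeRemoved int
}

// NewDefaultScaleDownSetContext returns a context that only caps the number of
// nodes to remove; the planner fills in the remaining fields on each run.
func NewDefaultScaleDownSetContext(maxNodeCount int) *ScaleDownContext {
	return &ScaleDownContext{MaxNodeCountToBeRemoved: maxNodeCount}
}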
cluster-autoscaler/core/scaledown/planner/planner_test.go

Lines changed: 90 additions & 16 deletions
@@ -18,6 +18,7 @@ package planner

 import (
 	"fmt"
+	"math"
 	"testing"
 	"time"

@@ -35,6 +36,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unremovable"
 	. "k8s.io/autoscaler/cluster-autoscaler/core/test"
+	processorstest "k8s.io/autoscaler/cluster-autoscaler/processors/test"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/options"
@@ -498,7 +500,7 @@ func TestUpdateClusterState(t *testing.T) {
 			assert.NoError(t, err)
 			clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, tc.nodes, tc.pods)
 			deleteOptions := options.NodeDeleteOptions{}
-			p := New(&context, NewTestProcessors(&context), deleteOptions, nil)
+			p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil)
 			p.eligibilityChecker = &fakeEligibilityChecker{eligible: asMap(tc.eligible)}
 			if tc.isSimulationTimeout {
 				context.AutoscalingOptions.ScaleDownSimulationTimeout = 1 * time.Second
@@ -694,7 +696,7 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
 			assert.NoError(t, err)
 			clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, nodes, nil)
 			deleteOptions := options.NodeDeleteOptions{}
-			p := New(&context, NewTestProcessors(&context), deleteOptions, nil)
+			p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil)
 			p.eligibilityChecker = &fakeEligibilityChecker{eligible: asMap(nodeNames(nodes))}
 			p.minUpdateInterval = tc.updateInterval
 			p.unneededNodes.Update(previouslyUnneeded, time.Now())
@@ -706,16 +708,18 @@

 func TestNodesToDelete(t *testing.T) {
 	testCases := []struct {
-		name      string
-		nodes     map[cloudprovider.NodeGroup][]simulator.NodeToBeRemoved
-		wantEmpty []*apiv1.Node
-		wantDrain []*apiv1.Node
+		name                    string
+		nodes                   map[cloudprovider.NodeGroup][]simulator.NodeToBeRemoved
+		wantEmpty               []*apiv1.Node
+		wantDrain               []*apiv1.Node
+		maxNodeCountToBeRemoved int
 	}{
 		{
-			name:      "empty",
-			nodes:     map[cloudprovider.NodeGroup][]simulator.NodeToBeRemoved{},
-			wantEmpty: []*apiv1.Node{},
-			wantDrain: []*apiv1.Node{},
+			name:                    "empty",
+			nodes:                   map[cloudprovider.NodeGroup][]simulator.NodeToBeRemoved{},
+			wantEmpty:               []*apiv1.Node{},
+			wantDrain:               []*apiv1.Node{},
+			maxNodeCountToBeRemoved: math.MaxInt,
 		},
 		{
 			name: "single empty",
@@ -727,7 +731,26 @@ func TestNodesToDelete(t *testing.T) {
 			wantEmpty: []*apiv1.Node{
 				buildRemovableNode("test-node", 0).Node,
 			},
-			wantDrain: []*apiv1.Node{},
+			wantDrain:               []*apiv1.Node{},
+			maxNodeCountToBeRemoved: math.MaxInt,
+		},
+		{
+			name: "multiple empty with limit",
+			nodes: map[cloudprovider.NodeGroup][]simulator.NodeToBeRemoved{
+				sizedNodeGroup("test-ng", 3, false): {
+					buildRemovableNode("node-1", 0),
+					buildRemovableNode("node-2", 0),
+					buildRemovableNode("node-3", 0),
+					buildRemovableNode("node-4", 1),
+				},
+			},
+			wantEmpty: []*apiv1.Node{
+				buildRemovableNode("node-1", 0).Node,
+				buildRemovableNode("node-2", 0).Node,
+				buildRemovableNode("node-3", 0).Node,
+			},
+			wantDrain:               []*apiv1.Node{},
+			maxNodeCountToBeRemoved: 3,
 		},
 		{
 			name: "single drain",
@@ -740,6 +763,7 @@ func TestNodesToDelete(t *testing.T) {
 			wantDrain: []*apiv1.Node{
 				buildRemovableNode("test-node", 1).Node,
 			},
+			maxNodeCountToBeRemoved: math.MaxInt,
 		},
 		{
 			name: "single empty atomic",
@@ -748,8 +772,9 @@ func TestNodesToDelete(t *testing.T) {
 					buildRemovableNode("node-1", 0),
 				},
 			},
-			wantEmpty: []*apiv1.Node{},
-			wantDrain: []*apiv1.Node{},
+			wantEmpty:               []*apiv1.Node{},
+			wantDrain:               []*apiv1.Node{},
+			maxNodeCountToBeRemoved: math.MaxInt,
 		},
 		{
 			name: "all empty atomic",
@@ -765,7 +790,8 @@ func TestNodesToDelete(t *testing.T) {
 				buildRemovableNode("node-2", 0).Node,
 				buildRemovableNode("node-3", 0).Node,
 			},
-			wantDrain: []*apiv1.Node{},
+			wantDrain:               []*apiv1.Node{},
+			maxNodeCountToBeRemoved: math.MaxInt,
 		},
 		{
 			name: "some drain atomic",
@@ -783,6 +809,7 @@ func TestNodesToDelete(t *testing.T) {
 			wantDrain: []*apiv1.Node{
 				buildRemovableNode("node-3", 1).Node,
 			},
+			maxNodeCountToBeRemoved: math.MaxInt,
 		},
 		{
 			name: "different groups",
@@ -836,6 +863,52 @@ func TestNodesToDelete(t *testing.T) {
 				buildRemovableNode("node-14", 0).Node,
 				buildRemovableNode("node-15", 0).Node,
 			},
+			maxNodeCountToBeRemoved: math.MaxInt,
+		},
+		{
+			name: "different groups with max count equal to all empty",
+			nodes: map[cloudprovider.NodeGroup][]simulator.NodeToBeRemoved{
+				sizedNodeGroup("standard-empty-ng", 3, false): {
+					buildRemovableNode("node-1", 0),
+					buildRemovableNode("node-2", 0),
+					buildRemovableNode("node-3", 0),
+				},
+				sizedNodeGroup("standard-drain-ng", 3, false): {
+					buildRemovableNode("node-4", 1),
+					buildRemovableNode("node-5", 2),
+					buildRemovableNode("node-6", 3),
+				},
+				sizedNodeGroup("standard-mixed-ng", 3, false): {
+					buildRemovableNode("node-7", 0),
+					buildRemovableNode("node-8", 1),
+					buildRemovableNode("node-9", 2),
+				},
+				sizedNodeGroup("atomic-empty-ng", 3, true): {
+					buildRemovableNode("node-10", 0),
+					buildRemovableNode("node-11", 0),
+					buildRemovableNode("node-12", 0),
+				},
+				sizedNodeGroup("atomic-mixed-ng", 3, true): {
+					buildRemovableNode("node-13", 0),
+					buildRemovableNode("node-14", 1),
+					buildRemovableNode("node-15", 2),
+				},
+				sizedNodeGroup("atomic-partial-ng", 3, true): {
+					buildRemovableNode("node-16", 0),
+					buildRemovableNode("node-17", 1),
+				},
+			},
+			wantEmpty: []*apiv1.Node{
+				buildRemovableNode("node-1", 0).Node,
+				buildRemovableNode("node-2", 0).Node,
+				buildRemovableNode("node-3", 0).Node,
+				buildRemovableNode("node-7", 0).Node,
+				buildRemovableNode("node-10", 0).Node,
+				buildRemovableNode("node-11", 0).Node,
+				buildRemovableNode("node-12", 0).Node,
+			},
+			wantDrain:               []*apiv1.Node{},
+			maxNodeCountToBeRemoved: 9,
 		},
 	}
 	for _, tc := range testCases {
@@ -862,9 +935,10 @@ func TestNodesToDelete(t *testing.T) {
 			assert.NoError(t, err)
 			clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, allNodes, nil)
 			deleteOptions := options.NodeDeleteOptions{}
-			p := New(&context, NewTestProcessors(&context), deleteOptions, nil)
+			p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil)
 			p.latestUpdate = time.Now()
-			p.actuationStatus = deletiontracker.NewNodeDeletionTracker(0 * time.Second)
+			p.scaleDownContext.ActuationStatus = deletiontracker.NewNodeDeletionTracker(0 * time.Second)
+			p.scaleDownContext.MaxNodeCountToBeRemoved = tc.maxNodeCountToBeRemoved
 			p.unneededNodes.Update(allRemovables, time.Now().Add(-1*time.Hour))
 			p.eligibilityChecker = &fakeEligibilityChecker{eligible: asMap(nodeNames(allNodes))}
 			empty, drain := p.NodesToDelete(time.Now())

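The reworked ScaleDownSetProcessor interface is also defined in one of the changed files outside this excerpt. Based on the planner call site (FilterUnremovableNodes replacing GetNodesToRemove) and on the new "multiple empty with limit" and "max count" test cases above, which expect the candidate list to be capped at maxNodeCountToBeRemoved, the method presumably looks roughly like the sketch below; the exact parameter names and any additional interface methods are assumptions:

// Hypothetical sketch only - inferred from how the planner calls the
// processor in this commit; the real interface may carry more methods.
package nodes

import (
	"k8s.io/autoscaler/cluster-autoscaler/context"
	"k8s.io/autoscaler/cluster-autoscaler/simulator"
)

// ScaleDownSetProcessor now filters candidates instead of merely truncating
// them, so every rejected candidate comes back annotated with a reason.
type ScaleDownSetProcessor interface {
	// FilterUnremovableNodes splits candidates into nodes that may be removed
	// (honouring scaleDownCtx.MaxNodeCountToBeRemoved) and nodes that must
	// stay, each tagged with an unremovable reason.
	FilterUnremovableNodes(ctx *context.AutoscalingContext, scaleDownCtx ScaleDownContext, candidates []simulator.NodeToBeRemoved) ([]simulator.NodeToBeRemoved, []simulator.UnremovableNode)
}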
cluster-autoscaler/core/scaledown/unneeded/nodes.go

Lines changed: 10 additions & 10 deletions
@@ -26,6 +26,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
+	"k8s.io/autoscaler/cluster-autoscaler/processors/nodes"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	"k8s.io/autoscaler/cluster-autoscaler/utils"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
@@ -117,31 +118,30 @@ func (n *Nodes) Drop(node string) {
 // RemovableAt returns all nodes that can be removed at a given time, divided
 // into empty and non-empty node lists, as well as a list of nodes that were
 // unneeded, but are not removable, annotated by reason.
-func (n *Nodes) RemovableAt(context *context.AutoscalingContext, ts time.Time, resourcesLeft resource.Limits, resourcesWithLimits []string, as scaledown.ActuationStatus) (empty, needDrain []simulator.NodeToBeRemoved, unremovable []*simulator.UnremovableNode) {
+func (n *Nodes) RemovableAt(context *context.AutoscalingContext, scaleDownContext nodes.ScaleDownContext, ts time.Time) (empty, needDrain []simulator.NodeToBeRemoved, unremovable []simulator.UnremovableNode) {
 	nodeGroupSize := utils.GetNodeGroupSizeMap(context.CloudProvider)
-	resourcesLeftCopy := resourcesLeft.DeepCopy()
 	emptyNodes, drainNodes := n.splitEmptyAndNonEmptyNodes()

 	for nodeName, v := range emptyNodes {
 		klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
-		if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeftCopy, resourcesWithLimits, as); r != simulator.NoReason {
-			unremovable = append(unremovable, &simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
+		if r := n.unremovableReason(context, scaleDownContext, v, ts, nodeGroupSize); r != simulator.NoReason {
+			unremovable = append(unremovable, simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
 			continue
 		}
 		empty = append(empty, v.ntbr)
 	}
 	for nodeName, v := range drainNodes {
 		klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
-		if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeftCopy, resourcesWithLimits, as); r != simulator.NoReason {
-			unremovable = append(unremovable, &simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
+		if r := n.unremovableReason(context, scaleDownContext, v, ts, nodeGroupSize); r != simulator.NoReason {
+			unremovable = append(unremovable, simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
 			continue
 		}
 		needDrain = append(needDrain, v.ntbr)
 	}
 	return
 }

-func (n *Nodes) unremovableReason(context *context.AutoscalingContext, v *node, ts time.Time, nodeGroupSize map[string]int, resourcesLeft resource.Limits, resourcesWithLimits []string, as scaledown.ActuationStatus) simulator.UnremovableReason {
+func (n *Nodes) unremovableReason(context *context.AutoscalingContext, scaleDownContext nodes.ScaleDownContext, v *node, ts time.Time, nodeGroupSize map[string]int) simulator.UnremovableReason {
 	node := v.ntbr.Node
 	// Check if node is marked with no scale down annotation.
 	if eligibility.HasNoScaleDownAnnotation(node) {
@@ -182,17 +182,17 @@ func (n *Nodes) unremovableReason(context *context.AutoscalingContext, v *node,
 		}
 	}

-	if reason := verifyMinSize(node.Name, nodeGroup, nodeGroupSize, as); reason != simulator.NoReason {
+	if reason := verifyMinSize(node.Name, nodeGroup, nodeGroupSize, scaleDownContext.ActuationStatus); reason != simulator.NoReason {
 		return reason
 	}

-	resourceDelta, err := n.limitsFinder.DeltaForNode(context, node, nodeGroup, resourcesWithLimits)
+	resourceDelta, err := n.limitsFinder.DeltaForNode(context, node, nodeGroup, scaleDownContext.ResourcesWithLimits)
 	if err != nil {
 		klog.Errorf("Error getting node resources: %v", err)
 		return simulator.UnexpectedError
 	}

-	checkResult := resourcesLeft.TryDecrementBy(resourceDelta)
+	checkResult := scaleDownContext.ResourcesLeft.TryDecrementBy(resourceDelta)
 	if checkResult.Exceeded() {
 		klog.V(4).Infof("Skipping %s - minimal limit exceeded for %v", node.Name, checkResult.ExceededResources)
 		for _, resource := range checkResult.ExceededResources {

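For reference, the simulator types that carry the reasons predate this commit and are not part of the diff; the trimmed-down sketch below only restates what is visible from the code above (the Node and Reason fields, the NoReason and UnexpectedError values, and the switch from pointer to value slices) and is not the full definition:

// Hypothetical, trimmed-down sketch of the pre-existing simulator types as
// they are used in this commit; the real declarations have more reason
// constants and possibly more fields.
package simulator

import apiv1 "k8s.io/api/core/v1"

// UnremovableReason tells why an unneeded node cannot be scaled down;
// unremovableReason returns NoReason when the node is removable.
type UnremovableReason int

const (
	// NoReason - the node is removable.
	NoReason UnremovableReason = iota
	// UnexpectedError - e.g. computing the node's resource delta failed.
	UnexpectedError
)

// UnremovableNode pairs a node with the reason it was rejected. After this
// commit it is passed around by value ([]simulator.UnremovableNode) rather
// than by pointer.
type UnremovableNode struct {
	Node   *apiv1.Node
	Reason UnremovableReason
}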
cluster-autoscaler/core/scaledown/unneeded/nodes_test.go

Lines changed: 11 additions & 4 deletions
@@ -18,6 +18,7 @@ package unneeded

 import (
 	"fmt"
+	"math"
 	"testing"
 	"time"

@@ -28,6 +29,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
 	. "k8s.io/autoscaler/cluster-autoscaler/core/test"
+	"k8s.io/autoscaler/cluster-autoscaler/processors/nodes"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
 	. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
@@ -186,10 +188,10 @@ func TestRemovableAt(t *testing.T) {
 		})
 	}

-	nodes := append(empty, drain...)
+	removableNodes := append(empty, drain...)
 	provider := testprovider.NewTestCloudProvider(nil, nil)
 	provider.InsertNodeGroup(ng)
-	for _, node := range nodes {
+	for _, node := range removableNodes {
 		provider.AddNode("ng", node.Node)
 	}

@@ -202,8 +204,13 @@ func TestRemovableAt(t *testing.T) {
 	assert.NoError(t, err)

 	n := NewNodes(&fakeScaleDownTimeGetter{}, &resource.LimitsFinder{})
-	n.Update(nodes, time.Now())
-	gotEmptyToRemove, gotDrainToRemove, _ := n.RemovableAt(&ctx, time.Now(), resource.Limits{}, []string{}, as)
+	n.Update(removableNodes, time.Now())
+	gotEmptyToRemove, gotDrainToRemove, _ := n.RemovableAt(&ctx, nodes.ScaleDownContext{
+		ActuationStatus:         as,
+		ResourcesLeft:           resource.Limits{},
+		ResourcesWithLimits:     []string{},
+		MaxNodeCountToBeRemoved: math.MaxInt,
+	}, time.Now())
 	if len(gotDrainToRemove) != tc.numDrainToRemove || len(gotEmptyToRemove) != tc.numEmptyToRemove {
 		t.Errorf("%s: getNodesToRemove() return %d, %d, want %d, %d", tc.name, len(gotEmptyToRemove), len(gotDrainToRemove), tc.numEmptyToRemove, tc.numDrainToRemove)
 	}
