Skip to content

Commit 8c84d84

Browse files
Fix node tainting when using UseProviderId with SQS monitors (#1203)
1 parent aec2f28 commit 8c84d84

File tree

5 files changed

+52
-4
lines changed

5 files changed

+52
-4
lines changed

pkg/monitor/sqsevent/asg-lifecycle-event.go

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,18 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m
126126
go m.SendHeartbeats(nthConfig.HeartbeatInterval, nthConfig.HeartbeatUntil, lifecycleDetail, stopHeartbeatCh, cancelHeartbeatCh)
127127
}
128128

129-
err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID)
129+
// Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured
130+
nodeName := interruptionEvent.NodeName
131+
if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" {
132+
resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID)
133+
if err != nil {
134+
log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event")
135+
} else {
136+
nodeName = resolvedNodeName
137+
}
138+
}
139+
140+
err := n.TaintASGLifecycleTermination(nodeName, interruptionEvent.EventID)
130141
if err != nil {
131142
log.Err(err).Msgf("unable to taint node with taint %s:%s", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID)
132143
}

pkg/monitor/sqsevent/rebalance-recommendation-event.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,19 @@ func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event *EventBridg
7878
return nil
7979
}
8080
interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
81-
err := n.TaintRebalanceRecommendation(interruptionEvent.NodeName, interruptionEvent.EventID)
81+
// Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured
82+
nthConfig := n.GetNthConfig()
83+
nodeName := interruptionEvent.NodeName
84+
if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" {
85+
resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID)
86+
if err != nil {
87+
log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event")
88+
} else {
89+
nodeName = resolvedNodeName
90+
}
91+
}
92+
93+
err := n.TaintRebalanceRecommendation(nodeName, interruptionEvent.EventID)
8294
if err != nil {
8395
log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.RebalanceRecommendationTaint, interruptionEvent.EventID)
8496
}

pkg/monitor/sqsevent/scheduled-change-event.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,19 @@ func (m SQSMonitor) scheduledEventToInterruptionEvents(event *EventBridgeEvent,
113113
return nil
114114
}
115115
interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
116-
if err := n.TaintScheduledMaintenance(interruptionEvent.NodeName, interruptionEvent.EventID); err != nil {
116+
// Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured
117+
nthConfig := n.GetNthConfig()
118+
nodeName := interruptionEvent.NodeName
119+
if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" {
120+
resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID)
121+
if err != nil {
122+
log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event")
123+
} else {
124+
nodeName = resolvedNodeName
125+
}
126+
}
127+
128+
if err := n.TaintScheduledMaintenance(nodeName, interruptionEvent.EventID); err != nil {
117129
log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.ScheduledMaintenanceTaint, interruptionEvent.EventID)
118130
}
119131
return nil

pkg/monitor/sqsevent/spot-itn-event.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,19 @@ func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event *EventBridgeEven
8080
return nil
8181
}
8282
interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
83-
err := n.TaintSpotItn(interruptionEvent.NodeName, interruptionEvent.EventID)
83+
// Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured
84+
nthConfig := n.GetNthConfig()
85+
nodeName := interruptionEvent.NodeName
86+
if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" {
87+
resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID)
88+
if err != nil {
89+
log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event")
90+
} else {
91+
nodeName = resolvedNodeName
92+
}
93+
}
94+
95+
err := n.TaintSpotItn(nodeName, interruptionEvent.EventID)
8496
if err != nil {
8597
log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID)
8698
}

pkg/node/node.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,7 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) {
662662
},
663663
},
664664
}
665+
665666
listOptions := metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(&labelSelector)}
666667
matchingNodes, err := n.drainHelper.Client.CoreV1().Nodes().List(context.TODO(), listOptions)
667668
if err != nil || len(matchingNodes.Items) == 0 {

0 commit comments

Comments
 (0)