diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index c049072e..09b45ed7 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -83,6 +83,14 @@ func main() { zerolog.SetGlobalLevel(zerolog.ErrorLevel) } + log.Info().Msgf("Using log format version %d", nthConfig.LogFormatVersion) + if err = logging.SetFormatVersion(nthConfig.LogFormatVersion); err != nil { + log.Warn().Err(err).Send() + } + if err = observability.SetReasonForKindVersion(nthConfig.LogFormatVersion); err != nil { + log.Warn().Err(err).Send() + } + err = webhook.ValidateWebhookConfig(nthConfig) if err != nil { nthConfig.Print() @@ -196,13 +204,13 @@ func main() { for _, fn := range monitoringFns { go func(monitor monitor.Monitor) { - log.Info().Str("event_type", monitor.Kind()).Msg("Started monitoring for events") + logging.VersionedMsgs.MonitoringStarted(monitor.Kind()) var previousErr error var duplicateErrCount int for range time.Tick(time.Second * 2) { err := monitor.Monitor() if err != nil { - log.Warn().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events") + logging.VersionedMsgs.ProblemMonitoringForEvents(monitor.Kind(), err) metrics.ErrorEventsInc(monitor.Kind()) recorder.Emit(nthConfig.NodeName, observability.Warning, observability.MonitorErrReason, observability.MonitorErrMsgFmt, monitor.Kind()) if previousErr != nil && err.Error() == previousErr.Error() { @@ -239,16 +247,10 @@ func main() { for event, ok := interruptionEventStore.GetActiveEvent(); ok; event, ok = interruptionEventStore.GetActiveEvent() { select { case interruptionEventStore.Workers <- 1: - log.Info(). - Str("event-id", event.EventID). - Str("kind", event.Kind). - Str("node-name", event.NodeName). - Str("instance-id", event.InstanceID). - Str("provider-id", event.ProviderID). - Msg("Requesting instance drain") + logging.VersionedMsgs.RequestingInstanceDrain(event) event.InProgress = true wg.Add(1) - recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind), event.Description) + recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description) go drainOrCordonIfNecessary(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, &wg) default: log.Warn().Msg("all workers busy, waiting") diff --git a/config/helm/aws-node-termination-handler/README.md b/config/helm/aws-node-termination-handler/README.md index 5d8825af..4b9c8843 100644 --- a/config/helm/aws-node-termination-handler/README.md +++ b/config/helm/aws-node-termination-handler/README.md @@ -70,6 +70,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode | `extraEnv` | Additional environment variables for the _aws-node-termination-handler_ container. | `[]` | | `probes` | The Kubernetes liveness probe configuration. | _See values.yaml_ | | `logLevel` | Sets the log level (`info`,`debug`, or `error`) | `info` | +| `logFormatVersion` | Sets the log format version. Available versions: 1, 2. Version 1 refers to the format that has been used through v1.17.3. Version 2 offers more detail for the "event kind" and "reason", especially when operating in Queue Processor mode. | `1` | | `jsonLogging` | If `true`, use JSON-formatted logs instead of human readable logs. | `false` | | `enablePrometheusServer` | If `true`, start an http server exposing `/metrics` endpoint for _Prometheus_. | `false` | | `prometheusServerPort` | Replaces the default HTTP port for exposing _Prometheus_ metrics. | `9092` | diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml index b6568ad6..95e4b50f 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml @@ -81,6 +81,8 @@ spec: value: {{ .Values.logLevel | quote }} - name: JSON_LOGGING value: {{ .Values.jsonLogging | quote }} + - name: LOG_FORMAT_VERSION + value: {{ .Values.logFormatVersion | quote }} - name: ENABLE_PROMETHEUS_SERVER value: {{ .Values.enablePrometheusServer | quote }} - name: PROMETHEUS_SERVER_PORT diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml index 6a9118fe..8a9db7bf 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml @@ -81,6 +81,8 @@ spec: value: {{ .Values.logLevel | quote }} - name: JSON_LOGGING value: {{ .Values.jsonLogging | quote }} + - name: LOG_FORMAT_VERSION + value: {{ .Values.logFormatVersion | quote }} - name: ENABLE_PROMETHEUS_SERVER value: {{ .Values.enablePrometheusServer | quote }} - name: PROMETHEUS_SERVER_PORT diff --git a/config/helm/aws-node-termination-handler/templates/deployment.yaml b/config/helm/aws-node-termination-handler/templates/deployment.yaml index 241eddf1..eeecafb7 100644 --- a/config/helm/aws-node-termination-handler/templates/deployment.yaml +++ b/config/helm/aws-node-termination-handler/templates/deployment.yaml @@ -78,6 +78,8 @@ spec: value: {{ .Values.logLevel | quote }} - name: JSON_LOGGING value: {{ .Values.jsonLogging | quote }} + - name: LOG_FORMAT_VERSION + value: {{ .Values.logFormatVersion | quote }} - name: ENABLE_PROMETHEUS_SERVER value: {{ .Values.enablePrometheusServer | quote }} - name: PROMETHEUS_SERVER_PORT diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml index 2fbdf2d3..c965358e 100644 --- a/config/helm/aws-node-termination-handler/values.yaml +++ b/config/helm/aws-node-termination-handler/values.yaml @@ -66,6 +66,9 @@ probes: # Set the log level logLevel: info +# Set the log format version +logFormatVersion: 1 + # Log messages in JSON format jsonLogging: false diff --git a/pkg/config/config.go b/pkg/config/config.go index 938531e0..3979260c 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -79,6 +79,10 @@ const ( jsonLoggingDefault = false logLevelConfigKey = "LOG_LEVEL" logLevelDefault = "INFO" + logFormatVersionKey = "LOG_FORMAT_VERSION" + logFormatVersionDefault = 1 + MinSupportedLogFormatVersion = 1 + MaxSupportedLogFormatVersion = 2 uptimeFromFileConfigKey = "UPTIME_FROM_FILE" uptimeFromFileDefault = "" workersConfigKey = "WORKERS" @@ -138,6 +142,7 @@ type Config struct { ExcludeFromLoadBalancers bool JsonLogging bool LogLevel string + LogFormatVersion int UptimeFromFile string EnablePrometheus bool PrometheusPort int @@ -197,6 +202,7 @@ func ParseCliArgs() (config Config, err error) { flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.") flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.") flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)") + flag.IntVar(&config.LogFormatVersion, "log-format-version", getIntEnv(logFormatVersionKey, logFormatVersionDefault), "Sets the log format version.") flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).") flag.BoolVar(&config.EnablePrometheus, "enable-prometheus-server", getBoolEnv(enablePrometheusConfigKey, enablePrometheusDefault), "If true, a http server is used for exposing prometheus metrics in /metrics endpoint.") flag.IntVar(&config.PrometheusPort, "prometheus-server-port", getIntEnv(prometheusPortConfigKey, prometheusPortDefault), "The port for running the prometheus http server.") @@ -242,6 +248,15 @@ func ParseCliArgs() (config Config, err error) { return config, fmt.Errorf("invalid log-level passed: %s Should be one of: info, debug, error", config.LogLevel) } + if config.LogFormatVersion < MinSupportedLogFormatVersion { + log.Warn().Msgf("Log format version %d is not supported, using format version %d", config.LogFormatVersion, MinSupportedLogFormatVersion) + config.LogFormatVersion = MinSupportedLogFormatVersion + } + if config.LogFormatVersion > MaxSupportedLogFormatVersion { + log.Warn().Msgf("Log format version %d is not supported, using format version %d", config.LogFormatVersion, MaxSupportedLogFormatVersion) + config.LogFormatVersion = MaxSupportedLogFormatVersion + } + if config.NodeName == "" { panic("You must provide a node-name to the CLI or NODE_NAME environment variable.") } diff --git a/pkg/logging/versioned.go b/pkg/logging/versioned.go new file mode 100644 index 00000000..12b47641 --- /dev/null +++ b/pkg/logging/versioned.go @@ -0,0 +1,91 @@ +// Copyright 2016-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package logging + +import ( + "fmt" + + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/rs/zerolog/log" +) + +type versionedMsgsV1 struct{} + +func (versionedMsgsV1) MonitoringStarted(monitorKind string) { + log.Info().Str("event_type", monitorKind).Msg("Started monitoring for events") +} + +func (versionedMsgsV1) ProblemMonitoringForEvents(monitorKind string, err error) { + log.Warn().Str("event_type", monitorKind).Err(err).Msg("There was a problem monitoring for events") +} + +func (versionedMsgsV1) RequestingInstanceDrain(event *monitor.InterruptionEvent) { + log.Info(). + Str("event-id", event.EventID). + Str("kind", event.Kind). + Str("node-name", event.NodeName). + Str("instance-id", event.InstanceID). + Str("provider-id", event.ProviderID). + Msg("Requesting instance drain") +} + +func (versionedMsgsV1) SendingInterruptionEventToChannel(_ string) { + log.Debug().Msg("Sending SQS_TERMINATE interruption event to the interruption channel") +} + +type versionedMsgsV2 struct{} + +func (versionedMsgsV2) MonitoringStarted(monitorKind string) { + log.Info().Str("monitor_type", monitorKind).Msg("Started monitoring for events") +} + +func (versionedMsgsV2) ProblemMonitoringForEvents(monitorKind string, err error) { + log.Warn().Str("monitor_type", monitorKind).Err(err).Msg("There was a problem monitoring for events") +} + +func (versionedMsgsV2) RequestingInstanceDrain(event *monitor.InterruptionEvent) { + log.Info(). + Str("event-id", event.EventID). + Str("kind", event.Kind). + Str("monitor", event.Monitor). + Str("node-name", event.NodeName). + Str("instance-id", event.InstanceID). + Str("provider-id", event.ProviderID). + Msg("Requesting instance drain") +} + +func (versionedMsgsV2) SendingInterruptionEventToChannel(eventKind string) { + log.Debug().Msgf("Sending %s interruption event to the interruption channel", eventKind) +} + +var VersionedMsgs interface { + MonitoringStarted(monitorKind string) + ProblemMonitoringForEvents(monitorKind string, err error) + RequestingInstanceDrain(event *monitor.InterruptionEvent) + SendingInterruptionEventToChannel(eventKind string) +} = versionedMsgsV1{} + +func SetFormatVersion(version int) error { + switch version { + case 1: + VersionedMsgs = versionedMsgsV1{} + return nil + case 2: + VersionedMsgs = versionedMsgsV2{} + return nil + default: + VersionedMsgs = versionedMsgsV1{} + return fmt.Errorf("Unrecognized log format version: %d, using version 1", version) + } +} diff --git a/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor.go b/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor.go index 5c12c67b..d0095c78 100644 --- a/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor.go +++ b/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor.go @@ -23,10 +23,8 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/node" ) -const ( - // RebalanceRecommendationKind is a const to define a Rebalance Recommendation kind of event - RebalanceRecommendationKind = "REBALANCE_RECOMMENDATION" -) +// RebalanceRecommentadionMonitorKind is a const to define this monitor kind +const RebalanceRecommendationMonitorKind = "REBALANCE_RECOMMENDATION_MONITOR" // RebalanceRecommendationMonitor is a struct definition which facilitates monitoring of rebalance recommendations from IMDS type RebalanceRecommendationMonitor struct { @@ -50,15 +48,15 @@ func (m RebalanceRecommendationMonitor) Monitor() error { if err != nil { return err } - if interruptionEvent != nil && interruptionEvent.Kind == RebalanceRecommendationKind { + if interruptionEvent != nil && interruptionEvent.Kind == monitor.RebalanceRecommendationKind { m.InterruptionChan <- *interruptionEvent } return nil } -// Kind denotes the kind of event that is processed +// Kind denotes the kind of monitor func (m RebalanceRecommendationMonitor) Kind() string { - return RebalanceRecommendationKind + return RebalanceRecommendationMonitorKind } // checkForRebalanceRecommendation Checks EC2 instance metadata for a rebalance recommendation @@ -86,7 +84,8 @@ func (m RebalanceRecommendationMonitor) checkForRebalanceRecommendation() (*moni return &monitor.InterruptionEvent{ EventID: fmt.Sprintf("rebalance-recommendation-%x", hash.Sum(nil)), - Kind: RebalanceRecommendationKind, + Kind: monitor.RebalanceRecommendationKind, + Monitor: RebalanceRecommendationMonitorKind, StartTime: noticeTime, NodeName: nodeName, Description: fmt.Sprintf("Rebalance recommendation received. Instance will be cordoned at %s \n", rebalanceRecommendation.NoticeTime), diff --git a/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor_test.go b/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor_test.go index afc91295..baba4462 100644 --- a/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor_test.go +++ b/pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor_test.go @@ -55,7 +55,8 @@ func TestMonitor_Success(t *testing.T) { go func() { result := <-drainChan - h.Equals(t, rebalancerecommendation.RebalanceRecommendationKind, result.Kind) + h.Equals(t, monitor.RebalanceRecommendationKind, result.Kind) + h.Equals(t, rebalancerecommendation.RebalanceRecommendationMonitorKind, result.Monitor) h.Equals(t, expFormattedTime, result.StartTime.String()) h.Assert(t, strings.Contains(result.Description, startTime), "Expected description to contain: "+startTime+" but is actually: "+result.Description) diff --git a/pkg/monitor/scheduledevent/scheduled-event-monitor.go b/pkg/monitor/scheduledevent/scheduled-event-monitor.go index a7c06070..94cf2bd1 100644 --- a/pkg/monitor/scheduledevent/scheduled-event-monitor.go +++ b/pkg/monitor/scheduledevent/scheduled-event-monitor.go @@ -24,8 +24,8 @@ import ( ) const ( - // ScheduledEventKind is a const to define a scheduled event kind of interruption event - ScheduledEventKind = "SCHEDULED_EVENT" + // ScheduledEventMonitorKind is a const to define this monitor kind + ScheduledEventMonitorKind = "SCHEDULED_EVENT_MONITOR" scheduledEventStateCompleted = "completed" scheduledEventStateCanceled = "canceled" scheduledEventDateFormat = "2 Jan 2006 15:04:05 GMT" @@ -69,9 +69,9 @@ func (m ScheduledEventMonitor) Monitor() error { return nil } -// Kind denotes the kind of event that is processed +// Kind denotes the kind of monitor func (m ScheduledEventMonitor) Kind() string { - return ScheduledEventKind + return ScheduledEventMonitorKind } // checkForScheduledEvents Checks EC2 instance metadata for a scheduled event requiring a node drain @@ -101,7 +101,8 @@ func (m ScheduledEventMonitor) checkForScheduledEvents() ([]monitor.Interruption } events = append(events, monitor.InterruptionEvent{ EventID: scheduledEvent.EventID, - Kind: ScheduledEventKind, + Kind: monitor.ScheduledEventKind, + Monitor: ScheduledEventMonitorKind, Description: fmt.Sprintf("%s will occur between %s and %s because %s\n", scheduledEvent.Code, scheduledEvent.NotBefore, scheduledEvent.NotAfter, scheduledEvent.Description), State: scheduledEvent.State, NodeName: m.NodeName, diff --git a/pkg/monitor/scheduledevent/scheduled-event-monitor_test.go b/pkg/monitor/scheduledevent/scheduled-event-monitor_test.go index eb7fbb8f..fbe3a26f 100644 --- a/pkg/monitor/scheduledevent/scheduled-event-monitor_test.go +++ b/pkg/monitor/scheduledevent/scheduled-event-monitor_test.go @@ -75,7 +75,8 @@ func TestMonitor_Success(t *testing.T) { go func() { result := <-drainChan h.Equals(t, scheduledEventId, result.EventID) - h.Equals(t, scheduledevent.ScheduledEventKind, result.Kind) + h.Equals(t, monitor.ScheduledEventKind, result.Kind) + h.Equals(t, scheduledevent.ScheduledEventMonitorKind, result.Monitor) h.Equals(t, scheduledEventState, result.State) h.TimeWithinRange(t, result.StartTime, oneSecondAgo(), time.Now()) h.Equals(t, expScheduledEventEndTimeFmt, result.EndTime.String()) @@ -130,7 +131,7 @@ func TestMonitor_CanceledEvent(t *testing.T) { go func() { result := <-cancelChan h.Equals(t, scheduledEventId, result.EventID) - h.Equals(t, scheduledevent.ScheduledEventKind, result.Kind) + h.Equals(t, monitor.ScheduledEventKind, result.Kind) h.Equals(t, state, result.State) h.TimeWithinRange(t, result.StartTime, oneSecondAgo(), time.Now()) h.Equals(t, expScheduledEventEndTimeFmt, result.EndTime.String()) @@ -257,7 +258,7 @@ func TestMonitor_EndTimeParseFail(t *testing.T) { go func() { result := <-drainChan h.Equals(t, scheduledEventId, result.EventID) - h.Equals(t, scheduledevent.ScheduledEventKind, result.Kind) + h.Equals(t, monitor.ScheduledEventKind, result.Kind) h.Equals(t, scheduledEventState, result.State) h.TimeWithinRange(t, result.StartTime, oneSecondAgo(), time.Now()) h.Equals(t, expScheduledEventStartTimeFmt, result.EndTime.String()) diff --git a/pkg/monitor/spotitn/spot-itn-monitor.go b/pkg/monitor/spotitn/spot-itn-monitor.go index c755a165..061577e7 100644 --- a/pkg/monitor/spotitn/spot-itn-monitor.go +++ b/pkg/monitor/spotitn/spot-itn-monitor.go @@ -23,10 +23,8 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/node" ) -const ( - // SpotITNKind is a const to define a Spot ITN kind of interruption event - SpotITNKind = "SPOT_ITN" -) +// SpotITNMonitorKind is a const to define this monitor kind +const SpotITNMonitorKind = "SPOT_ITN_MONITOR" // SpotInterruptionMonitor is a struct definition which facilitates monitoring of spot ITNs from IMDS type SpotInterruptionMonitor struct { @@ -52,15 +50,15 @@ func (m SpotInterruptionMonitor) Monitor() error { if err != nil { return err } - if interruptionEvent != nil && interruptionEvent.Kind == SpotITNKind { + if interruptionEvent != nil && interruptionEvent.Kind == monitor.SpotITNKind { m.InterruptionChan <- *interruptionEvent } return nil } -// Kind denotes the kind of event that is processed +// Kind denotes the kind of monitor func (m SpotInterruptionMonitor) Kind() string { - return SpotITNKind + return SpotITNMonitorKind } // checkForSpotInterruptionNotice Checks EC2 instance metadata for a spot interruption termination notice @@ -88,7 +86,8 @@ func (m SpotInterruptionMonitor) checkForSpotInterruptionNotice() (*monitor.Inte return &monitor.InterruptionEvent{ EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)), - Kind: SpotITNKind, + Kind: monitor.SpotITNKind, + Monitor: SpotITNMonitorKind, StartTime: interruptionTime, NodeName: nodeName, Description: fmt.Sprintf("Spot ITN received. Instance will be interrupted at %s \n", instanceAction.Time), diff --git a/pkg/monitor/spotitn/spot-itn-monitor_test.go b/pkg/monitor/spotitn/spot-itn-monitor_test.go index 73907ffe..afbd44b8 100644 --- a/pkg/monitor/spotitn/spot-itn-monitor_test.go +++ b/pkg/monitor/spotitn/spot-itn-monitor_test.go @@ -57,7 +57,8 @@ func TestMonitor_Success(t *testing.T) { go func() { result := <-drainChan - h.Equals(t, spotitn.SpotITNKind, result.Kind) + h.Equals(t, monitor.SpotITNKind, result.Kind) + h.Equals(t, spotitn.SpotITNMonitorKind, result.Monitor) h.Equals(t, expFormattedTime, result.StartTime.String()) h.Assert(t, strings.Contains(result.Description, startTime), "Expected description to contain: "+startTime+" but is actually: "+result.Description) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index e5411cfb..5c088030 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -80,7 +80,8 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m interruptionEvent := monitor.InterruptionEvent{ EventID: fmt.Sprintf("asg-lifecycle-term-%x", event.ID), - Kind: SQSTerminateKind, + Kind: monitor.ASGLifecycleKind, + Monitor: SQSMonitorKind, AutoScalingGroupName: lifecycleDetail.AutoScalingGroupName, StartTime: event.getTime(), NodeName: nodeInfo.Name, diff --git a/pkg/monitor/sqsevent/ec2-state-change-event.go b/pkg/monitor/sqsevent/ec2-state-change-event.go index 0eb98834..ba4f08c5 100644 --- a/pkg/monitor/sqsevent/ec2-state-change-event.go +++ b/pkg/monitor/sqsevent/ec2-state-change-event.go @@ -67,7 +67,8 @@ func (m SQSMonitor) ec2StateChangeToInterruptionEvent(event *EventBridgeEvent, m } interruptionEvent := monitor.InterruptionEvent{ EventID: fmt.Sprintf("ec2-state-change-event-%x", event.ID), - Kind: SQSTerminateKind, + Kind: monitor.StateChangeKind, + Monitor: SQSMonitorKind, StartTime: event.getTime(), NodeName: nodeInfo.Name, IsManaged: nodeInfo.IsManaged, diff --git a/pkg/monitor/sqsevent/rebalance-recommendation-event.go b/pkg/monitor/sqsevent/rebalance-recommendation-event.go index 75db706c..9b2b9f42 100644 --- a/pkg/monitor/sqsevent/rebalance-recommendation-event.go +++ b/pkg/monitor/sqsevent/rebalance-recommendation-event.go @@ -59,7 +59,8 @@ func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event *EventBridg } interruptionEvent := monitor.InterruptionEvent{ EventID: fmt.Sprintf("rebalance-recommendation-event-%x", event.ID), - Kind: SQSTerminateKind, + Kind: monitor.RebalanceRecommendationKind, + Monitor: SQSMonitorKind, AutoScalingGroupName: nodeInfo.AsgName, StartTime: event.getTime(), NodeName: nodeInfo.Name, diff --git a/pkg/monitor/sqsevent/scheduled-change-event.go b/pkg/monitor/sqsevent/scheduled-change-event.go index 41adfbf4..e66f0453 100644 --- a/pkg/monitor/sqsevent/scheduled-change-event.go +++ b/pkg/monitor/sqsevent/scheduled-change-event.go @@ -95,7 +95,8 @@ func (m SQSMonitor) scheduledEventToInterruptionEvents(event *EventBridgeEvent, // Begin drain immediately for scheduled change events to avoid disruptions in cases such as degraded hardware interruptionEvent := monitor.InterruptionEvent{ EventID: fmt.Sprintf("aws-health-scheduled-change-event-%x", event.ID), - Kind: SQSTerminateKind, + Kind: monitor.ScheduledEventKind, + Monitor: SQSMonitorKind, AutoScalingGroupName: nodeInfo.AsgName, StartTime: time.Now(), NodeName: nodeInfo.Name, diff --git a/pkg/monitor/sqsevent/spot-itn-event.go b/pkg/monitor/sqsevent/spot-itn-event.go index 97e8d138..25aac324 100644 --- a/pkg/monitor/sqsevent/spot-itn-event.go +++ b/pkg/monitor/sqsevent/spot-itn-event.go @@ -61,7 +61,8 @@ func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event *EventBridgeEven } interruptionEvent := monitor.InterruptionEvent{ EventID: fmt.Sprintf("spot-itn-event-%x", event.ID), - Kind: SQSTerminateKind, + Kind: monitor.SpotITNKind, + Monitor: SQSMonitorKind, AutoScalingGroupName: nodeInfo.AsgName, StartTime: event.getTime(), NodeName: nodeInfo.Name, diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index b2da9e98..e028506c 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -18,6 +18,7 @@ import ( "errors" "fmt" + "github.com/aws/aws-node-termination-handler/pkg/logging" "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" @@ -34,8 +35,8 @@ import ( ) const ( - // SQSTerminateKind is a const to define an SQS termination kind of interruption event - SQSTerminateKind = "SQS_TERMINATE" + // SQSMonitorKind is a const to define this monitor kind + SQSMonitorKind = "SQS_MONITOR" // ASGTagName is the name of the instance tag whose value is the AutoScaling group name ASGTagName = "aws:autoscaling:groupName" ) @@ -71,9 +72,9 @@ func (s skip) Unwrap() error { return s.err } -// Kind denotes the kind of event that is processed +// Kind denotes the kind of monitor func (m SQSMonitor) Kind() string { - return SQSTerminateKind + return SQSMonitorKind } // Monitor continuously monitors SQS for events and coordinates processing of the events @@ -219,9 +220,9 @@ func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []Interr log.Debug().Str("instance-id", eventWrapper.InterruptionEvent.InstanceID).Msg("dropping interruption event for unmanaged node") dropMessageSuggestionCount++ - case eventWrapper.InterruptionEvent.Kind == SQSTerminateKind: - // Successfully processed SQS message into a SQSTerminateKind interruption event - log.Debug().Msgf("Sending %s interruption event to the interruption channel", SQSTerminateKind) + case eventWrapper.InterruptionEvent.Monitor == SQSMonitorKind: + // Successfully processed SQS message into a eventWrapper.InterruptionEvent.Kind interruption event + logging.VersionedMsgs.SendingInterruptionEventToChannel(eventWrapper.InterruptionEvent.Kind) m.InterruptionChan <- *eventWrapper.InterruptionEvent default: diff --git a/pkg/monitor/sqsevent/sqs-monitor_test.go b/pkg/monitor/sqsevent/sqs-monitor_test.go index d6b5e6fa..61199ed2 100644 --- a/pkg/monitor/sqsevent/sqs-monitor_test.go +++ b/pkg/monitor/sqsevent/sqs-monitor_test.go @@ -121,13 +121,15 @@ var rebalanceRecommendationEvent = sqsevent.EventBridgeEvent{ }`), } -func TestKind(t *testing.T) { - h.Assert(t, sqsevent.SQSMonitor{}.Kind() == sqsevent.SQSTerminateKind, "SQSMonitor kind should return the kind constant for the event") +func TestMonitorKind(t *testing.T) { + h.Assert(t, sqsevent.SQSMonitor{}.Kind() == sqsevent.SQSMonitorKind, "SQSMonitor kind should return the kind constant for the monitor") } func TestMonitor_EventBridgeSuccess(t *testing.T) { spotItnEventNoTime := spotItnEvent spotItnEventNoTime.Time = "" + i := 0 + expectedResultKinds := []string{monitor.SpotITNKind, monitor.ASGLifecycleKind, monitor.SpotITNKind, monitor.RebalanceRecommendationKind} for _, event := range []sqsevent.EventBridgeEvent{spotItnEvent, asgLifecycleEvent, spotItnEventNoTime, rebalanceRecommendationEvent} { msg, err := getSQSMessageFromEvent(event) h.Ok(t, err) @@ -159,7 +161,8 @@ func TestMonitor_EventBridgeSuccess(t *testing.T) { select { case result := <-drainChan: - h.Equals(t, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(t, expectedResultKinds[i], result.Kind) + h.Equals(t, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(t, result.NodeName, dnsNodeName) h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") @@ -168,7 +171,7 @@ func TestMonitor_EventBridgeSuccess(t *testing.T) { default: h.Ok(t, fmt.Errorf("Expected an event to be generated")) } - + i++ } } @@ -243,7 +246,8 @@ func TestMonitor_AsgDirectToSqsSuccess(t *testing.T) { select { case result := <-drainChan: - h.Equals(t, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(t, monitor.ASGLifecycleKind, result.Kind) + h.Equals(t, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(t, result.NodeName, dnsNodeName) h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") @@ -330,16 +334,20 @@ func TestMonitor_DrainTasks(t *testing.T) { err := sqsMonitor.Monitor() h.Ok(t, err) + i := 0 + expectedResultKinds := []string{monitor.SpotITNKind, monitor.ASGLifecycleKind, monitor.RebalanceRecommendationKind} for _, event := range testEvents { t.Run(event.DetailType, func(st *testing.T) { result := <-drainChan - h.Equals(st, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(st, expectedResultKinds[i], result.Kind) + h.Equals(st, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(st, result.NodeName, dnsNodeName) h.Assert(st, result.PostDrainTask != nil, "PostDrainTask should have been set") h.Assert(st, result.PreDrainTask != nil, "PreDrainTask should have been set") err := result.PostDrainTask(result, node.Node{}) h.Ok(st, err) }) + i++ } } @@ -378,7 +386,8 @@ func TestMonitor_DrainTasks_Delay(t *testing.T) { t.Run(asgLifecycleEvent.DetailType, func(st *testing.T) { result := <-drainChan - h.Equals(st, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(st, monitor.ASGLifecycleKind, result.Kind) + h.Equals(st, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(st, result.NodeName, dnsNodeName) h.Assert(st, result.PostDrainTask != nil, "PostDrainTask should have been set") h.Assert(st, result.PreDrainTask != nil, "PreDrainTask should have been set") @@ -425,12 +434,15 @@ func TestMonitor_DrainTasks_Errors(t *testing.T) { h.Ok(t, err) count := 0 + i := 0 + expectedResultKinds := []string{monitor.SpotITNKind, monitor.ASGLifecycleKind, monitor.RebalanceRecommendationKind} done := false for !done { select { case result := <-drainChan: count++ - h.Equals(t, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(t, expectedResultKinds[i], result.Kind) + h.Equals(t, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(t, result.NodeName, dnsNodeName) h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") @@ -439,6 +451,7 @@ func TestMonitor_DrainTasks_Errors(t *testing.T) { default: done = true } + i++ } h.Equals(t, count, 3) } @@ -479,7 +492,8 @@ func TestMonitor_DrainTasksASGFailure(t *testing.T) { select { case result := <-drainChan: - h.Equals(t, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(t, monitor.ASGLifecycleKind, result.Kind) + h.Equals(t, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(t, result.NodeName, dnsNodeName) h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") diff --git a/pkg/monitor/types.go b/pkg/monitor/types.go index 6e5a5787..c3c587d2 100644 --- a/pkg/monitor/types.go +++ b/pkg/monitor/types.go @@ -20,6 +20,21 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/node" ) +const ( + // SpotITNKind is a const to define a Spot ITN kind of interruption event + SpotITNKind = "SPOT_ITN" + // ScheduledEventKind is a const to define a scheduled event kind of interruption event + ScheduledEventKind = "SCHEDULED_EVENT" + // RebalanceRecommendationKind is a const to define a Rebalance Recommendation kind of interruption event + RebalanceRecommendationKind = "REBALANCE_RECOMMENDATION" + // StateChangeKind is a const to define an EC2 State Change kind of interruption event + StateChangeKind = "STATE_CHANGE" + // ASGLifecycleKind is a const to define an ASG Lifecycle kind of interruption event + ASGLifecycleKind = "ASG_LIFECYCLE" + // SQSTerminateKind is a const to define an SQS termination kind of interruption event + SQSTerminateKind = "SQS_TERMINATE" +) + // DrainTask defines a task to be run when draining a node type DrainTask func(InterruptionEvent, node.Node) error @@ -27,6 +42,7 @@ type DrainTask func(InterruptionEvent, node.Node) error type InterruptionEvent struct { EventID string Kind string + Monitor string Description string State string AutoScalingGroupName string diff --git a/pkg/observability/k8s-events.go b/pkg/observability/k8s-events.go index dd44b122..edb42dde 100644 --- a/pkg/observability/k8s-events.go +++ b/pkg/observability/k8s-events.go @@ -19,10 +19,7 @@ import ( "strings" "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" - "github.com/aws/aws-node-termination-handler/pkg/monitor/rebalancerecommendation" - "github.com/aws/aws-node-termination-handler/pkg/monitor/scheduledevent" - "github.com/aws/aws-node-termination-handler/pkg/monitor/spotitn" - "github.com/aws/aws-node-termination-handler/pkg/monitor/sqsevent" + "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/rs/zerolog/log" corev1 "k8s.io/api/core/v1" kErr "k8s.io/apimachinery/pkg/api/errors" @@ -66,8 +63,10 @@ const ( const ( scheduledEventReason = "ScheduledEvent" spotITNReason = "SpotInterruption" - sqsTerminateReason = "SQSTermination" + sqsTerminationReason = "SQSTermination" rebalanceRecommendationReason = "RebalanceRecommendation" + stateChangeReason = "StateChange" + asgLifecycleReason = "ASGLifecycle" unknownReason = "UnknownInterruption" ) @@ -165,22 +164,63 @@ func (r K8sEventRecorder) Emit(nodeName string, eventType, eventReason, eventMsg } } -// GetReasonForKind returns a Kubernetes event reason for the given interruption event kind -func GetReasonForKind(kind string) string { - switch kind { - case scheduledevent.ScheduledEventKind: +// getReasonForKindV1 returns a Kubernetes event reason for the given interruption event kind. +// Compatible with log format version 1. +func getReasonForKindV1(eventKind, monitorKind string) string { + // In v1 all events received from SQS were given the same reason. + if monitorKind == monitor.SQSTerminateKind { + return sqsTerminationReason + } + + // However, events received from IMDS could be more specific. + switch eventKind { + case monitor.ScheduledEventKind: return scheduledEventReason - case spotitn.SpotITNKind: + case monitor.SpotITNKind: return spotITNReason - case sqsevent.SQSTerminateKind: - return sqsTerminateReason - case rebalancerecommendation.RebalanceRecommendationKind: + case monitor.RebalanceRecommendationKind: return rebalanceRecommendationReason default: return unknownReason } } +// getReasonForKindV2 returns a Kubernetes event reason for the given interruption event kind. +// Compatible with log format version 2. +func getReasonForKindV2(eventKind, _ string) string { + // v2 added reasons for more event kinds for both IMDS and SQS events. + switch eventKind { + case monitor.ScheduledEventKind: + return scheduledEventReason + case monitor.SpotITNKind: + return spotITNReason + case monitor.RebalanceRecommendationKind: + return rebalanceRecommendationReason + case monitor.StateChangeKind: + return stateChangeReason + case monitor.ASGLifecycleKind: + return asgLifecycleReason + default: + return unknownReason + } +} + +var GetReasonForKind func(kind, monitor string) string = getReasonForKindV1 + +func SetReasonForKindVersion(version int) error { + switch version { + case 1: + GetReasonForKind = getReasonForKindV1 + return nil + case 2: + GetReasonForKind = getReasonForKindV2 + return nil + default: + GetReasonForKind = getReasonForKindV1 + return fmt.Errorf("Unrecognized 'reason for kind' version: %d, using version 1", version) + } +} + // Parse the given extra annotations string into a map func parseExtraAnnotations(annotations map[string]string, extraAnnotationsStr string) (map[string]string, error) { parts := strings.Split(extraAnnotationsStr, ",")