Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions cmd/node-termination-handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ func main() {
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
}

log.Info().Msgf("Using log format version %d", nthConfig.LogFormatVersion)
if err = logging.SetFormatVersion(nthConfig.LogFormatVersion); err != nil {
log.Warn().Err(err).Send()
}
if err = observability.SetReasonForKindVersion(nthConfig.LogFormatVersion); err != nil {
log.Warn().Err(err).Send()
}

err = webhook.ValidateWebhookConfig(nthConfig)
if err != nil {
nthConfig.Print()
Expand Down Expand Up @@ -196,13 +204,13 @@ func main() {

for _, fn := range monitoringFns {
go func(monitor monitor.Monitor) {
log.Info().Str("event_type", monitor.Kind()).Msg("Started monitoring for events")
logging.VersionedMsgs.MonitoringStarted(monitor.Kind())
var previousErr error
var duplicateErrCount int
for range time.Tick(time.Second * 2) {
err := monitor.Monitor()
if err != nil {
log.Warn().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events")
logging.VersionedMsgs.ProblemMonitoringForEvents(monitor.Kind(), err)
metrics.ErrorEventsInc(monitor.Kind())
recorder.Emit(nthConfig.NodeName, observability.Warning, observability.MonitorErrReason, observability.MonitorErrMsgFmt, monitor.Kind())
if previousErr != nil && err.Error() == previousErr.Error() {
Expand Down Expand Up @@ -239,16 +247,10 @@ func main() {
for event, ok := interruptionEventStore.GetActiveEvent(); ok; event, ok = interruptionEventStore.GetActiveEvent() {
select {
case interruptionEventStore.Workers <- 1:
log.Info().
Str("event-id", event.EventID).
Str("kind", event.Kind).
Str("node-name", event.NodeName).
Str("instance-id", event.InstanceID).
Str("provider-id", event.ProviderID).
Msg("Requesting instance drain")
logging.VersionedMsgs.RequestingInstanceDrain(event)
event.InProgress = true
wg.Add(1)
recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind), event.Description)
recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description)
go drainOrCordonIfNecessary(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, &wg)
default:
log.Warn().Msg("all workers busy, waiting")
Expand Down
1 change: 1 addition & 0 deletions config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
| `extraEnv` | Additional environment variables for the _aws-node-termination-handler_ container. | `[]` |
| `probes` | The Kubernetes liveness probe configuration. | _See values.yaml_ |
| `logLevel` | Sets the log level (`info`,`debug`, or `error`) | `info` |
| `logFormatVersion` | Sets the log format version. Available versions: 1, 2. Version 1 refers to the format that has been used through v1.17.3. Version 2 offers more detail for the "event kind" and "reason", especially when operating in Queue Processor mode. | `1` |
| `jsonLogging` | If `true`, use JSON-formatted logs instead of human readable logs. | `false` |
| `enablePrometheusServer` | If `true`, start an http server exposing `/metrics` endpoint for _Prometheus_. | `false` |
| `prometheusServerPort` | Replaces the default HTTP port for exposing _Prometheus_ metrics. | `9092` |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ spec:
value: {{ .Values.logLevel | quote }}
- name: JSON_LOGGING
value: {{ .Values.jsonLogging | quote }}
- name: LOG_FORMAT_VERSION
value: {{ .Values.logFormatVersion | quote }}
- name: ENABLE_PROMETHEUS_SERVER
value: {{ .Values.enablePrometheusServer | quote }}
- name: PROMETHEUS_SERVER_PORT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ spec:
value: {{ .Values.logLevel | quote }}
- name: JSON_LOGGING
value: {{ .Values.jsonLogging | quote }}
- name: LOG_FORMAT_VERSION
value: {{ .Values.logFormatVersion | quote }}
- name: ENABLE_PROMETHEUS_SERVER
value: {{ .Values.enablePrometheusServer | quote }}
- name: PROMETHEUS_SERVER_PORT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ spec:
value: {{ .Values.logLevel | quote }}
- name: JSON_LOGGING
value: {{ .Values.jsonLogging | quote }}
- name: LOG_FORMAT_VERSION
value: {{ .Values.logFormatVersion | quote }}
- name: ENABLE_PROMETHEUS_SERVER
value: {{ .Values.enablePrometheusServer | quote }}
- name: PROMETHEUS_SERVER_PORT
Expand Down
3 changes: 3 additions & 0 deletions config/helm/aws-node-termination-handler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ probes:
# Set the log level
logLevel: info

# Set the log format version
logFormatVersion: 1

# Log messages in JSON format
jsonLogging: false

Expand Down
15 changes: 15 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ const (
jsonLoggingDefault = false
logLevelConfigKey = "LOG_LEVEL"
logLevelDefault = "INFO"
logFormatVersionKey = "LOG_FORMAT_VERSION"
logFormatVersionDefault = 1
MinSupportedLogFormatVersion = 1
MaxSupportedLogFormatVersion = 2
uptimeFromFileConfigKey = "UPTIME_FROM_FILE"
uptimeFromFileDefault = ""
workersConfigKey = "WORKERS"
Expand Down Expand Up @@ -138,6 +142,7 @@ type Config struct {
ExcludeFromLoadBalancers bool
JsonLogging bool
LogLevel string
LogFormatVersion int
UptimeFromFile string
EnablePrometheus bool
PrometheusPort int
Expand Down Expand Up @@ -197,6 +202,7 @@ func ParseCliArgs() (config Config, err error) {
flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
flag.IntVar(&config.LogFormatVersion, "log-format-version", getIntEnv(logFormatVersionKey, logFormatVersionDefault), "Sets the log format version.")
flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).")
flag.BoolVar(&config.EnablePrometheus, "enable-prometheus-server", getBoolEnv(enablePrometheusConfigKey, enablePrometheusDefault), "If true, a http server is used for exposing prometheus metrics in /metrics endpoint.")
flag.IntVar(&config.PrometheusPort, "prometheus-server-port", getIntEnv(prometheusPortConfigKey, prometheusPortDefault), "The port for running the prometheus http server.")
Expand Down Expand Up @@ -242,6 +248,15 @@ func ParseCliArgs() (config Config, err error) {
return config, fmt.Errorf("invalid log-level passed: %s Should be one of: info, debug, error", config.LogLevel)
}

if config.LogFormatVersion < MinSupportedLogFormatVersion {
log.Warn().Msgf("Log format version %d is not supported, using format version %d", config.LogFormatVersion, MinSupportedLogFormatVersion)
config.LogFormatVersion = MinSupportedLogFormatVersion
}
if config.LogFormatVersion > MaxSupportedLogFormatVersion {
log.Warn().Msgf("Log format version %d is not supported, using format version %d", config.LogFormatVersion, MaxSupportedLogFormatVersion)
config.LogFormatVersion = MaxSupportedLogFormatVersion
}

if config.NodeName == "" {
panic("You must provide a node-name to the CLI or NODE_NAME environment variable.")
}
Expand Down
91 changes: 91 additions & 0 deletions pkg/logging/versioned.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2016-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package logging

import (
"fmt"

"github.com/aws/aws-node-termination-handler/pkg/monitor"
"github.com/rs/zerolog/log"
)

type versionedMsgsV1 struct{}

func (versionedMsgsV1) MonitoringStarted(monitorKind string) {
log.Info().Str("event_type", monitorKind).Msg("Started monitoring for events")
}

func (versionedMsgsV1) ProblemMonitoringForEvents(monitorKind string, err error) {
log.Warn().Str("event_type", monitorKind).Err(err).Msg("There was a problem monitoring for events")
}

func (versionedMsgsV1) RequestingInstanceDrain(event *monitor.InterruptionEvent) {
log.Info().
Str("event-id", event.EventID).
Str("kind", event.Kind).
Str("node-name", event.NodeName).
Str("instance-id", event.InstanceID).
Str("provider-id", event.ProviderID).
Msg("Requesting instance drain")
}

func (versionedMsgsV1) SendingInterruptionEventToChannel(_ string) {
log.Debug().Msg("Sending SQS_TERMINATE interruption event to the interruption channel")
}

type versionedMsgsV2 struct{}

func (versionedMsgsV2) MonitoringStarted(monitorKind string) {
log.Info().Str("monitor_type", monitorKind).Msg("Started monitoring for events")
}

func (versionedMsgsV2) ProblemMonitoringForEvents(monitorKind string, err error) {
log.Warn().Str("monitor_type", monitorKind).Err(err).Msg("There was a problem monitoring for events")
}

func (versionedMsgsV2) RequestingInstanceDrain(event *monitor.InterruptionEvent) {
log.Info().
Str("event-id", event.EventID).
Str("kind", event.Kind).
Str("monitor", event.Monitor).
Str("node-name", event.NodeName).
Str("instance-id", event.InstanceID).
Str("provider-id", event.ProviderID).
Msg("Requesting instance drain")
}

func (versionedMsgsV2) SendingInterruptionEventToChannel(eventKind string) {
log.Debug().Msgf("Sending %s interruption event to the interruption channel", eventKind)
}

var VersionedMsgs interface {
MonitoringStarted(monitorKind string)
ProblemMonitoringForEvents(monitorKind string, err error)
RequestingInstanceDrain(event *monitor.InterruptionEvent)
SendingInterruptionEventToChannel(eventKind string)
} = versionedMsgsV1{}

func SetFormatVersion(version int) error {
switch version {
case 1:
VersionedMsgs = versionedMsgsV1{}
return nil
case 2:
VersionedMsgs = versionedMsgsV2{}
return nil
default:
VersionedMsgs = versionedMsgsV1{}
return fmt.Errorf("Unrecognized log format version: %d, using version 1", version)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ import (
"github.com/aws/aws-node-termination-handler/pkg/node"
)

const (
// RebalanceRecommendationKind is a const to define a Rebalance Recommendation kind of event
RebalanceRecommendationKind = "REBALANCE_RECOMMENDATION"
)
// RebalanceRecommentadionMonitorKind is a const to define this monitor kind
const RebalanceRecommendationMonitorKind = "REBALANCE_RECOMMENDATION_MONITOR"

// RebalanceRecommendationMonitor is a struct definition which facilitates monitoring of rebalance recommendations from IMDS
type RebalanceRecommendationMonitor struct {
Expand All @@ -50,15 +48,15 @@ func (m RebalanceRecommendationMonitor) Monitor() error {
if err != nil {
return err
}
if interruptionEvent != nil && interruptionEvent.Kind == RebalanceRecommendationKind {
if interruptionEvent != nil && interruptionEvent.Kind == monitor.RebalanceRecommendationKind {
m.InterruptionChan <- *interruptionEvent
}
return nil
}

// Kind denotes the kind of event that is processed
// Kind denotes the kind of monitor
func (m RebalanceRecommendationMonitor) Kind() string {
return RebalanceRecommendationKind
return RebalanceRecommendationMonitorKind
}

// checkForRebalanceRecommendation Checks EC2 instance metadata for a rebalance recommendation
Expand Down Expand Up @@ -86,7 +84,8 @@ func (m RebalanceRecommendationMonitor) checkForRebalanceRecommendation() (*moni

return &monitor.InterruptionEvent{
EventID: fmt.Sprintf("rebalance-recommendation-%x", hash.Sum(nil)),
Kind: RebalanceRecommendationKind,
Kind: monitor.RebalanceRecommendationKind,
Monitor: RebalanceRecommendationMonitorKind,
StartTime: noticeTime,
NodeName: nodeName,
Description: fmt.Sprintf("Rebalance recommendation received. Instance will be cordoned at %s \n", rebalanceRecommendation.NoticeTime),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ func TestMonitor_Success(t *testing.T) {

go func() {
result := <-drainChan
h.Equals(t, rebalancerecommendation.RebalanceRecommendationKind, result.Kind)
h.Equals(t, monitor.RebalanceRecommendationKind, result.Kind)
h.Equals(t, rebalancerecommendation.RebalanceRecommendationMonitorKind, result.Monitor)
h.Equals(t, expFormattedTime, result.StartTime.String())
h.Assert(t, strings.Contains(result.Description, startTime),
"Expected description to contain: "+startTime+" but is actually: "+result.Description)
Expand Down
11 changes: 6 additions & 5 deletions pkg/monitor/scheduledevent/scheduled-event-monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ import (
)

const (
// ScheduledEventKind is a const to define a scheduled event kind of interruption event
ScheduledEventKind = "SCHEDULED_EVENT"
// ScheduledEventMonitorKind is a const to define this monitor kind
ScheduledEventMonitorKind = "SCHEDULED_EVENT_MONITOR"
scheduledEventStateCompleted = "completed"
scheduledEventStateCanceled = "canceled"
scheduledEventDateFormat = "2 Jan 2006 15:04:05 GMT"
Expand Down Expand Up @@ -69,9 +69,9 @@ func (m ScheduledEventMonitor) Monitor() error {
return nil
}

// Kind denotes the kind of event that is processed
// Kind denotes the kind of monitor
func (m ScheduledEventMonitor) Kind() string {
return ScheduledEventKind
return ScheduledEventMonitorKind
}

// checkForScheduledEvents Checks EC2 instance metadata for a scheduled event requiring a node drain
Expand Down Expand Up @@ -101,7 +101,8 @@ func (m ScheduledEventMonitor) checkForScheduledEvents() ([]monitor.Interruption
}
events = append(events, monitor.InterruptionEvent{
EventID: scheduledEvent.EventID,
Kind: ScheduledEventKind,
Kind: monitor.ScheduledEventKind,
Monitor: ScheduledEventMonitorKind,
Description: fmt.Sprintf("%s will occur between %s and %s because %s\n", scheduledEvent.Code, scheduledEvent.NotBefore, scheduledEvent.NotAfter, scheduledEvent.Description),
State: scheduledEvent.State,
NodeName: m.NodeName,
Expand Down
7 changes: 4 additions & 3 deletions pkg/monitor/scheduledevent/scheduled-event-monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ func TestMonitor_Success(t *testing.T) {
go func() {
result := <-drainChan
h.Equals(t, scheduledEventId, result.EventID)
h.Equals(t, scheduledevent.ScheduledEventKind, result.Kind)
h.Equals(t, monitor.ScheduledEventKind, result.Kind)
h.Equals(t, scheduledevent.ScheduledEventMonitorKind, result.Monitor)
h.Equals(t, scheduledEventState, result.State)
h.TimeWithinRange(t, result.StartTime, oneSecondAgo(), time.Now())
h.Equals(t, expScheduledEventEndTimeFmt, result.EndTime.String())
Expand Down Expand Up @@ -130,7 +131,7 @@ func TestMonitor_CanceledEvent(t *testing.T) {
go func() {
result := <-cancelChan
h.Equals(t, scheduledEventId, result.EventID)
h.Equals(t, scheduledevent.ScheduledEventKind, result.Kind)
h.Equals(t, monitor.ScheduledEventKind, result.Kind)
h.Equals(t, state, result.State)
h.TimeWithinRange(t, result.StartTime, oneSecondAgo(), time.Now())
h.Equals(t, expScheduledEventEndTimeFmt, result.EndTime.String())
Expand Down Expand Up @@ -257,7 +258,7 @@ func TestMonitor_EndTimeParseFail(t *testing.T) {
go func() {
result := <-drainChan
h.Equals(t, scheduledEventId, result.EventID)
h.Equals(t, scheduledevent.ScheduledEventKind, result.Kind)
h.Equals(t, monitor.ScheduledEventKind, result.Kind)
h.Equals(t, scheduledEventState, result.State)
h.TimeWithinRange(t, result.StartTime, oneSecondAgo(), time.Now())
h.Equals(t, expScheduledEventStartTimeFmt, result.EndTime.String())
Expand Down
15 changes: 7 additions & 8 deletions pkg/monitor/spotitn/spot-itn-monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ import (
"github.com/aws/aws-node-termination-handler/pkg/node"
)

const (
// SpotITNKind is a const to define a Spot ITN kind of interruption event
SpotITNKind = "SPOT_ITN"
)
// SpotITNMonitorKind is a const to define this monitor kind
const SpotITNMonitorKind = "SPOT_ITN_MONITOR"

// SpotInterruptionMonitor is a struct definition which facilitates monitoring of spot ITNs from IMDS
type SpotInterruptionMonitor struct {
Expand All @@ -52,15 +50,15 @@ func (m SpotInterruptionMonitor) Monitor() error {
if err != nil {
return err
}
if interruptionEvent != nil && interruptionEvent.Kind == SpotITNKind {
if interruptionEvent != nil && interruptionEvent.Kind == monitor.SpotITNKind {
m.InterruptionChan <- *interruptionEvent
}
return nil
}

// Kind denotes the kind of event that is processed
// Kind denotes the kind of monitor
func (m SpotInterruptionMonitor) Kind() string {
return SpotITNKind
return SpotITNMonitorKind
}

// checkForSpotInterruptionNotice Checks EC2 instance metadata for a spot interruption termination notice
Expand Down Expand Up @@ -88,7 +86,8 @@ func (m SpotInterruptionMonitor) checkForSpotInterruptionNotice() (*monitor.Inte

return &monitor.InterruptionEvent{
EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)),
Kind: SpotITNKind,
Kind: monitor.SpotITNKind,
Monitor: SpotITNMonitorKind,
StartTime: interruptionTime,
NodeName: nodeName,
Description: fmt.Sprintf("Spot ITN received. Instance will be interrupted at %s \n", instanceAction.Time),
Expand Down
Loading