diff --git a/Dockerfile.windows b/Dockerfile.windows new file mode 100644 index 00000000..7c97ca5e --- /dev/null +++ b/Dockerfile.windows @@ -0,0 +1,31 @@ +ARG WINDOWS_VERSION=1903 + +# Build the manager binary +FROM --platform=windows/amd64 golang:1.14 as builder + +## GOLANG env +ENV GO111MODULE="on" CGO_ENABLED="0" GOOS="windows" GOARCH="amd64" +ARG GOPROXY="https://proxy.golang.org,direct" + +# Copy go.mod and download dependencies +WORKDIR /node-termination-handler +COPY go.mod . +COPY go.sum . +RUN go mod download + +# Build +COPY . . +RUN go build -a -tags nth${GOOS} -o build/node-termination-handler cmd/node-termination-handler.go + +# In case the target is built for testing: +# $ docker build --target=builder -t test . +ENTRYPOINT ["/node-termination-handler/build/node-termination-handler"] + +# Copy the controller-manager into a thin image +FROM mcr.microsoft.com/windows/nanoserver:${WINDOWS_VERSION} +WORKDIR / +COPY --from=builder /windows/system32/netapi32.dll /windows/system32/ +COPY --from=builder /node-termination-handler/build/node-termination-handler . +COPY THIRD_PARTY_LICENSES . +ENTRYPOINT ["/node-termination-handler"] + diff --git a/Makefile b/Makefile index 59e71580..2b8e1bbf 100644 --- a/Makefile +++ b/Makefile @@ -80,6 +80,9 @@ sync-readme-to-dockerhub: unit-test: create-build-dir go test -bench=. ${MAKEFILE_PATH}/... -v -coverprofile=coverage.txt -covermode=atomic -outputdir=${BUILD_DIR_PATH} +unit-test-linux: + ${MAKEFILE_PATH}/scripts/run-unit-tests-in-docker + build: create-build-dir compile helm-tests: helm-sync-test helm-version-sync-test diff --git a/config/helm/aws-node-termination-handler/README.md b/config/helm/aws-node-termination-handler/README.md index f37d9f27..b050945e 100644 --- a/config/helm/aws-node-termination-handler/README.md +++ b/config/helm/aws-node-termination-handler/README.md @@ -72,12 +72,20 @@ Parameter | Description | Default `taintNode` | If true, nodes will be tainted when an interruption event occurs. Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance` and `aws-node-termination-handler/spot-itn` | `false` `jsonLogging` | If true, use JSON-formatted logs instead of human readable logs. | `false` `affinity` | node/pod affinities | None +`linuxAffinity` | Linux node/pod affinities | None +`windowsAffinity` | Windows node/pod affinities | None `podAnnotations` | annotations to add to each pod | `{}` +`linuxPodAnnotations` | Linux annotations to add to each pod | `{}` +`windowsPodAnnotations` | Windows annotations to add to each pod | `{}` `podLabels` | labels to add to each pod | `{}` +`linuxPodLabels` | labels to add to each Linux pod | `{}` +`windowsPodLabels` | labels to add to each Windows pod | `{}` `priorityClassName` | Name of the priorityClass | `system-node-critical` `resources` | Resources for the pods | `requests.cpu: 50m, requests.memory: 64Mi, limits.cpu: 100m, limits.memory: 128Mi` -`dnsPolicy` | DaemonSet DNS policy | `ClusterFirstWithHostNet` -`nodeSelector` | Tells the daemon set where to place the node-termination-handler pods. For example: `lifecycle: "Ec2Spot"`, `on-demand: "false"`, `aws.amazon.com/purchaseType: "spot"`, etc. Value must be a valid yaml expression. | `{}` +`dnsPolicy` | DaemonSet DNS policy | Linux: `ClusterFirstWithHostNet`, Windows: `ClusterFirst` +`nodeSelector` | Tells all daemon sets where to place the node-termination-handler pods. For example: `lifecycle: "Ec2Spot"`, `on-demand: "false"`, `aws.amazon.com/purchaseType: "spot"`, etc. 
Value must be a valid yaml expression. | `{}` +`linuxNodeSelector` | Tells the Linux daemon set where to place the node-termination-handler pods. For example: `lifecycle: "Ec2Spot"`, `on-demand: "false"`, `aws.amazon.com/purchaseType: "spot"`, etc. Value must be a valid yaml expression. | `{}` +`windowsNodeSelector` | Tells the Windows daemon set where to place the node-termination-handler pods. For example: `lifecycle: "Ec2Spot"`, `on-demand: "false"`, `aws.amazon.com/purchaseType: "spot"`, etc. Value must be a valid yaml expression. | `{}` `tolerations` | list of node taints to tolerate | `[ {"operator": "Exists"} ]` `rbac.create` | if `true`, create and use RBAC resources | `true` `rbac.pspEnabled` | If `true`, create and use a restricted pod security policy | `false` @@ -86,9 +94,10 @@ Parameter | Description | Default `serviceAccount.annotations` | Specifies the annotations for ServiceAccount | `{}` `procUptimeFile` | (Used for Testing) Specify the uptime file | `/proc/uptime` `securityContext.runAsUserID` | User ID to run the container | `1000` -`securityContext.runAsGroupID` | Group ID to run the container | `1000` -`nodeSelectorTermsOs` | Operating System Node Selector Key | `beta.kubernetes.io/os` -`nodeSelectorTermsArch` | CPU Architecture Node Selector Key | `beta.kubernetes.io/arch` +`securityContext.runAsGroupID` | Group ID to run the container | `1000` +`nodeSelectorTermsOs` | Operating System Node Selector Key | >=1.14: `kubernetes.io/os`, <1.14: `beta.kubernetes.io/os` +`nodeSelectorTermsArch` | CPU Architecture Node Selector Key | >=1.14: `kubernetes.io/arch`, <1.14: `beta.kubernetes.io/arch` +`targetNodeOs` | Space separated list of node OS's to target, e.g. "linux", "windows", "linux windows". Note: Windows support is experimental. | `"linux"` `enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false` `prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092` diff --git a/config/helm/aws-node-termination-handler/templates/_helpers.tpl b/config/helm/aws-node-termination-handler/templates/_helpers.tpl index 902844a7..15acd5c6 100644 --- a/config/helm/aws-node-termination-handler/templates/_helpers.tpl +++ b/config/helm/aws-node-termination-handler/templates/_helpers.tpl @@ -24,6 +24,14 @@ If release name contains chart name it will be used as a full name. {{- end -}} {{- end -}} +{{/* +Equivalent to "aws-node-termination-handler.fullname" except that "-win" indicator is appended to the end. +Name will not exceed 63 characters. +*/}} +{{- define "aws-node-termination-handler.fullname.windows" -}} +{{- include "aws-node-termination-handler.fullname" . | trunc 59 | trimSuffix "-" | printf "%s-win" -}} +{{- end -}} + {{/* Common labels */}} @@ -55,3 +63,41 @@ Create the name of the service account to use {{ default "default" .Values.serviceAccount.name }} {{- end -}} {{- end -}} + +{{/* +Get the default node selector term prefix. + +In 1.14 "beta.kubernetes.io" was deprecated and is scheduled for removal in 1.18. +See https://v1-14.docs.kubernetes.io/docs/setup/release/notes/#deprecations +*/}} +{{- define "aws-node-termination-handler.defaultNodeSelectorTermsPrefix" -}} + {{- semverCompare "<1.14" .Capabilities.KubeVersion.Version | ternary "beta.kubernetes.io" "kubernetes.io" -}} +{{- end -}} + +{{/* +Get the default node selector OS term. 
+*/}} +{{- define "aws-node-termination-handler.defaultNodeSelectorTermsOs" -}} + {{- list (include "aws-node-termination-handler.defaultNodeSelectorTermsPrefix" .) "os" | join "/" -}} +{{- end -}} + +{{/* +Get the default node selector Arch term. +*/}} +{{- define "aws-node-termination-handler.defaultNodeSelectorTermsArch" -}} + {{- list (include "aws-node-termination-handler.defaultNodeSelectorTermsPrefix" .) "arch" | join "/" -}} +{{- end -}} + +{{/* +Get the node selector OS term. +*/}} +{{- define "aws-node-termination-handler.nodeSelectorTermsOs" -}} + {{- or .Values.nodeSelectorTermsOs (include "aws-node-termination-handler.defaultNodeSelectorTermsOs" .) -}} +{{- end -}} + +{{/* +Get the node selector Arch term. +*/}} +{{- define "aws-node-termination-handler.nodeSelectorTermsArch" -}} + {{- or .Values.nodeSelectorTermsArch (include "aws-node-termination-handler.defaultNodeSelectorTermsArch" .) -}} +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml similarity index 68% rename from config/helm/aws-node-termination-handler/templates/daemonset.yaml rename to config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml index a8bd41aa..10c6442d 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml @@ -1,3 +1,4 @@ +{{- if (lower .Values.targetNodeOs | contains "linux") -}} apiVersion: apps/v1 kind: DaemonSet metadata: @@ -6,17 +7,25 @@ metadata: labels: {{ include "aws-node-termination-handler.labels" . | indent 4 }} spec: + {{- if (or .Values.updateStrategy .Values.linuxUpdateStrategy) }} updateStrategy: -{{ toYaml .Values.updateStrategy | indent 4 }} + {{- with .Values.updateStrategy }} + {{- toYaml . | indent 4 }} + {{- end }} + {{- with .Values.linuxUpdateStrategy }} + {{- toYaml . | indent 4 }} + {{- end }} + {{- end }} selector: matchLabels: app.kubernetes.io/name: {{ include "aws-node-termination-handler.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} + {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux template: metadata: - {{- if .Values.podAnnotations }} + {{- if (or .Values.podAnnotations .Values.linuxPodAnnotations) }} annotations: - {{- range $key, $value := .Values.podAnnotations }} + {{- range $key, $value := (mergeOverwrite (dict) .Values.podAnnotations .Values.linuxPodAnnotations) }} {{ $key }}: {{ $value | quote }} {{- end }} {{- end }} @@ -24,38 +33,43 @@ spec: app.kubernetes.io/name: {{ include "aws-node-termination-handler.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} k8s-app: aws-node-termination-handler - {{- range $key, $value := .Values.podLabels }} + {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . 
}}: linux + {{- range $key, $value := (mergeOverwrite (dict) .Values.podLabels .Values.linuxPodLabels) }} {{ $key }}: {{ $value | quote }} {{- end }} spec: volumes: - name: "uptime" hostPath: - path: "{{ .Values.procUptimeFile }}" - priorityClassName: "{{ .Values.priorityClassName }}" + path: {{ .Values.procUptimeFile | default "/proc/uptime" | quote }} + priorityClassName: {{ .Values.priorityClassName | quote }} affinity: nodeAffinity: - # NOTE(jaypipes): Change when we complete - # https://github.com/aws/aws-node-termination-handler/issues/8 requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: {{ .Values.nodeSelectorTermsOs | default "beta.kubernetes.io/os" | quote }} - operator: In - values: - - linux - - key: {{ .Values.nodeSelectorTermsArch | default "beta.kubernetes.io/arch" | quote }} - operator: In - values: - - amd64 - - arm - - arm64 + - key: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . | quote }} + operator: In + values: + - linux + - key: {{ include "aws-node-termination-handler.nodeSelectorTermsArch" . | quote }} + operator: In + values: + - amd64 + - arm64 + - arm + {{- with .Values.affinity }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.linuxAffinity }} + {{- toYaml . | nindent 8 }} + {{- end }} serviceAccountName: {{ template "aws-node-termination-handler.serviceAccountName" . }} hostNetwork: true - dnsPolicy: {{ .Values.dnsPolicy }} + dnsPolicy: {{ .Values.dnsPolicy | default "ClusterFirstWithHostNet" | quote }} containers: - name: {{ include "aws-node-termination-handler.name" . }} - image: {{ .Values.image.repository}}:{{ .Values.image.tag }} + image: {{ .Values.image.repository }}:{{ .Values.image.tag }} imagePullPolicy: {{ .Values.image.pullPolicy }} securityContext: readOnlyRootFilesystem: true @@ -65,7 +79,7 @@ spec: allowPrivilegeEscalation: false volumeMounts: - name: "uptime" - mountPath: "/proc/uptime" + mountPath: {{ .Values.procUptimeFile | default "/proc/uptime" | quote }} readOnly: true env: - name: NODE_NAME @@ -125,27 +139,30 @@ spec: value: {{ .Values.jsonLogging | quote }} - name: WEBHOOK_PROXY value: {{ .Values.webhookProxy | quote }} + - name: UPTIME_FROM_FILE + value: {{ .Values.procUptimeFile | quote }} - name: ENABLE_PROMETHEUS_SERVER value: {{ .Values.enablePrometheusServer | quote }} - name: PROMETHEUS_SERVER_PORT value: {{ .Values.prometheusServerPort | quote }} resources: {{- toYaml .Values.resources | nindent 12 }} - {{- with .Values.nodeSelector }} nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} + {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux + {{- with .Values.nodeSelector }} + {{- . | nindent 8 }} + {{- end }} + {{- with .Values.linuxNodeSelector }} + {{- . | nindent 8 }} + {{- end }} {{- if .Values.image.pullSecrets }} imagePullSecrets: {{- range .Values.image.pullSecrets }} - name: {{ . }} {{- end }} {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} {{- with .Values.tolerations }} tolerations: {{- toYaml . 
| nindent 8 }} {{- end }} +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml new file mode 100644 index 00000000..e84d2f2d --- /dev/null +++ b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml @@ -0,0 +1,144 @@ +{{- if (lower .Values.targetNodeOs | contains "windows") -}} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "aws-node-termination-handler.fullname.windows" . }} + namespace: {{ .Release.Namespace }} + labels: +{{ include "aws-node-termination-handler.labels" . | indent 4 }} +spec: + {{- if (or .Values.updateStrategy .Values.windowsUpdateStrategy) }} + updateStrategy: + {{- with .Values.updateStrategy }} + {{- toYaml . | indent 4 }} + {{- end }} + {{- with .Values.windowsUpdateStrategy }} + {{- toYaml . | indent 4 }} + {{- end }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "aws-node-termination-handler.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows + template: + metadata: + {{- if (or .Values.podAnnotations .Values.windowsPodAnnotations) }} + annotations: + {{- range $key, $value := (mergeOverwrite (dict) .Values.podAnnotations .Values.windowsPodAnnotations) }} + {{ $key }}: {{ $value | quote }} + {{- end }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "aws-node-termination-handler.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + k8s-app: aws-node-termination-handler + {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows + {{- range $key, $value := (mergeOverwrite (dict) .Values.podLabels .Values.windowsPodLabels) }} + {{ $key }}: {{ $value | quote }} + {{- end }} + spec: + priorityClassName: {{ .Values.priorityClassName | quote }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . | quote }} + operator: In + values: + - windows + - key: {{ include "aws-node-termination-handler.nodeSelectorTermsArch" . | quote }} + operator: In + values: + - amd64 + {{- with .Values.affinity }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.windowsAffinity }} + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ template "aws-node-termination-handler.serviceAccountName" . }} + dnsPolicy: {{ .Values.dnsPolicy | default "ClusterFirst" | quote }} + containers: + - name: {{ include "aws-node-termination-handler.name" . 
}} + image: {{ .Values.image.repository }}:{{ .Values.image.tag }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: SPOT_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: DELETE_LOCAL_DATA + value: {{ .Values.deleteLocalData | quote }} + - name: IGNORE_DAEMON_SETS + value: {{ .Values.ignoreDaemonSets | quote }} + - name: GRACE_PERIOD + value: {{ .Values.gracePeriod | quote }} + - name: POD_TERMINATION_GRACE_PERIOD + value: {{ .Values.podTerminationGracePeriod | quote }} + - name: INSTANCE_METADATA_URL + value: {{ .Values.instanceMetadataURL | quote }} + - name: NODE_TERMINATION_GRACE_PERIOD + value: {{ .Values.nodeTerminationGracePeriod | quote }} + - name: WEBHOOK_URL + value: {{ .Values.webhookURL | quote }} + - name: WEBHOOK_HEADERS + value: {{ .Values.webhookHeaders | quote }} + - name: WEBHOOK_TEMPLATE + value: {{ .Values.webhookTemplate | quote }} + - name: DRY_RUN + value: {{ .Values.dryRun | quote }} + - name: ENABLE_SPOT_INTERRUPTION_DRAINING + value: {{ .Values.enableSpotInterruptionDraining | quote }} + - name: ENABLE_SCHEDULED_EVENT_DRAINING + value: {{ .Values.enableScheduledEventDraining | quote }} + - name: METADATA_TRIES + value: {{ .Values.metadataTries | quote }} + - name: CORDON_ONLY + value: {{ .Values.cordonOnly | quote }} + - name: TAINT_NODE + value: {{ .Values.taintNode | quote }} + - name: JSON_LOGGING + value: {{ .Values.jsonLogging | quote }} + - name: WEBHOOK_PROXY + value: {{ .Values.webhookProxy | quote }} + - name: UPTIME_FROM_FILE + value: {{ .Values.procUptimeFile | quote }} + - name: ENABLE_PROMETHEUS_SERVER + value: {{ .Values.enablePrometheusServer | quote }} + - name: PROMETHEUS_SERVER_PORT + value: {{ .Values.prometheusServerPort | quote }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + nodeSelector: + {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows + {{- with .Values.nodeSelector }} + {{- . | nindent 8 }} + {{- end }} + {{- with .Values.windowsNodeSelector }} + {{- . | nindent 8 }} + {{- end }} + {{- if .Values.image.pullSecrets }} + imagePullSecrets: + {{- range .Values.image.pullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml index 2156f66b..c4886a49 100644 --- a/config/helm/aws-node-termination-handler/values.yaml +++ b/config/helm/aws-node-termination-handler/values.yaml @@ -18,7 +18,12 @@ fullnameOverride: "" priorityClassName: system-node-critical podAnnotations: {} +linuxPodAnnotations: {} +windowsPodAnnotations: {} + podLabels: {} +linuxPodLabels: {} +windowsPodLabels: {} resources: requests: @@ -28,7 +33,7 @@ resources: memory: "128Mi" cpu: "100m" -## enableSpotInterruptionDraining If true, drain nodes when the spot interruption termination notice is receieved +## enableSpotInterruptionDraining If true, drain nodes when the spot interruption termination notice is received enableSpotInterruptionDraining: "" ## enableScheduledEventDraining [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event @@ -70,11 +75,20 @@ webhookTemplate: "" instanceMetadataURL: "" # (TESTING USE): Mount path for uptime file -procUptimeFile: "/proc/uptime" +procUptimeFile: "" + +# Create node OS specific daemonset(s). (e.g. "linux", "windows", "linux windows") +targetNodeOs: "linux" -# nodeSelector tells the daemonset where to place the node-termination-handler +# nodeSelector tells both linux and windows daemonsets where to place the node-termination-handler # pods. By default, this value is empty and every node will receive a pod. nodeSelector: {} +# linuxNodeSelector tells the linux daemonset where to place the node-termination-handler +# pods. By default, this value is empty and every linux node will receive a pod. +linuxNodeSelector: {} +# windowsNodeSelector tells the windows daemonset where to place the node-termination-handler +# pods. By default, this value is empty and every windows node will receive a pod. +windowsNodeSelector: {} nodeSelectorTermsOs: "" nodeSelectorTermsArch: "" @@ -86,6 +100,8 @@ tolerations: - operator: "Exists" affinity: {} +linuxAffinity: {} +windowsAffinity: {} serviceAccount: # Specifies whether a service account should be created @@ -100,4 +116,4 @@ rbac: # rbac.pspEnabled: `true` if PodSecurityPolicy resources should be created pspEnabled: true -dnsPolicy: "ClusterFirstWithHostNet" +dnsPolicy: "" diff --git a/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml b/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml index 379e1793..be9b1254 100644 --- a/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml +++ b/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml @@ -1,4 +1,6 @@ {{- if .Values.ec2MetadataTestProxy.create -}} +{{- $isWindows := (contains "windows" .Values.targetNodeOs) -}} +{{- $osSelector := (semverCompare "<1.14" .Capabilities.KubeVersion.Version | ternary "beta.kubernetes.io/os" "kubernetes.io/os") -}} apiVersion: apps/v1 kind: DaemonSet metadata: @@ -15,11 +17,15 @@ spec: app: {{ .Values.ec2MetadataTestProxy.label }} spec: serviceAccountName: {{ template "ec2-metadata-test-proxy.serviceAccountName" . 
}} + nodeSelector: + {{ $osSelector }}: {{ $isWindows | ternary "windows" "linux" }} + {{- if (not $isWindows) }} hostNetwork: true + {{- end }} containers: - name: {{ .Values.ec2MetadataTestProxy.label }} image: {{ .Values.ec2MetadataTestProxy.image.repository }}:{{ .Values.ec2MetadataTestProxy.image.tag }} - imagePullPolicy: IfNotPresent + imagePullPolicy: {{ .Values.ec2MetadataTestProxy.image.pullPolicy }} ports: - containerPort: {{ .Values.ec2MetadataTestProxy.port }} hostPort: {{ .Values.ec2MetadataTestProxy.port }} diff --git a/config/helm/ec2-metadata-test-proxy/templates/regular-pod-test.yaml b/config/helm/ec2-metadata-test-proxy/templates/regular-pod-test.yaml index 69ccd562..18fe1dc5 100644 --- a/config/helm/ec2-metadata-test-proxy/templates/regular-pod-test.yaml +++ b/config/helm/ec2-metadata-test-proxy/templates/regular-pod-test.yaml @@ -1,4 +1,6 @@ {{- if .Values.regularPodTest.create -}} +{{- $isWindows := (contains "windows" .Values.targetNodeOs) -}} +{{- $osSelector := (semverCompare "<1.14" .Capabilities.KubeVersion.Version | ternary "beta.kubernetes.io/os" "kubernetes.io/os") -}} apiVersion: apps/v1 kind: Deployment metadata: @@ -14,15 +16,19 @@ spec: labels: app: {{ .Values.regularPodTest.label }} spec: + nodeSelector: + {{ $osSelector }}: {{ $isWindows | ternary "windows" "linux" }} + {{- if (not $isWindows) }} securityContext: runAsUser: 1000 runAsGroup: 3000 fsGroup: 2000 + {{- end }} serviceAccountName: {{ template "ec2-metadata-test-proxy.serviceAccountName" . }} containers: - name: {{ .Values.regularPodTest.label }} image: {{ .Values.ec2MetadataTestProxy.image.repository }}:{{ .Values.ec2MetadataTestProxy.image.tag }} - imagePullPolicy: IfNotPresent + imagePullPolicy: {{ .Values.ec2MetadataTestProxy.image.pullPolicy }} env: - name: PORT value: {{ .Values.regularPodTest.port | quote }} diff --git a/config/helm/ec2-metadata-test-proxy/values.yaml b/config/helm/ec2-metadata-test-proxy/values.yaml index 6451d615..3875faae 100644 --- a/config/helm/ec2-metadata-test-proxy/values.yaml +++ b/config/helm/ec2-metadata-test-proxy/values.yaml @@ -22,8 +22,10 @@ ec2MetadataTestProxy: image: repository: ec2-metadata-test-proxy tag: customtest + pullPolicy: IfNotPresent tolerations: [] regularPodTest: create: true label: regular-pod-test port: 1339 +targetNodeOs: "linux" diff --git a/go.mod b/go.mod index 0cc36d7c..89062694 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( go.opentelemetry.io/contrib/instrumentation/runtime v0.6.1 go.opentelemetry.io/otel v0.6.0 go.opentelemetry.io/otel/exporters/metric/prometheus v0.6.0 + golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f golang.org/x/time v0.0.0-20190921001708-c4c64cad1fd0 // indirect k8s.io/api v0.0.0-20191010143144-fbf594f18f80 k8s.io/apimachinery v0.0.0-20191016060620-86f2f1b9c076 diff --git a/pkg/config/config.go b/pkg/config/config.go index d93515a9..88a3c2c2 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -55,7 +55,8 @@ const ( taintNode = "TAINT_NODE" jsonLoggingConfigKey = "JSON_LOGGING" jsonLoggingDefault = false - + uptimeFromFileConfigKey = "UPTIME_FROM_FILE" + uptimeFromFileDefault = "" // prometheus enablePrometheusDefault = false enablePrometheusConfigKey = "ENABLE_PROMETHEUS_SERVER" @@ -85,6 +86,7 @@ type Config struct { CordonOnly bool TaintNode bool JsonLogging bool + UptimeFromFile string EnablePrometheus bool PrometheusPort int } @@ -120,6 +122,7 @@ func ParseCliArgs() (config Config, err error) { flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If 
true, nodes will be cordoned but not drained when an interruption event occurs.") flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.") flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.") + flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).") flag.BoolVar(&config.EnablePrometheus, "enable-prometheus-server", getBoolEnv(enablePrometheusConfigKey, enablePrometheusDefault), "If true, a http server is used for exposing prometheus metrics in /metrics endpoint.") flag.IntVar(&config.PrometheusPort, "prometheus-server-port", getIntEnv(prometheusPortConfigKey, prometheusPortDefault), "The port for running the prometheus http server.") @@ -159,6 +162,7 @@ func ParseCliArgs() (config Config, err error) { "\ttaint-node: %t,\n"+ "\tjson-logging: %t,\n"+ "\twebhook-proxy: %s,\n"+ + "\tuptime-from-file: %s,\n"+ "\tenable-prometheus-server: %t,\n"+ "\tprometheus-server-port: %d,\n", config.DryRun, @@ -177,6 +181,7 @@ func ParseCliArgs() (config Config, err error) { config.TaintNode, config.JsonLogging, config.WebhookProxy, + config.UptimeFromFile, config.EnablePrometheus, config.PrometheusPort, ) diff --git a/pkg/interruptionevent/scheduled-event_internal_test.go b/pkg/interruptionevent/scheduled-event_internal_test.go index 1eaeeb94..48872f4a 100644 --- a/pkg/interruptionevent/scheduled-event_internal_test.go +++ b/pkg/interruptionevent/scheduled-event_internal_test.go @@ -22,6 +22,7 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/config" "github.com/aws/aws-node-termination-handler/pkg/node" h "github.com/aws/aws-node-termination-handler/pkg/test" + "github.com/aws/aws-node-termination-handler/pkg/uptime" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" @@ -58,7 +59,7 @@ func getNthConfig(t *testing.T) config.Config { } func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node { - tNode, err := node.NewWithValues(getNthConfig(t), drainHelper) + tNode, err := node.NewWithValues(getNthConfig(t), drainHelper, uptime.Uptime) if err != nil { t.Error("failed to create node") } @@ -78,7 +79,7 @@ func TestUncordonAfterRebootPreDrainSuccess(t *testing.T) { _, err := client.CoreV1().Nodes().Create(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}) h.Ok(t, err) - tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client)) + tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) h.Ok(t, err) err = uncordonAfterRebootPreDrain(drainEvent, *tNode) diff --git a/pkg/interruptionevent/spot-itn-event_internal_test.go b/pkg/interruptionevent/spot-itn-event_internal_test.go index ab2b5bb9..5a4aa516 100644 --- a/pkg/interruptionevent/spot-itn-event_internal_test.go +++ b/pkg/interruptionevent/spot-itn-event_internal_test.go @@ -21,6 +21,7 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/config" "github.com/aws/aws-node-termination-handler/pkg/node" h "github.com/aws/aws-node-termination-handler/pkg/test" + "github.com/aws/aws-node-termination-handler/pkg/uptime" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" @@ -55,7 +56,7 @@ 
func TestSetInterruptionTaint(t *testing.T) { _, err := client.CoreV1().Nodes().Create(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: spotNodeName}}) h.Ok(t, err) - tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client)) + tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client), uptime.Uptime) h.Ok(t, err) err = setInterruptionTaint(drainEvent, *tNode) @@ -86,7 +87,7 @@ func TestInterruptionTaintAlreadyPresent(t *testing.T) { _, err := client.CoreV1().Nodes().Create(newNode) h.Ok(t, err) - tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client)) + tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client), uptime.Uptime) h.Ok(t, err) err = setInterruptionTaint(drainEvent, *tNode) diff --git a/pkg/node/node.go b/pkg/node/node.go index 5e8e6870..c743cd4f 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -16,13 +16,13 @@ package node import ( "encoding/json" "fmt" - "io/ioutil" "os" "strconv" "strings" "time" "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/uptime" "github.com/rs/zerolog/log" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -58,12 +58,11 @@ var ( conflictRetryInterval time.Duration = 750 * time.Millisecond ) -var uptimeFile = "/proc/uptime" - // Node represents a kubernetes node with functions to manipulate its state via the kubernetes api server type Node struct { nthConfig config.Config drainHelper *drain.Helper + uptime uptime.UptimeFuncType } // New will construct a node struct to perform various node function through the kubernetes api server @@ -72,20 +71,19 @@ func New(nthConfig config.Config) (*Node, error) { if err != nil { return nil, err } - return &Node{ - nthConfig: nthConfig, - drainHelper: drainHelper, - }, nil + return NewWithValues(nthConfig, drainHelper, getUptimeFunc(nthConfig.UptimeFromFile)) } -// NewWithValues will construct a node struct with a drain helper -func NewWithValues(nthConfig config.Config, drainHelper *drain.Helper) (*Node, error) { +// NewWithValues will construct a node struct with a drain helper and an uptime function +func NewWithValues(nthConfig config.Config, drainHelper *drain.Helper, uptime uptime.UptimeFuncType) (*Node, error) { return &Node{ nthConfig: nthConfig, drainHelper: drainHelper, + uptime: uptime, }, nil } +// GetName returns node name from the configuration. 
func (n Node) GetName() string { return n.nthConfig.NodeName } @@ -362,11 +360,11 @@ func (n Node) UncordonIfRebooted() error { secondsSinceLabel := time.Now().Unix() - timeValNum switch actionVal := k8sNode.Labels[ActionLabelKey]; actionVal { case UncordonAfterRebootLabelVal: - uptime, err := getSystemUptime(uptimeFile) + uptime, err := n.uptime() if err != nil { return err } - if secondsSinceLabel < int64(uptime) { + if secondsSinceLabel < uptime { log.Log().Msg("The system has not restarted yet.") return nil } @@ -433,19 +431,6 @@ func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { return drainHelper, nil } -func getSystemUptime(filename string) (float64, error) { - data, err := ioutil.ReadFile(filename) - if err != nil { - return 0, fmt.Errorf("Not able to read %s: %w", filename, err) - } - - uptime, err := strconv.ParseFloat(strings.Split(string(data), " ")[0], 64) - if err != nil { - return 0, fmt.Errorf("Not able to parse %s to Float64: %w", filename, err) - } - return uptime, nil -} - func jsonPatchEscape(value string) string { value = strings.Replace(value, "~", "~0", -1) return strings.Replace(value, "/", "~1", -1) @@ -559,3 +544,12 @@ func removeTaint(node *corev1.Node, client kubernetes.Interface, taintKey string return true, nil } } + +func getUptimeFunc(uptimeFile string) uptime.UptimeFuncType { + if uptimeFile != "" { + return func() (int64, error) { + return uptime.UptimeFromFile(uptimeFile) + } + } + return uptime.Uptime +} diff --git a/pkg/node/node_internal_test.go b/pkg/node/node_internal_test.go index 97cbbc34..fa6caa23 100644 --- a/pkg/node/node_internal_test.go +++ b/pkg/node/node_internal_test.go @@ -23,6 +23,7 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/config" h "github.com/aws/aws-node-termination-handler/pkg/test" + "github.com/aws/aws-node-termination-handler/pkg/uptime" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" @@ -38,6 +39,12 @@ func resetFlagsForTest() { os.Setenv("NODE_NAME", nodeName) } +func getUptimeFromFile(filepath string) uptime.UptimeFuncType { + return func() (int64, error) { + return uptime.UptimeFromFile(filepath) + } +} + func getTestDrainHelper(client *fake.Clientset) *drain.Helper { return &drain.Helper{ Client: client, @@ -59,36 +66,16 @@ func getNthConfig(t *testing.T) config.Config { return nthConfig } -func getNode(t *testing.T, drainHelper *drain.Helper) *Node { - tNode, err := NewWithValues(getNthConfig(t), drainHelper) +func getNode(t *testing.T, drainHelper *drain.Helper, uptime uptime.UptimeFuncType) *Node { + tNode, err := NewWithValues(getNthConfig(t), drainHelper, uptime) if err != nil { t.Error("failed to create node") } return tNode } -func TestGetUptimeSuccess(t *testing.T) { - d1 := []byte("350735.47 234388.90") - ioutil.WriteFile(testFile, d1, 0644) - - value, err := getSystemUptime(testFile) - os.Remove(testFile) - h.Ok(t, err) - h.Equals(t, 350735.47, value) -} - -func TestGetUptimeFailure(t *testing.T) { - d1 := []byte("Something not time") - ioutil.WriteFile(testFile, d1, 0644) - - _, err := getSystemUptime(testFile) - os.Remove(testFile) - h.Assert(t, err != nil, "Failed to throw error for float64 parse") -} - func TestUncordonIfRebootedFileReadError(t *testing.T) { resetFlagsForTest() - uptimeFile = testFile client := fake.NewSimpleClientset() client.CoreV1().Nodes().Create(&v1.Node{ @@ -100,14 +87,13 @@ func TestUncordonIfRebootedFileReadError(t *testing.T) { }, }, }) - tNode := getNode(t, 
getTestDrainHelper(client)) + tNode := getNode(t, getTestDrainHelper(client), getUptimeFromFile("does-not-exist")) err := tNode.UncordonIfRebooted() - h.Assert(t, err != nil, "Failed to return error on UncordonIfReboted failure to read file") + h.Assert(t, err != nil, "Failed to return error on UncordonIfRebooted failure to read file") } func TestUncordonIfRebootedSystemNotRestarted(t *testing.T) { resetFlagsForTest() - uptimeFile = testFile d1 := []byte("350735.47 234388.90") ioutil.WriteFile(testFile, d1, 0644) @@ -121,7 +107,7 @@ func TestUncordonIfRebootedSystemNotRestarted(t *testing.T) { }, }, }) - tNode := getNode(t, getTestDrainHelper(client)) + tNode := getNode(t, getTestDrainHelper(client), getUptimeFromFile(testFile)) err := tNode.UncordonIfRebooted() os.Remove(testFile) h.Ok(t, err) @@ -129,7 +115,6 @@ func TestUncordonIfRebootedSystemNotRestarted(t *testing.T) { func TestUncordonIfRebootedFailureToRemoveLabel(t *testing.T) { resetFlagsForTest() - uptimeFile = testFile d1 := []byte("0 234388.90") ioutil.WriteFile(testFile, d1, 0644) @@ -143,7 +128,7 @@ func TestUncordonIfRebootedFailureToRemoveLabel(t *testing.T) { }, }, }) - tNode := getNode(t, getTestDrainHelper(client)) + tNode := getNode(t, getTestDrainHelper(client), getUptimeFromFile(testFile)) err := tNode.UncordonIfRebooted() os.Remove(testFile) h.Assert(t, err != nil, "Failed to return error on UncordonIfReboted failure remove NTH Label") @@ -151,7 +136,6 @@ func TestUncordonIfRebootedFailureToRemoveLabel(t *testing.T) { func TestUncordonIfRebootedFailureSuccess(t *testing.T) { resetFlagsForTest() - uptimeFile = testFile d1 := []byte("0 234388.90") ioutil.WriteFile(testFile, d1, 0644) @@ -166,8 +150,18 @@ func TestUncordonIfRebootedFailureSuccess(t *testing.T) { }, }, }) - tNode := getNode(t, getTestDrainHelper(client)) + tNode := getNode(t, getTestDrainHelper(client), getUptimeFromFile(testFile)) err := tNode.UncordonIfRebooted() os.Remove(testFile) h.Ok(t, err) } + +func TestGetUptimeFuncDefault(t *testing.T) { + uptimeFunc := getUptimeFunc("") + h.Assert(t, uptimeFunc != nil, "Failed to return a function.") +} + +func TestGetUptimeFuncWithFile(t *testing.T) { + uptimeFunc := getUptimeFunc(testFile) + h.Assert(t, uptimeFunc != nil, "Failed to return a function.") +} diff --git a/pkg/node/node_test.go b/pkg/node/node_test.go index 6524ab03..9c44dd1e 100644 --- a/pkg/node/node_test.go +++ b/pkg/node/node_test.go @@ -23,6 +23,7 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/config" "github.com/aws/aws-node-termination-handler/pkg/node" h "github.com/aws/aws-node-termination-handler/pkg/test" + "github.com/aws/aws-node-termination-handler/pkg/uptime" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" @@ -59,7 +60,7 @@ func getNthConfig(t *testing.T) config.Config { } func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node { - tNode, err := node.NewWithValues(getNthConfig(t), drainHelper) + tNode, err := node.NewWithValues(getNthConfig(t), drainHelper, uptime.Uptime) if err != nil { t.Error("failed to create node") } diff --git a/pkg/uptime/common.go b/pkg/uptime/common.go new file mode 100644 index 00000000..e67f13c2 --- /dev/null +++ b/pkg/uptime/common.go @@ -0,0 +1,39 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. 
A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package uptime + +import ( + "fmt" + "io/ioutil" + "strconv" + "strings" +) + +// UptimeFuncType cleans up function arguments or return type. +type UptimeFuncType func() (int64, error) + +// UptimeFromFile reads system uptime information from filepath and returns +// the number of seconds since last system boot. +func UptimeFromFile(filepath string) (int64, error) { + data, err := ioutil.ReadFile(filepath) + if err != nil { + return 0, fmt.Errorf("Not able to read %s: %w", filepath, err) + } + + uptime, err := strconv.ParseFloat(strings.Split(string(data), " ")[0], 64) + if err != nil { + return 0, fmt.Errorf("Not able to parse %s to int64: %w", filepath, err) + } + return int64(uptime), nil +} diff --git a/pkg/uptime/common_test.go b/pkg/uptime/common_test.go new file mode 100644 index 00000000..3da4a890 --- /dev/null +++ b/pkg/uptime/common_test.go @@ -0,0 +1,48 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package uptime + +import ( + "io/ioutil" + "os" + "testing" + + h "github.com/aws/aws-node-termination-handler/pkg/test" +) + +const testFile = "test.out" + +func TestUptimeFromFileSuccess(t *testing.T) { + d1 := []byte("350735.47 234388.90") + ioutil.WriteFile(testFile, d1, 0644) + + value, err := UptimeFromFile(testFile) + os.Remove(testFile) + h.Ok(t, err) + h.Equals(t, int64(350735), value) +} + +func TestUptimeFromFileReadFail(t *testing.T) { + _, err := UptimeFromFile("does-not-exist") + h.Assert(t, err != nil, "Failed to return error when ReadFile failed") +} + +func TestUptimeFromFileBadData(t *testing.T) { + d1 := []byte("Something not time") + ioutil.WriteFile(testFile, d1, 0644) + + _, err := UptimeFromFile(testFile) + os.Remove(testFile) + h.Assert(t, err != nil, "Failed to return error for int64 parse") +} diff --git a/pkg/uptime/uptime_darwin.go b/pkg/uptime/uptime_darwin.go new file mode 100644 index 00000000..f4c2c72b --- /dev/null +++ b/pkg/uptime/uptime_darwin.go @@ -0,0 +1,21 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package uptime + +import "errors" + +// Uptime returns an error on Darwin hosts. 
+func Uptime() (int64, error) { + return 0, errors.New("Not implemented on darwin platform") +} diff --git a/pkg/uptime/uptime_linux.go b/pkg/uptime/uptime_linux.go new file mode 100644 index 00000000..f625b777 --- /dev/null +++ b/pkg/uptime/uptime_linux.go @@ -0,0 +1,20 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package uptime + +// Uptime reads system uptime from /proc/uptime and returns the number +// of seconds since last system boot. +func Uptime() (int64, error) { + return UptimeFromFile("/proc/uptime") +} diff --git a/pkg/uptime/uptime_test.go b/pkg/uptime/uptime_test.go new file mode 100644 index 00000000..86baa24d --- /dev/null +++ b/pkg/uptime/uptime_test.go @@ -0,0 +1,28 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +// +build !darwin + +package uptime + +import ( + "testing" + + h "github.com/aws/aws-node-termination-handler/pkg/test" +) + +func TestUptime(t *testing.T) { + value, err := Uptime() + h.Ok(t, err) + h.Assert(t, value > 0, "Invalid system uptime") +} diff --git a/pkg/uptime/uptime_windows.go b/pkg/uptime/uptime_windows.go new file mode 100644 index 00000000..eccb3dfa --- /dev/null +++ b/pkg/uptime/uptime_windows.go @@ -0,0 +1,36 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package uptime + +import ( + "syscall" + "time" + + "golang.org/x/sys/windows" +) + +var ( + kernel32 = windows.NewLazySystemDLL("kernel32.dll") + getTickCount = kernel32.NewProc("GetTickCount") +) + +// Uptime returns the number of seconds since last system boot. 
+func Uptime() (int64, error) { + millis, _, err := syscall.Syscall(getTickCount.Addr(), 0, 0, 0, 0) + if err != 0 { + return 0, err + } + uptime := (time.Duration(millis) * time.Millisecond).Seconds() + return int64(uptime), nil +} diff --git a/scripts/build-docker-images b/scripts/build-docker-images index b80d3ffe..74b5b718 100755 --- a/scripts/build-docker-images +++ b/scripts/build-docker-images @@ -5,6 +5,7 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" REPO_ROOT_PATH=$SCRIPTPATH/../ MAKE_FILE_PATH=$REPO_ROOT_PATH/Makefile +DOCKERFILE_PATH=$REPO_ROOT_PATH/Dockerfile VERSION=$(make -s -f $MAKE_FILE_PATH version) PLATFORMS=("linux/amd64") @@ -52,10 +53,16 @@ for os_arch in "${PLATFORMS[@]}"; do img_tag="$IMAGE_REPO:$VERSION-$os-$arch" + dockerfile="$DOCKERFILE_PATH" + if [[ $os = "windows" ]]; then + dockerfile="${dockerfile}.windows" + fi + docker build \ + --file "${dockerfile}" \ --build-arg GOOS=${os} \ --build-arg GOARCH=${arch} \ --build-arg GOPROXY=${GOPROXY} \ - -t ${img_tag} \ + --tag ${img_tag} \ ${REPO_ROOT_PATH} -done \ No newline at end of file +done diff --git a/scripts/generate-k8s-yaml b/scripts/generate-k8s-yaml index 3fb66799..0488a7f6 100755 --- a/scripts/generate-k8s-yaml +++ b/scripts/generate-k8s-yaml @@ -44,7 +44,8 @@ rm -rf $BUILD_DIR/$PLATFORM-amd64 chmod +x $BUILD_DIR/helm $BUILD_DIR/helm template aws-node-termination-handler \ - --namespace kube-system \ + --namespace $NAMESPACE \ + --set targetNodeOs="linux windows" \ $SCRIPTPATH/../config/helm/aws-node-termination-handler/ > $AGG_RESOURCES_YAML # remove helm annotations from template @@ -53,6 +54,7 @@ mv $BUILD_DIR/helm_annotations_removed.yaml $AGG_RESOURCES_YAML $BUILD_DIR/helm template aws-node-termination-handler \ --namespace $NAMESPACE \ + --set targetNodeOs="linux windows" \ --output-dir $INDV_RESOURCES_DIR/ \ $SCRIPTPATH/../config/helm/aws-node-termination-handler/ diff --git a/scripts/run-unit-tests-in-docker b/scripts/run-unit-tests-in-docker new file mode 100755 index 00000000..dd3d7943 --- /dev/null +++ b/scripts/run-unit-tests-in-docker @@ -0,0 +1,71 @@ +#!/bin/bash + +set -euo pipefail + +project_root_dir="$(cd "$(dirname "$0")/.." && pwd -P)" +work_dir="/workplace/aws-node-termination-handler" +container_name="nth_unit_test_on_linux" +deps="go,git,make" +recreate=0 +usage=$(cat <&2 + exit + ;; + esac +done + +echo "unit tests will be run in docker container named $container_name" + +if [[ $recreate -eq 1 ]]; then + docker container rm "$container_name" >/dev/null 2>&1 || true +fi + +if ! [[ -n $(docker container ls -a | grep "$container_name") ]]; then + echo "creating container ..." + + IFS=',' read -ra deps <<< "$deps" + echo "dependencies to install: ${deps[@]}" + + docker container create \ + --name "$container_name" \ + --volume "$project_root_dir:$work_dir" \ + --env GOPROXY=direct \ + --env GO111MODULE=auto \ + --workdir "$work_dir" \ + --init \ + alpine:latest \ + sh -c "apk add ${deps[*]} && make clean unit-test" + + echo "container created" +else + echo "container exists" +fi + +echo "running unit tests ..." 
+docker container start --attach "$container_name" + diff --git a/test/e2e/cordon-only-test b/test/e2e/cordon-only-test index b062c91a..d6eae75d 100755 --- a/test/e2e/cordon-only-test +++ b/test/e2e/cordon-only-test @@ -10,26 +10,56 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Cordon Only Test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Cordon Only Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set cordonOnly="true" +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 @@ -46,19 +76,31 @@ for i in `seq 1 10`; do done if [[ $DEPLOYED -eq 0 ]]; then - exit 2 + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 fi +cordoned=0 +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then + if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled > /dev/null; then echo "✅ Verified the worker node was cordoned!" - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then - echo "✅ Verified the regular-pod-test pod was NOT evicted!" - echo "✅ Cordon Only Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "✅ Verified the regular-pod-test pod was NOT evicted!" 
+ echo "✅ Cordon Only Test Passed $CLUSTER_NAME! ✅" + exit 0 fi sleep $TAINT_CHECK_SLEEP done -exit 1 +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +else + echo "❌ regular-pod-test was evicted" +fi + +echo "❌ Cordon Only Test Failed $CLUSTER_NAME ❌" +fail_and_exit 1 diff --git a/test/e2e/imds-v2-test b/test/e2e/imds-v2-test index 746182d3..75b320ba 100755 --- a/test/e2e/imds-v2-test +++ b/test/e2e/imds-v2-test @@ -10,28 +10,62 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ IMDSv2 Test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting IMDSv2 Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set enableSpotInterruptionDraining="true" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ - --set ec2MetadataTestProxy.enableSpotITN="true" \ - --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \ - --set ec2MetadataTestProxy.enableIMDSV2="true" \ +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --force + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" + --set ec2MetadataTestProxy.enableSpotITN="true" + --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" + --set ec2MetadataTestProxy.enableIMDSV2="true" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 @@ -48,19 +82,31 @@ for i in `seq 1 10`; do done if [[ $DEPLOYED -eq 0 ]]; then - exit 2 + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 fi +cordoned=0 +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | 
grep SchedulingDisabled; then + if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "✅ Verified the regular-pod-test pod was evicted!" - echo "✅ IMDSv2 Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" + echo "✅ IMDSv2 Test Passed $CLUSTER_NAME! ✅" + exit 0 fi sleep $TAINT_CHECK_SLEEP done -exit 1 \ No newline at end of file +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +else + echo "❌ regular-pod-test pod was not evicted" +fi + +echo "❌ IMDSv2 Test failed $CLUSTER_NAME ❌" +fail_and_exit 1 diff --git a/test/e2e/maintenance-event-cancellation-test b/test/e2e/maintenance-event-cancellation-test index fa3ebae7..27c91e11 100755 --- a/test/e2e/maintenance-event-cancellation-test +++ b/test/e2e/maintenance-event-cancellation-test @@ -10,37 +10,69 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Test Maintenance Event Cancellation failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Maintenance Event Cancellation Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set enableSpotInterruptionDraining="true" \ - --set enableScheduledEventDraining="true" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set enableSpotInterruptionDraining="true" + --set enableScheduledEventDraining="true" --set taintNode="true" - -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ - --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \ - --set ec2MetadataTestProxy.enableSpotITN="false" \ - --set ec2MetadataTestProxy.port="$IMDS_PORT" \ - --set 'ec2MetadataTestProxy.tolerations[0].effect=NoSchedule' \ +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --namespace default + --set 
ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" + --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" + --set ec2MetadataTestProxy.enableSpotITN="false" + --set ec2MetadataTestProxy.port="$IMDS_PORT" + --set 'ec2MetadataTestProxy.tolerations[0].effect=NoSchedule' --set 'ec2MetadataTestProxy.tolerations[0].operator=Exists' +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 DEPLOYED=0 -CORDONED=0 -TAINTED=0 for i in `seq 1 10`; do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then @@ -52,60 +84,85 @@ for i in `seq 1 10`; do done if [[ $DEPLOYED -eq 0 ]]; then - echo "❌ Failed test setup for regular-pod" - exit 2 + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 fi +cordoned=0 +tainted=0 +evicted=0 +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker --no-headers | grep SchedulingDisabled; then + if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node --no-headers | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" + cordoned=1 + fi - if kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/scheduled-maintenance"; then - echo "✅ Verified the worked node was tainted!" - TAINTED=1 - fi + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + echo "✅ Verified the worked node was tainted!" + tainted=1 + fi - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "✅ Verified the regular-pod-test pod was evicted!" - CORDONED=1 - break - fi + if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" 
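+        # Cordon, taint, and eviction have all been observed, so the loop can stop polling.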
+ evicted=1 + break fi sleep $TAINT_CHECK_SLEEP done -if [[ $CORDONED -eq 0 ]]; then - echo "❌ Failed cordoning node for scheduled maintenance event" - exit 3 +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" + fail_and_exit 1 +elif [[ $tainted -eq 0 ]]; then + echo "❌ Worker node was not tainted" + fail_and_exit 1 +elif [[ $evicted -eq 0 ]]; then + echo "❌ regular-pod-test pod was not evicted" + fail_and_exit 1 fi -if [[ $TAINTED -eq 0 ]]; then - echo "❌ Failed tainting node for scheduled maintenance event" - exit 3 -fi - -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ - --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \ - --set ec2MetadataTestProxy.enableSpotITN="false" \ - --set ec2MetadataTestProxy.scheduledEventStatus="canceled" \ - --set ec2MetadataTestProxy.port="$IMDS_PORT" \ - --set 'ec2MetadataTestProxy.tolerations[0].effect=NoSchedule' \ +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" + --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" + --set ec2MetadataTestProxy.enableSpotITN="false" + --set ec2MetadataTestProxy.scheduledEventStatus="canceled" + --set ec2MetadataTestProxy.port="$IMDS_PORT" + --set 'ec2MetadataTestProxy.tolerations[0].effect=NoSchedule' --set 'ec2MetadataTestProxy.tolerations[0].operator=Exists' +) +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x +uncordoned=0 for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker --no-headers | grep -v SchedulingDisabled; then + if [[ $uncordoned -eq 0 ]] && kubectl get nodes $test_node --no-headers | grep -v SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was uncordoned!" - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then - echo "✅ Verified the regular-pod-test pod was rescheduled" - echo "✅ Scheduled Maintenance Event Cancellation Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + uncordoned=1 + fi + + if [[ $uncordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "✅ Verified the regular-pod-test pod was rescheduled" + echo "✅ Test Maintenance Event Cancellation passed! 
$CLUSTER_NAME ✅" + exit 0 fi sleep $TAINT_CHECK_SLEEP done -exit 1 +if [[ $uncordoned -eq 0 ]]; then + echo "❌ Worker node was not UNcordoned" +else + echo "❌ regular-pod-test pod was not rescheduled" +fi +fail_and_exit 1 diff --git a/test/e2e/maintenance-event-dry-run-test b/test/e2e/maintenance-event-dry-run-test index 2bebcc3a..0e9cc938 100755 --- a/test/e2e/maintenance-event-dry-run-test +++ b/test/e2e/maintenance-event-dry-run-test @@ -10,40 +10,85 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Scheduled Maintenance Events Dry-Run Test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Maintenance Events Dry-Run Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set dryRun="true" \ - --set enableSpotInterruptionDraining="true" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set dryRun="true" + --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --force + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x +logs=0 +pod_id=$(get_nth_worker_pod) +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in $(seq 0 10); do - POD_ID=$(get_nth_worker_pod) - if [[ ! -z $(kubectl logs $POD_ID -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then + if [[ $logs -eq 0 && ! 
-z $(kubectl logs $pod_id -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then echo "✅ Verified the dryrun logs were executed" - if kubectl get nodes $CLUSTER_NAME-worker --no-headers | grep -v SchedulingDisabled; then - echo "✅ Verified the worker node was not cordoned!" - echo "✅ Scheduled Maintenance Event Dry Run Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + logs=1 + fi + + if [[ $logs -eq 1 ]] && kubectl get nodes $test_node --no-headers | grep -v SchedulingDisabled >/dev/null; then + echo "✅ Verified the worker node was not cordoned!" + echo "✅ Scheduled Maintenance Event Dry Run Test Passed $CLUSTER_NAME! ✅" + exit 0 fi sleep 10 done -exit 1 +if [[ $logs -eq 0 ]]; then + echo "❌ Dryrun logs were not executed" +else + echo "❌ Worker node was cordoned" +fi + +echo "❌ Scheduled Maintenance Event Dry Run Test failed $CLUSTER_NAME ❌" +fail_and_exit 1 diff --git a/test/e2e/maintenance-event-reboot-test b/test/e2e/maintenance-event-reboot-test index 0133bdab..3f0ae48f 100755 --- a/test/e2e/maintenance-event-reboot-test +++ b/test/e2e/maintenance-event-reboot-test @@ -6,124 +6,184 @@ set -euo pipefail # $CLUSTER_NAME # $KUBECONFIG +function fail_and_exit { + echo "❌ Scheduled Maintenance Event System Reboot Test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Maintenance Event Cancellation Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set enableSpotInterruptionDraining="true" \ - --set enableScheduledEventDraining="true" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set enableSpotInterruptionDraining="true" + --set enableScheduledEventDraining="true" --set taintNode="true" - -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ - --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \ - --set ec2MetadataTestProxy.enableSpotITN="false" \ +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --force + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" + --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" + --set 
ec2MetadataTestProxy.enableSpotITN="false" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 -DEPLOYED=0 -CORDONED=0 -TAINTED=0 - +deployed=0 for i in `seq 1 10`; do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" - DEPLOYED=1 + deployed=1 break fi sleep 5 done -if [[ $DEPLOYED -eq 0 ]]; then +if [[ $deployed -eq 0 ]]; then echo "❌ Failed test setup for regular-pod" - exit 2 + fail_and_exit 2 fi +cordoned=0 +tainted=0 +evicted=0 +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then + if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned for maintenance event reboot!" + cordoned=1 + fi - if kubectl get nodes $CLUSTER_NAME-worker -o json | grep "aws-node-termination-handler/scheduled-maintenance"; then - echo "✅ Verified the worked node was tainted!" - TAINTED=1 - fi + if [[ $cordoned -eq 1 ]] && kubectl get nodes $test_node -o json | grep "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + echo "✅ Verified the worker node was tainted!" + tainted=1 + fi - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "✅ Verified the regular-pod-test pod was evicted!" - CORDONED=1 - break - fi + if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" 
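+        # All pre-reboot checks (cordon, taint, eviction) have passed; stop polling and move on to simulating the reboot.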
+ evicted=1 + break fi sleep $TAINT_CHECK_SLEEP done -if [[ $CORDONED -eq 0 ]]; then +if [[ $cordoned -eq 0 ]]; then echo "❌ Failed cordoning node for scheduled maintenance event" - exit 3 + fail_and_exit 3 fi -if [[ $TAINTED -eq 0 ]]; then +if [[ $tainted -eq 0 ]]; then echo "❌ Failed tainting node for scheduled maintenance event" - exit 3 + fail_and_exit 3 fi -## Copy uptime file to Kind k8s nodes -for node in $(kubectl get nodes -o json | jq -r '.items[].metadata.name'); do - docker exec $node sh -c "rm -rf /uptime" - docker cp $SCRIPTPATH/../assets/uptime-reboot $node:/uptime - docker exec $node sh -c "chmod 0444 /uptime && chown root /uptime && chgrp root /uptime" -done +mock_uptime_filepath="/uptime" +if [[ "${TEST_WINDOWS:-"false"}" != "true" ]]; then + echo "Copy uptime file to Kind k8s nodes" + for node in $(kubectl get nodes -o json | jq -r '.items[].metadata.name'); do + docker exec $node sh -c "rm -rf $mock_uptime_filepath" + docker cp $SCRIPTPATH/../assets/uptime-reboot $node:$mock_uptime_filepath + docker exec $node sh -c "chmod 0444 $mock_uptime_filepath && chown root $mock_uptime_filepath && chgrp root $mock_uptime_filepath" + done +else + echo "Copy uptime file to $TEST_NODE" + kubectl cp $SCRIPTPATH/../assets/uptime-root kube-system/$(get_nth_worker_pod):$mock_uptime_filepath +fi -## Remove ec2-metadata-test-proxy to prevent another drain event but keep regular-test-pod +echo "Remove ec2-metadata-test-proxy to prevent another drain event but keep regular-test-pod" daemonset=$(kubectl get daemonsets | grep 'ec2-metadata-test-proxy' | cut -d' ' -f1) kubectl delete daemonsets $daemonset ## Restart NTH which will simulate a system reboot by mounting a new uptime file -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set procUptimeFile="/uptime" \ - --set enableSpotInterruptionDraining="true" \ - --set enableScheduledEventDraining="true" \ +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set procUptimeFile=$mock_uptime_filepath + --set enableSpotInterruptionDraining="true" + --set enableScheduledEventDraining="true" --set taintNode="true" +) +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") +set -x +helm "${anth_helm_args[@]}" +set +x + +uncordoned=0 +untainted=0 for i in `seq 1 $TAINT_CHECK_CYCLES`; do - NODE_LINE=$(kubectl get nodes $CLUSTER_NAME-worker | grep -v 'STATUS') - if [[ -z $(echo $NODE_LINE | grep SchedulingDisabled) ]] && [[ ! -z $(echo $NODE_LINE | grep Ready) ]]; then + NODE_LINE=$(kubectl get nodes $test_node | grep -v 'STATUS') + if [[ $uncordoned -eq 0 && -z $(echo $NODE_LINE | grep SchedulingDisabled) ]] && [[ ! -z $(echo $NODE_LINE | grep Ready) ]]; then echo "✅ Verified the worker node was uncordoned!" + uncordoned=1 + fi - if ! kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/scheduled-maintenance"; then - echo "✅ Verified the worked node was untainted!" 
- TAINTED=0 - else - echo "❌ Failed clearing the worked node taint after a reboot!" - exit 3 - fi - - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then - echo "✅ Verified the regular-pod-test pod was rescheduled" - echo "✅ Scheduled Maintenance Event System Reboot Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + if [[ $uncordoned -eq 1 && $untainted -eq 0 ]] && ! kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + echo "✅ Verified the worked node was untainted!" + untainted=1 + fi + + if [[ $untainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "✅ Verified the regular-pod-test pod was rescheduled" + echo "✅ Scheduled Maintenance Event System Reboot Test Passed $CLUSTER_NAME! ✅" + exit 0 fi sleep $TAINT_CHECK_SLEEP done -exit 1 +if [[ $uncordoned -eq 0 ]]; then + echo "❌ Worker node was not UNcordoned" +elif [[ $untainted -eq 0 ]]; then + echo "❌ Worked node was not UNtainted" +else + echo "❌ regular-pod-test pod was not rescheduled" +fi +fail_and_exit 1 diff --git a/test/e2e/maintenance-event-test b/test/e2e/maintenance-event-test index 8f405c47..dac25cde 100755 --- a/test/e2e/maintenance-event-test +++ b/test/e2e/maintenance-event-test @@ -10,70 +10,111 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Maintenance Events Test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Maintenance Events Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set enableSpotInterruptionDraining="true" \ - --set enableScheduledEventDraining="true" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set enableSpotInterruptionDraining="true" + --set enableScheduledEventDraining="true" --set taintNode="true" +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") +set -x +helm "${anth_helm_args[@]}" +set +x -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ - --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \ - --set ec2MetadataTestProxy.enableSpotITN="false" \ +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --force + --namespace 
default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" + --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" + --set ec2MetadataTestProxy.enableSpotITN="false" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetdataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 -DEPLOYED=0 +deployed=0 for i in `seq 1 10`; do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" - DEPLOYED=1 + deployed=1 break fi sleep 5 done -if [[ $DEPLOYED -eq 0 ]]; then - exit 2 +if [[ $deployed -eq 0 ]]; then + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 fi -TAINTED=0 - +cordoned=0 +tainted=0 +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then + if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + echo "✅ Verified the worked node was tainted!" + tainted=1 + fi - if kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/scheduled-maintenance"; then - echo "✅ Verified the worked node was tainted!" - else - echo "❌ Failed tainting node for maintenance event" - exit 3 - fi - - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "✅ Verified the regular-pod-test pod was evicted!" - echo "✅ Scheduled Maintenance Event Handling Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" + echo "✅ Maintenance Event Test Passed $CLUSTER_NAME! 
✅" + exit 0 fi sleep $TAINT_CHECK_SLEEP done -exit 1 +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +elif [[ $tainted -eq 0 ]]; then + echo "❌ Worker node was not tainted" +else + echo "❌ regular-pod-test pod was not evicted" +fi +fail_and_exit 1 diff --git a/test/e2e/spot-interruption-dry-run-test b/test/e2e/spot-interruption-dry-run-test index c8112adb..d95c9c5f 100755 --- a/test/e2e/spot-interruption-dry-run-test +++ b/test/e2e/spot-interruption-dry-run-test @@ -10,41 +10,83 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Spot Interruption Dry Run test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Maintenance Events Dry-Run Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set dryRun="true" \ - --set enableSpotInterruptionDraining="true" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set dryRun="true" + --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --force + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") +set -x +helm "${emtp_helm_args[@]}" +set +x +logs=0 +pod_id=$(get_nth_worker_pod) +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in $(seq 0 10); do - POD_ID=$(get_nth_worker_pod) - if [[ ! -z $(kubectl logs $POD_ID -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then + if [[ $logs -eq 0 && ! 
-z $(kubectl logs $pod_id -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then echo "✅ Verified the dryrun logs were executed" - if kubectl get nodes $CLUSTER_NAME-worker --no-headers | grep -v SchedulingDisabled; then - echo "✅ Verified the worker node was not cordoned!" - echo "✅ Spot Interruption Dry Run Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + logs=1 + fi + + if [[ $logs -eq 1 ]] && kubectl get nodes $test_node --no-headers | grep -v SchedulingDisabled >/dev/null; then + echo "✅ Verified the worker node was not cordoned!" + echo "✅ Spot Interruption Dry Run Test Passed $CLUSTER_NAME! ✅" + exit 0 fi sleep 10 done -exit 1 +if [[ $logs -eq 0 ]]; then + echo "❌ dryrun logs were not executed" +else + echo "❌ Worker node was cordoned" +fi +fail_and_exit 1 diff --git a/test/e2e/spot-interruption-test b/test/e2e/spot-interruption-test index db530161..a55c3f8e 100755 --- a/test/e2e/spot-interruption-test +++ b/test/e2e/spot-interruption-test @@ -10,67 +10,110 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Spot Interruption test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Spot Interruption Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set enableScheduledEventDraining="false" \ - --set enableSpotInterruptionDraining="true" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set enableScheduledEventDraining="false" + --set enableSpotInterruptionDraining="true" --set taintNode="true" +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ - --set ec2MetadataTestProxy.enableSpotITN="true" \ - --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="false" \ +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --force + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" + --set ec2MetadataTestProxy.enableSpotITN="true" + --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="false" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + 
emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 -DEPLOYED=0 - +deployed=0 for i in `seq 1 10`; do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" - DEPLOYED=1 + deployed=1 break fi sleep 5 done -if [[ $DEPLOYED -eq 0 ]]; then - exit 2 +if [[ $deployed -eq 0 ]]; then + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 fi +cordoned=0 +tainted=0 +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then + if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then + echo "✅ Verified the worked node was tainted!" + tainted=1 + fi - if kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/spot-itn"; then - echo "✅ Verified the worked node was tainted!" - else - echo "❌ Failed tainting node for spot termination event" - exit 3 - fi - - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "✅ Verified the regular-pod-test pod was evicted!" - echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" + echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! 
✅" + exit 0 fi sleep $TAINT_CHECK_SLEEP done -exit 1 +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +elif [[ $tainted -eq 0 ]]; then + echo "❌ Worker node was not tainted" +else + echo "❌ regular-pod-test pod was not evicted" +fi +fail_and_exit 1 diff --git a/test/e2e/webhook-http-proxy-test b/test/e2e/webhook-http-proxy-test index 99c7989b..bbfbd91b 100755 --- a/test/e2e/webhook-http-proxy-test +++ b/test/e2e/webhook-http-proxy-test @@ -10,6 +10,11 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Webhook HTTP Proxy Test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Webhook HTTP Proxy Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" @@ -17,11 +22,10 @@ SQUID_DOCKERHUB_IMG="sameersbn/squid:3.5.27-2@sha256:e98299069f0c6e3d9b918890351 SQUID_DOCKER_IMG="squid:customtest" ### LOCAL ONLY TESTS FOR 200 RESPONSE FROM LOCAL CLUSTER, MASTER WILL TEST WITH TRAVIS SECRET URL -if [[ -z $(env | grep "WEBHOOK_URL=") ]]; then +if [[ -z "${WEBHOOK_URL-}" ]]; then WEBHOOK_URL="http://127.0.0.1:$IMDS_PORT" fi - docker pull $SQUID_DOCKERHUB_IMG docker tag $SQUID_DOCKERHUB_IMG $SQUID_DOCKER_IMG kind load docker-image --name $CLUSTER_NAME --nodes=$CLUSTER_NAME-worker,$CLUSTER_NAME-control-plane $SQUID_DOCKER_IMG @@ -39,65 +43,108 @@ helm upgrade --install $CLUSTER_NAME-squid $SCRIPTPATH/../../config/helm/squid/ sleep 20 -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set enableSpotInterruptionDraining="true" \ - --set enableScheduledEventDraining="true" \ - --set webhookURL="$WEBHOOK_URL" \ - --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - InstanceType: \{\{ \.InstanceType \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" \ +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") + +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --force + --namespace kube-system + --set instanceMetadataURL="http://localhost:$IMDS_PORT" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set enableSpotInterruptionDraining="true" + --set enableScheduledEventDraining="true" + --set webhookURL="$WEBHOOK_URL" + --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - InstanceType: \{\{ \.InstanceType \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" --set webhookProxy="tcp://localhost:3128" - -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --force \ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ - --set ec2MetadataTestProxy.enableSpotITN="true" \ +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm 
"${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --force + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" + --set ec2MetadataTestProxy.enableSpotITN="true" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 -DEPLOYED=0 - +deployed=0 for i in `seq 1 10`; do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" - DEPLOYED=1 + deployed=1 break fi sleep 5 done -if [[ $DEPLOYED -eq 0 ]]; then - exit 2 +if [[ $deployed -eq 0 ]]; then + echo "❌ regular-pod-test pod deployment failed" + fail_end_exit 2 fi +cordoned=0 +evicted=0 +sent=0 for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then - echo "✅ Verified the worker node was cordoned!" - if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "✅ Verified the regular-pod-test pod was evicted!" - NTH_POD_NAME=$(get_nth_worker_pod) - if kubectl logs $NTH_POD_NAME -n kube-system | grep 'Webhook Success'; then - echo "✅ Webhook Successfully Sent $CLUSTER_NAME! ✅" - pods=$(kubectl get pods -o json) - ## queries for the squid pod on the worker node - squid_worker_pods=$(echo $pods | jq '.items[] | select( .metadata.name | contains("squid") ) | .metadata.name as $name | select( .spec.nodeName | contains("worker") ) | .spec.nodeName as $nodename | $name' -r) - ## return only 1 pod - if kubectl exec -it $(echo $squid_worker_pods | cut -d' ' -f1) -- cat /var/log/squid/access.log | grep -e 'TCP_MISS/200' -e 'TCP_TUNNEL/200' -e 'TCP_MISS_ABORTED/200'; then - echo "✅ Verified the webhook POST used the http proxy" - exit 0 - fi - fi - fi - - fi - sleep $TAINT_CHECK_SLEEP + if [[ $cordoned -eq 0 ]] && kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then + echo "✅ Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $evicted -eq 0 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" 
+ evicted=1 + pod_id=$(get_nth_worker_pod) + fi + + if [[ $evicted -eq 1 && $sent -eq 0 ]] && kubectl logs $pod_id -n kube-system | grep 'Webhook Success' >/dev/null; then + echo "✅ Verified that webhook successfully sent" + sent=1 + squid_worker_pods=$(kubectl get pods -o json | jq '.items[] | select( .metadata.name | contains("squid") ) | .metadata.name as $name | select( .spec.nodeName | contains("worker") ) | .spec.nodeName as $nodename | $name' -r) + fi + + if [[ $sent -eq 1 ]] && kubectl exec -it $(echo $squid_worker_pods | cut -d' ' -f1) -- cat /var/log/squid/access.log | grep -e 'TCP_MISS/200' -e 'TCP_TUNNEL/200' -e 'TCP_MISS_ABORTED/200' >/dev/null; then + echo "✅ Verified the webhook POST used the http proxy" + exit 0 + fi + sleep $TAINT_CHECK_SLEEP done -exit 1 +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +elif [[ $evicted -eq 0 ]]; then + echo "❌ regular-pod-test pod was not evicted" +elif [[ $sent -eq 0 ]]; then + echo "❌ Webhook not sent" +else + echo "❌ Webhook POST did not use http proxy" +fi +fail_and_exit 1 diff --git a/test/e2e/webhook-test b/test/e2e/webhook-test index 3fdc85c5..12b4d9f0 100755 --- a/test/e2e/webhook-test +++ b/test/e2e/webhook-test @@ -10,63 +10,102 @@ set -euo pipefail # $EC2_METADATA_DOCKER_REPO # $EC2_METADATA_DOCKER_TAG +function fail_and_exit { + echo "❌ Webhook test failed $CLUSTER_NAME ❌" + exit ${1:-1} +} + echo "Starting Webhook Test for Node Termination Handler" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -### LOCAL ONLY TESTS FOR 200 RESPONSE FROM LOCAL CLUSTER, MASTER WILL TEST WITH TRAVIS SECRET URL -if [[ -z $(env | grep "WEBHOOK_URL=") ]]; then - WEBHOOK_URL="http://localhost:$IMDS_PORT" -fi +common_helm_args=() +[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") -helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ \ - --wait \ - --force \ - --namespace kube-system \ - --set instanceMetadataURL="http://localhost:$IMDS_PORT" \ - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \ - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \ - --set webhookURL="$WEBHOOK_URL" \ - --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - InstanceType: \{\{ \.InstanceType \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" \ - --set enableSpotInterruptionDraining="true" \ +anth_helm_args=( + upgrade + --install + $CLUSTER_NAME-anth + $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ + --wait + --force + --namespace kube-system + --set instanceMetadataURL=${INSTANCE_METADATA_URL:-"http://localhost:$IMDS_PORT"} + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set webhookURL="${WEBHOOK_URL:-"http://localhost:$IMDS_PORT"}" + --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - InstanceType: \{\{ \.InstanceType \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" + --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") -helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \ - --wait \ - --force 
\ - --namespace default \ - --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" \ - --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \ +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + $CLUSTER_NAME-emtp + $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ + --wait + --force + --namespace default + --set ec2MetadataTestProxy.image.repository="$EC2_METADATA_DOCKER_REPO" + --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" --set ec2MetadataTestProxy.port="$IMDS_PORT" +) +[[ -n "${EC2_METADATA_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set ec2MetadataTestProxy.image.pullPolicy="$EC2_METADATA_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 -DEPLOYED=0 +deployed=0 for i in `seq 1 10`; do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" - DEPLOYED=1 + deployed=1 break fi sleep 5 done -if [[ $DEPLOYED -eq 0 ]]; then - exit 2 +if [[ $deployed -eq 0 ]]; then + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 fi +cordoned=0 +nth_pod_name=$(get_nth_worker_pod) +test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then + if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" - NTH_POD_NAME=$(get_nth_worker_pod) - if kubectl logs $NTH_POD_NAME -n kube-system | grep 'Webhook Success'; then - echo "✅ Verified the webhook message was sent!" - echo "✅ Webhook Test Passed $CLUSTER_NAME! ✅" - exit 0 - fi + cordoned=1 + fi + + if [[ $cordoned -eq 1 ]] && kubectl logs $nth_pod_name -n kube-system | grep 'Webhook Success' >/dev/null; then + echo "✅ Verified the webhook message was sent!" + echo "✅ Webhook Test Passed $CLUSTER_NAME! ✅" + exit 0 fi sleep $TAINT_CHECK_SLEEP done -exit 1 +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +else + echo "❌ Webhook message was not sent" +fi +fail_and_exit 1 diff --git a/test/ec2-metadata-test-proxy/Dockerfile.windows b/test/ec2-metadata-test-proxy/Dockerfile.windows new file mode 100644 index 00000000..086293ef --- /dev/null +++ b/test/ec2-metadata-test-proxy/Dockerfile.windows @@ -0,0 +1,23 @@ +ARG WINDOWS_VERSION=1903 + +# Build the manager binary +FROM --platform=windows/amd64 golang:1.14 AS builder + +## GOLANG env +ENV GO111MODULE="on" CGO_ENABLED="0" GOOS="windows" GOARCH="amd64" +ARG GOPROXY="https://proxy.golang.org,direct" + +WORKDIR /ec2-metadata-test-proxy + +## Build +COPY . . +RUN go build -a -o ec2-metadata-test-proxy cmd/ec2-metadata-test-proxy.go +ENTRYPOINT ["ec2-metadata-test-proxy"] + +## Copy binary to a thin image +FROM mcr.microsoft.com/windows/nanoserver:${WINDOWS_VERSION} +WORKDIR / +COPY --from=builder /ec2-metadata-test-proxy . +COPY THIRD_PARTY_LICENSES . 
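+# The binary was copied to the image root above, so it is invoked by absolute path.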
+ENTRYPOINT ["/ec2-metadata-test-proxy"] + diff --git a/test/eks-cluster-test/cluster-spec.yaml b/test/eks-cluster-test/cluster-spec.yaml new file mode 100644 index 00000000..a1d65433 --- /dev/null +++ b/test/eks-cluster-test/cluster-spec.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: windows-prod + region: us-west-2 + version: '1.16' +managedNodeGroups: + - name: linux-ng + instanceType: t2.large + minSize: 1 + maxSize: 1 +nodeGroups: + - name: windows-ng + instanceType: m5.large + minSize: 1 + maxSize: 1 + volumeSize: 100 + amiFamily: WindowsServer2019FullContainer diff --git a/test/eks-cluster-test/reset-cluster b/test/eks-cluster-test/reset-cluster new file mode 100755 index 00000000..cbdd9313 --- /dev/null +++ b/test/eks-cluster-test/reset-cluster @@ -0,0 +1,32 @@ +#!/bin/bash + +echo "Uninstall Helm charts ..." +helm list -a -A | + tail -n +2 | + tr -s "\t" " " | + awk '{ printf("helm uninstall -n %s %s\n", $2, $1) }' | + bash -s +echo "✅ done" + +echo +echo "Remove NTH labels from $TEST_NODE ..." +kubectl label --list nodes $TEST_NODE | + grep aws-node-termination-handler | + cut -d"=" -f1 | + awk '{ printf("%s-\n", $1) }' | + xargs kubectl label nodes $TEST_NODE +echo "✅ done" + +echo +echo "Remove NTH taints from $TEST_NODE ..." +# Weird: if query specifies actual node then it returns no results +kubectl get nodes -l kubernetes.io/os=windows -o=jsonpath='{range .items[*]}{.spec.taints[*].key}{"\n"}{end}' | + awk '{ printf("%s-\n", $1) }' | + xargs kubectl taint nodes $TEST_NODE +echo "✅ done" + +echo +echo "Uncordon $TEST_NODE ..." +kubectl uncordon $TEST_NODE +echo "✅ done" + diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test new file mode 100755 index 00000000..2ea0889c --- /dev/null +++ b/test/eks-cluster-test/run-test @@ -0,0 +1,147 @@ +#!/bin/bash + +set -euo pipefail + +USAGE=$(cat << 'EOM' + Usage: run-test [-a script1,scripts2,...] [-w] [-r] CONFIG + + Options: + -a Assertion script(s), default is ALL scripts in e2e dir + -w Target Windows platform + -r Reset cluster then exit + + Arguments: + CONFIG File to source, it should export the following values: + CLUSTER_NAME EKS cluster name + DOCKER_PULL_POLICY Docker image pull policy (defaults to IfNotPresent) + NODE_TERMINATION_HANDLER_DOCKER_REPO Node Termination Handler Docker repository + NODE_TERMINATION_HANDLER_DOCKER_TAG Node Termination Handler Docker tag + NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY Node Termination Handler Docker image pull policy (defaults to DOCKER_PULL_POLICY) + EC2_METADATA_DOCKER_REPO EC2 Metadata Test Proxy Docker repository + EC2_METADATA_DOCKER_TAG EC2 Metadata Test Proxy Docker tag + EC2_METADATA_DOCKER_PULL_POLICY EC2 Metadata Test Proxy Docker image pull policy (defaults to DOCKER_PULL_POLICY) + EC2_METADATA_PORT EC2 Metadata Test Proxy port (defaults to 18999) + EC2_METADATA_URL EC2 Metadata Test Proxy URL (defaults to "http://ec2-metadata-test-proxy.default.svc.cluster.local:$EC2_METADATA_PORT") + WEBHOOK_URL Webhook URL (defaults to EC2_METADATA_URL) +EOM +) + +reset_cluster_only=0 +export TEST_WINDOWS="false" + +while getopts "a:w" opt; do + case ${opt} in + a ) # Assertion script(s) + assertion_scripts=$(echo $OPTARG | tr "," "\n") + ;; + w ) # Windows platform + TEST_WINDOWS="true" + ;; + r ) # Reset cluster + reset_cluster_only=1 + ;; + \? 
) + echo "$USAGE" 1>&2 + exit + esac +done + +config=${@:$OPTIND:1} +echo "Reading configuration from ${config:?"missing argument"}" +set -a # Export variables by default. +source $config +set +a # Disable exporting variables by default. + +echo "CLUSTER_NAME=${CLUSTER_NAME:?"not found"}" +echo "DOCKER_PULL_POLICY=${DOCKER_PULL_POLICY:="IfNotPresent"}" +echo "NODE_TERMINATION_HANDLER_DOCKER_REPO=${NODE_TERMINATION_HANDLER_DOCKER_REPO:?"not found"}" +echo "NODE_TERMINATION_HANDLER_DOCKER_TAG=${NODE_TERMINATION_HANDLER_DOCKER_TAG:?"not found"}" +echo "NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY=${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY:=$DOCKER_PULL_POLICY}" +echo "EC2_METADATA_DOCKER_REPO=${EC2_METADATA_DOCKER_REPO:?"not found"}" +echo "EC2_METADATA_DOCKER_TAG=${EC2_METADATA_DOCKER_TAG:?"not found"}" +echo "EC2_METADATA_DOCKER_PULL_POLICY=${EC2_METADATA_DOCKER_PULL_POLICY:=$DOCKER_PULL_POLICY}" +echo "EC2_METADATA_PORT=${EC2_METADATA_PORT:=18999}" +echo "EC2_METADATA_URL=${EC2_METADATA_URL:="http://ec2-metadata-test-proxy.default.svc.cluster.local:$EC2_METADATA_PORT"}" +echo "WEBHOOK_URL=${WEBHOOK_URL:=$EC2_METADATA_URL}" + +# The e2e test scripts use other variable names. +echo "IMDS_PORT=${IMDS_PORT:=$EC2_METADATA_PORT}" +echo "INSTANCE_METADATA_URL=${INSTANCE_METADATA_URL:=$EC2_METADATA_URL}" + +export NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY \ + EC2_METADATA_DOCKER_PULL_POLICY \ + WEBHOOK_URL \ + IMDS_PORT \ + INSTANCE_METADATA_URL + +if ! command -v kubectl >/dev/null 2>&1; then + echo "🚫 Cannot find kubectl command" + exit 1 +fi +if ! kubectl get svc >/dev/null 2>&1; then + echo "🚫 kubectl failed test communication with cluster" + exit 1 +fi + +export TEST_NODE=$(kubectl get nodes -l kubernetes.io/os=windows | + tail -n +2 | + tr -s "\t" " " | + cut -d" " -f1) +echo "TEST_NODE=${TEST_NODE:?"not found"}" + +echo "TEST_WINDOWS=${TEST_WINDOWS:="false"}" + +function get_nth_worker_pod { + kubectl get pods -n kube-system -l k8s-app=aws-node-termination-handler | + tail -n +2 | + tr -s "\t" " " | + cut -d " " -f 1 +} +export -f get_nth_worker_pod + +function reset_cluster { + echo "-------------------------------------------------------------------------------------------------" + echo "🧹 Resetting cluster $CLUSTER_NAME" + echo "-------------------------------------------------------------------------------------------------" + ./reset-cluster + sleep ${1:-15} +} +if [[ $reset_cluster_only -eq 1 ]]; then + reset_cluster 1 + exit 0 +fi + +if [[ -z ${assertion_scripts+x} ]]; then + assertion_scripts=( + ../e2e/cordon-only-test + ../e2e/imds-v2-test + ../e2e/maintenance-event-cancellation-test + ../e2e/maintenance-event-dry-run-test + #../e2e/maintenance-event-reboot-test + ../e2e/maintenance-event-test + ../e2e/spot-interruption-dry-run-test + ../e2e/spot-interruption-test + #../e2e/webhook-http-proxy-test + ../e2e/webhook-test + ) +fi + +echo "Assertion script(s): ${assertion_scripts[@]}" + +for assertion_script in ${assertion_scripts[@]}; do + reset_cluster + echo "=================================================================================================" + echo "🥑 Running assertion script $(basename $assertion_script)" + echo "=================================================================================================" + assertion_start=$(date +%s) + $assertion_script + assertion_end=$(date +%s) + echo "⏰ Took $(expr $assertion_end - $assertion_start)sec" + echo "✅ Assertion test $assertion_script PASSED! 
✅" +done +reset_cluster + +echo "=====================================================================================================" +echo "✅ All tests passed! ✅" +echo "=====================================================================================================" +