From ccbe23f6edd04e3844c8ccb53a05b422d77ebcb6 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 17 Aug 2023 15:56:35 -0500 Subject: [PATCH 01/27] Added check of ASG lifecycle hook informing of an EC2 instance launch before ASG termination event --- pkg/monitor/sqsevent/sqs-monitor.go | 48 ++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index e028506c..0d9eea1c 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -148,7 +148,8 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri } return eventBridgeEvent, skip{err} - case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING": + case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && + lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": log.Err(err).Msg("only lifecycle termination events from ASG to SQS are supported outside EventBridge") err = fmt.Errorf("unsupported message type (%s)", message.String()) return eventBridgeEvent, err @@ -157,12 +158,49 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri eventBridgeEvent.Source = "aws.autoscaling" eventBridgeEvent.Time = lifecycleEvent.Time eventBridgeEvent.ID = lifecycleEvent.RequestID + eventBridgeEvent.DetailType = lifecycleEvent.LifecycleTransition eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) log.Debug().Msg("processing lifecycle termination event from ASG") return eventBridgeEvent, err } +// Receives and processes SQS messages to check for ASG lifecycle hook informing of an EC2 instance launch +func (m SQSMonitor) newASGInstanceLifeCycleReceived() (bool, error) { + newInstanceCreated := false + messages, err := m.receiveQueueMessages(m.QueueURL) + if err != nil { + log.Err(err).Msg("Error receiveing SQS queue messages.") + return false, err + } + + failedEventBridgeEvents := 0 + for _, message := range messages { + eventBridgeEvent, err := m.processSQSMessage(message) + if err != nil { + var s skip + if errors.As(err, &s) { + log.Warn().Err(s).Msg("skip processing SQS message") + } else { + log.Err(err).Msg("error processing SQS message") + } + continue + } + + if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { + log.Info().Msg("New EC2 instance created by ASG") + newInstanceCreated = true + break + } + } + + if len(messages) > 0 && failedEventBridgeEvents == len(messages) { + err = fmt.Errorf("none of the waiting queue events could be processed") + } + + return newInstanceCreated, err +} + // processEventBridgeEvent processes an EventBridge event and returns interruption event wrappers func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} @@ -171,6 +209,14 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, switch eventBridgeEvent.Source { case "aws.autoscaling": + newInstanceCreated, err := m.newASGInstanceLifeCycleReceived() + for !newInstanceCreated { + if err != nil { + log.Err(err) + break + } + newInstanceCreated, err = m.newASGInstanceLifeCycleReceived() + } interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) From ed2a317e1b18a2da9181a6a1c2dfc6a244b9df7e Mon 
Sep 17 00:00:00 2001 From: Gavin Burris Date: Mon, 21 Aug 2023 14:38:29 -0500 Subject: [PATCH 02/27] Completes ASG launch lifecycle hook if new node is ready in cluster --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 92 +++++++++++++++++++-- pkg/monitor/sqsevent/sqs-monitor.go | 50 ++--------- 2 files changed, 90 insertions(+), 52 deletions(-) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 5c088030..77f59eb3 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -14,8 +14,10 @@ package sqsevent import ( + "context" "encoding/json" "fmt" + "strings" "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-node-termination-handler/pkg/node" @@ -24,6 +26,10 @@ import ( "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog/log" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" ) /* Example SQS ASG Lifecycle Termination Event Message: @@ -92,13 +98,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m } interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { - _, err := m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ - AutoScalingGroupName: &lifecycleDetail.AutoScalingGroupName, - LifecycleActionResult: aws.String("CONTINUE"), - LifecycleHookName: &lifecycleDetail.LifecycleHookName, - LifecycleActionToken: &lifecycleDetail.LifecycleActionToken, - InstanceId: &lifecycleDetail.EC2InstanceID, - }) + _, err := m.continueLifecycleAction(lifecycleDetail) if err != nil { if aerr, ok := err.(awserr.RequestFailure); ok && aerr.StatusCode() != 400 { return err @@ -124,3 +124,81 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } + +func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (*autoscaling.CompleteLifecycleActionOutput, error) { + return m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ + AutoScalingGroupName: &lifecycleDetail.AutoScalingGroupName, + LifecycleActionResult: aws.String("CONTINUE"), + LifecycleHookName: &lifecycleDetail.LifecycleHookName, + LifecycleActionToken: &lifecycleDetail.LifecycleActionToken, + InstanceId: &lifecycleDetail.EC2InstanceID, + }) +} + +func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent) error { + lifecycleDetail := &LifecycleDetail{} + err := json.Unmarshal(event.Detail, lifecycleDetail) + if err != nil { + return err + } + + if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { + return skip{fmt.Errorf("message is an ASG test notification")} + } + + if m.isNodeReady(lifecycleDetail) { + _, err = m.continueLifecycleAction(lifecycleDetail) + } else { + err = skip{fmt.Errorf("New ASG instance has not connected to cluster")} + } + return err +} + +// If the Node, new EC2 instance, is ready in the K8s cluster +func (m SQSMonitor) isNodeReady(lifecycleDetail *LifecycleDetail) bool { + nodes, err := m.getNodes() + if err != nil { + return false + } + + for _, node := range nodes.Items { + instanceID := m.getInstanceID(node) + if instanceID != lifecycleDetail.EC2InstanceID { + break + } + + conditions := node.Status.Conditions + for _, condition := range conditions { + if condition.Type == "Ready" && 
condition.Status == "True" { + return true + } + } + } + return false +} + +// Gets Nodes connected to K8s cluster +func (m SQSMonitor) getNodes() (*v1.NodeList, error) { + clusterConfig, err := rest.InClusterConfig() + if err != nil { + return nil, err + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(clusterConfig) + if err != nil { + return nil, err + } + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return nil, err + } + return nodes, err +} + +// Gets EC2 InstanceID from ProviderID, format: aws:///$az/$instanceid +func (m SQSMonitor) getInstanceID(node v1.Node) string { + providerID := node.Spec.ProviderID + providerIDSplit := strings.Split(providerID, "/") + instanceID := providerIDSplit[len(providerID)-1] + return instanceID +} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 0d9eea1c..0898eb50 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -161,46 +161,10 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri eventBridgeEvent.DetailType = lifecycleEvent.LifecycleTransition eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) - log.Debug().Msg("processing lifecycle termination event from ASG") + log.Debug().Msg("processing lifecycle event from ASG") return eventBridgeEvent, err } -// Receives and processes SQS messages to check for ASG lifecycle hook informing of an EC2 instance launch -func (m SQSMonitor) newASGInstanceLifeCycleReceived() (bool, error) { - newInstanceCreated := false - messages, err := m.receiveQueueMessages(m.QueueURL) - if err != nil { - log.Err(err).Msg("Error receiveing SQS queue messages.") - return false, err - } - - failedEventBridgeEvents := 0 - for _, message := range messages { - eventBridgeEvent, err := m.processSQSMessage(message) - if err != nil { - var s skip - if errors.As(err, &s) { - log.Warn().Err(s).Msg("skip processing SQS message") - } else { - log.Err(err).Msg("error processing SQS message") - } - continue - } - - if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { - log.Info().Msg("New EC2 instance created by ASG") - newInstanceCreated = true - break - } - } - - if len(messages) > 0 && failedEventBridgeEvents == len(messages) { - err = fmt.Errorf("none of the waiting queue events could be processed") - } - - return newInstanceCreated, err -} - // processEventBridgeEvent processes an EventBridge event and returns interruption event wrappers func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} @@ -209,15 +173,11 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, switch eventBridgeEvent.Source { case "aws.autoscaling": - newInstanceCreated, err := m.newASGInstanceLifeCycleReceived() - for !newInstanceCreated { - if err != nil { - log.Err(err) - break - } - newInstanceCreated, err = m.newASGInstanceLifeCycleReceived() + if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { + err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) + } else if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_TERMINATING" { + interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } - interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) return append(interruptionEventWrappers, 
InterruptionEventWrapper{interruptionEvent, err}) case "aws.ec2": From 4f47cb1a533b03eb7a0f34cac625eaab196313e9 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Mon, 21 Aug 2023 14:46:47 -0500 Subject: [PATCH 03/27] Avoid processing of interuption event for launching --- pkg/monitor/sqsevent/sqs-monitor.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 0898eb50..33c979a7 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -175,6 +175,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, case "aws.autoscaling": if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) + interruptionEvent = nil } else if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } From 2415c4c292b91d4faa146cce9d2f5c69cc4fedaf Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 23 Aug 2023 16:15:22 -0500 Subject: [PATCH 04/27] Fixed logic flow for how ASG hooks were checked. Added error logs and error messages --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 29 ++++++++++++--------- pkg/monitor/sqsevent/sqs-monitor.go | 8 +++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 77f59eb3..cf6ba327 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -125,6 +125,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } +// Continues the lifecycle hook thereby indicating a successful action occured func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (*autoscaling.CompleteLifecycleActionOutput, error) { return m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ AutoScalingGroupName: &lifecycleDetail.AutoScalingGroupName, @@ -135,36 +136,38 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* }) } +// Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent) error { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return err + return fmt.Errorf("unmarshing ASG lifecycle event: %w", err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { return skip{fmt.Errorf("message is an ASG test notification")} } - if m.isNodeReady(lifecycleDetail) { + if isNodeReady(lifecycleDetail) { _, err = m.continueLifecycleAction(lifecycleDetail) } else { - err = skip{fmt.Errorf("New ASG instance has not connected to cluster")} + err = skip{fmt.Errorf("new ASG instance has not connected to cluster")} } return err } // If the Node, new EC2 instance, is ready in the K8s cluster -func (m SQSMonitor) isNodeReady(lifecycleDetail *LifecycleDetail) bool { - nodes, err := m.getNodes() +func isNodeReady(lifecycleDetail *LifecycleDetail) bool { + nodes, err := getNodes() if err != nil { + log.Err(fmt.Errorf("getting nodes from cluster: %w", err)) return false } for _, node := range nodes.Items { - instanceID := m.getInstanceID(node) + instanceID := getInstanceID(node) if instanceID != 
lifecycleDetail.EC2InstanceID { - break + continue } conditions := node.Status.Conditions @@ -173,30 +176,32 @@ func (m SQSMonitor) isNodeReady(lifecycleDetail *LifecycleDetail) bool { return true } } + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) } + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", lifecycleDetail.EC2InstanceID)) return false } // Gets Nodes connected to K8s cluster -func (m SQSMonitor) getNodes() (*v1.NodeList, error) { +func getNodes() (*v1.NodeList, error) { clusterConfig, err := rest.InClusterConfig() if err != nil { - return nil, err + return nil, fmt.Errorf("retreiving cluster config: %w", err) } // creates the clientset clientset, err := kubernetes.NewForConfig(clusterConfig) if err != nil { - return nil, err + return nil, fmt.Errorf("creating new clientset with config: %w", err) } nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { - return nil, err + return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) } return nodes, err } // Gets EC2 InstanceID from ProviderID, format: aws:///$az/$instanceid -func (m SQSMonitor) getInstanceID(node v1.Node) string { +func getInstanceID(node v1.Node) string { providerID := node.Spec.ProviderID providerIDSplit := strings.Split(providerID, "/") instanceID := providerIDSplit[len(providerID)-1] diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 33c979a7..c11a7e32 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -158,7 +158,6 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri eventBridgeEvent.Source = "aws.autoscaling" eventBridgeEvent.Time = lifecycleEvent.Time eventBridgeEvent.ID = lifecycleEvent.RequestID - eventBridgeEvent.DetailType = lifecycleEvent.LifecycleTransition eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) log.Debug().Msg("processing lifecycle event from ASG") @@ -169,14 +168,15 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} interruptionEvent := &monitor.InterruptionEvent{} - var err error + lifecycleEvent := LifecycleDetail{} + err := json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) switch eventBridgeEvent.Source { case "aws.autoscaling": - if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { + if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) interruptionEvent = nil - } else if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_TERMINATING" { + } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) From a28edd69b47b0187f5022ff61676a9c4dc2071d2 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 25 Aug 2023 11:45:01 -0500 Subject: [PATCH 05/27] Created ASG launch lifecyle test script --- test/e2e/asg-launch-lifecycle-sqs-test | 197 ++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100755 test/e2e/asg-launch-lifecycle-sqs-test diff --git a/test/e2e/asg-launch-lifecycle-sqs-test 
b/test/e2e/asg-launch-lifecycle-sqs-test new file mode 100755 index 00000000..8751fe9a --- /dev/null +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -0,0 +1,197 @@ +#!/bin/bash +set -euo pipefail + +# Available env vars: +# $TMP_DIR +# $CLUSTER_NAME +# $KUBECONFIG +# $NODE_TERMINATION_HANDLER_DOCKER_REPO +# $NODE_TERMINATION_HANDLER_DOCKER_TAG +# $WEBHOOK_DOCKER_REPO +# $WEBHOOK_DOCKER_TAG +# $AEMM_URL +# $AEMM_VERSION + + +function fail_and_exit { + echo "โŒ ASG Lifecycle SQS Test failed $CLUSTER_NAME โŒ" + exit "${1:-1}" +} + +echo "Starting ASG Lifecycle SQS Test for Node Termination Handler" +START_TIME=$(date -u +"%Y-%m-%dT%TZ") + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +common_helm_args=() + +localstack_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-localstack" + "$SCRIPTPATH/../../config/helm/localstack/" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set defaultRegion="${AWS_REGION}" + --wait +) + +set -x +helm "${localstack_helm_args[@]}" +set +x + +sleep 10 + +RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'" +localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \ + -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \ + | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }') +echo "๐Ÿฅ‘ Using localstack pod ${localstack_pod}" +run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}") +private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName') +instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId') +echo "๐Ÿฅ‘ Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}" +set -x +CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}" +queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl) + +echo "๐Ÿฅ‘ Created SQS Queue ${queue_url}" + +anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=foo + --set awsSecretAccessKey=bar + --set awsRegion="${AWS_REGION}" + --set awsEndpoint="http://localstack.default" + --set checkTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set queueURL="${queue_url}" + --wait +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-emtp" + "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" + --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" + --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait +) +[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set 
webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x + +TAINT_CHECK_CYCLES=15 +TAINT_CHECK_SLEEP=15 + +DEPLOYED=0 + +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "โœ… Verified regular-pod-test pod was scheduled and started!" + DEPLOYED=1 + break + fi + echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $DEPLOYED -eq 0 ]]; then + echo "โŒ regular-pod-test pod deployment failed" + fail_and_exit 2 +fi + +test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" +nodes=$(kubectl get nodes "${test_node}") + +if [[${#nodes[@]} -eq 0]]; then + echo "โŒ new instance was not found in the cluster" + fail_and_exit 2 +conditions=$(kubectl get nodes -o jsonpath='{.status.conditions}') +launched=0 + +for i in $conditions; do + if [[$i.type == "Ready" && $i.status == "True"]]; then + echo "โœ… Verified the new instance in ready in the cluster!" + launched=1 + fi +done + +if [[$launched -eq 0]]; then + echo "โŒ new instance" + +ASG_TERMINATE_EVENT=$(cat < /dev/null; then + echo "โœ… Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "โœ… Verified the regular-pod-test pod was evicted!" + echo "โœ… ASG Lifecycle SQS Test Passed $CLUSTER_NAME! โœ…" + exit 0 + fi + echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $cordoned -eq 0 ]]; then + echo "โŒ Worker node was not cordoned" +else + echo "โŒ regular-pod-test was not evicted" +fi + +fail_and_exit 1 From 12995c253473704b2aaa4f55097620d5206ed16d Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 8 Sep 2023 15:32:40 -0500 Subject: [PATCH 06/27] ASG Launch Lifecyle can be completed. 
ASG Terminate Lifecycle hook is not completed by NTH to allow for Capacity Rebalance --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 47 +++-- pkg/monitor/sqsevent/sqs-monitor.go | 22 ++- test/e2e/asg-launch-lifecycle-sqs-test | 197 -------------------- 3 files changed, 49 insertions(+), 217 deletions(-) delete mode 100755 test/e2e/asg-launch-lifecycle-sqs-test diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index cf6ba327..e03ba9b9 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -22,7 +22,6 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-node-termination-handler/pkg/node" "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog/log" @@ -56,6 +55,10 @@ import ( const TEST_NOTIFICATION = "autoscaling:TEST_NOTIFICATION" +type LifecycleDetailMessage struct { + Message interface{} `json:"Message"` +} + // LifecycleDetail provides the ASG lifecycle event details type LifecycleDetail struct { LifecycleActionToken string `json:"LifecycleActionToken"` @@ -98,15 +101,6 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m } interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { - _, err := m.continueLifecycleAction(lifecycleDetail) - if err != nil { - if aerr, ok := err.(awserr.RequestFailure); ok && aerr.StatusCode() != 400 { - return err - } - } - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) errs := m.deleteMessages([]*sqs.Message{message}) if errs != nil { return errs[0] @@ -125,6 +119,17 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } +func (m SQSMonitor) logAndDeleteLifecycle(lifecycleDetail *LifecycleDetail, message *sqs.Message) error { + log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", + lifecycleDetail.LifecycleHookName, + lifecycleDetail.EC2InstanceID) + errs := m.deleteMessages([]*sqs.Message{message}) + if errs != nil { + return errs[0] + } + return nil +} + // Continues the lifecycle hook thereby indicating a successful action occured func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (*autoscaling.CompleteLifecycleActionOutput, error) { return m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ @@ -137,22 +142,28 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* } // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent) error { +func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return fmt.Errorf("unmarshing ASG lifecycle event: %w", err) + return fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { - return skip{fmt.Errorf("message is an ASG test notification")} + return ignore{skip{fmt.Errorf("message is an ASG test notification")}} + } + + if !isNodeReady(lifecycleDetail) { + 
return ignore{skip{fmt.Errorf("new ASG instance has not connected to cluster")}} } - if isNodeReady(lifecycleDetail) { - _, err = m.continueLifecycleAction(lifecycleDetail) - } else { - err = skip{fmt.Errorf("new ASG instance has not connected to cluster")} + _, err = m.continueLifecycleAction(lifecycleDetail) + + if err != nil { + return ignore{skip{fmt.Errorf("completing ASG launch lifecyle: %w", err)}} } + + err = m.logAndDeleteLifecycle(lifecycleDetail, message) return err } @@ -204,6 +215,6 @@ func getNodes() (*v1.NodeList, error) { func getInstanceID(node v1.Node) string { providerID := node.Spec.ProviderID providerIDSplit := strings.Split(providerID, "/") - instanceID := providerIDSplit[len(providerID)-1] + instanceID := providerIDSplit[len(providerIDSplit)-1] return instanceID } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index c11a7e32..7e7691d6 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -72,6 +72,18 @@ func (s skip) Unwrap() error { return s.err } +type ignore struct { + err error +} + +func (i ignore) Error() string { + return i.err.Error() +} + +func (i ignore) Unwrap() error { + return i.err +} + // Kind denotes the kind of monitor func (m SQSMonitor) Kind() string { return SQSMonitorKind @@ -133,8 +145,10 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { eventBridgeEvent := EventBridgeEvent{} + lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(*message.Body), &lifecycleEvent) + err := json.Unmarshal([]byte(*message.Body), &lifecycleEventMessage) + err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) switch { case err != nil: @@ -174,7 +188,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, switch eventBridgeEvent.Source { case "aws.autoscaling": if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) + err = m.asgCompleteLaunchLifecycle(eventBridgeEvent, message) interruptionEvent = nil } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) @@ -206,9 +220,13 @@ func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []Interr dropMessageSuggestionCount := 0 failedInterruptionEventsCount := 0 var skipErr skip + var ignoreErr ignore for _, eventWrapper := range interruptionEventWrappers { switch { + case errors.As(eventWrapper.Err, &ignoreErr): + log.Warn().Err(ignoreErr).Msg("ASG launch cycle not continued") + case errors.As(eventWrapper.Err, &skipErr): log.Warn().Err(skipErr).Msg("dropping event") dropMessageSuggestionCount++ diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test deleted file mode 100755 index 8751fe9a..00000000 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Available env vars: -# $TMP_DIR -# $CLUSTER_NAME -# $KUBECONFIG -# $NODE_TERMINATION_HANDLER_DOCKER_REPO -# $NODE_TERMINATION_HANDLER_DOCKER_TAG -# $WEBHOOK_DOCKER_REPO -# $WEBHOOK_DOCKER_TAG -# $AEMM_URL -# 
$AEMM_VERSION - - -function fail_and_exit { - echo "โŒ ASG Lifecycle SQS Test failed $CLUSTER_NAME โŒ" - exit "${1:-1}" -} - -echo "Starting ASG Lifecycle SQS Test for Node Termination Handler" -START_TIME=$(date -u +"%Y-%m-%dT%TZ") - -SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" - -common_helm_args=() - -localstack_helm_args=( - upgrade - --install - --namespace default - "$CLUSTER_NAME-localstack" - "$SCRIPTPATH/../../config/helm/localstack/" - --set nodeSelector."${NTH_CONTROL_LABEL}" - --set defaultRegion="${AWS_REGION}" - --wait -) - -set -x -helm "${localstack_helm_args[@]}" -set +x - -sleep 10 - -RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'" -localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \ - -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \ - | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }') -echo "๐Ÿฅ‘ Using localstack pod ${localstack_pod}" -run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}") -private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName') -instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId') -echo "๐Ÿฅ‘ Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}" -set -x -CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}" -queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl) - -echo "๐Ÿฅ‘ Created SQS Queue ${queue_url}" - -anth_helm_args=( - upgrade - --install - --namespace kube-system - "$CLUSTER_NAME-acth" - "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" - --set nodeSelector."${NTH_CONTROL_LABEL}" - --set tolerations[0].operator=Exists - --set awsAccessKeyID=foo - --set awsSecretAccessKey=bar - --set awsRegion="${AWS_REGION}" - --set awsEndpoint="http://localstack.default" - --set checkTagBeforeDraining=false - --set enableSqsTerminationDraining=true - --set queueURL="${queue_url}" - --wait -) -[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && - anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") -[[ ${#common_helm_args[@]} -gt 0 ]] && - anth_helm_args+=("${common_helm_args[@]}") - -set -x -helm "${anth_helm_args[@]}" -set +x - -emtp_helm_args=( - upgrade - --install - --namespace default - "$CLUSTER_NAME-emtp" - "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" - --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" - --wait -) -[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && - emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") -[[ ${#common_helm_args[@]} -gt 0 ]] && - emtp_helm_args+=("${common_helm_args[@]}") - -set -x -helm "${emtp_helm_args[@]}" -set +x - -TAINT_CHECK_CYCLES=15 -TAINT_CHECK_SLEEP=15 - -DEPLOYED=0 - -for i in $(seq 1 $TAINT_CHECK_CYCLES); do - if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then - echo "โœ… Verified 
regular-pod-test pod was scheduled and started!" - DEPLOYED=1 - break - fi - echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" - sleep $TAINT_CHECK_SLEEP -done - -if [[ $DEPLOYED -eq 0 ]]; then - echo "โŒ regular-pod-test pod deployment failed" - fail_and_exit 2 -fi - -test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -nodes=$(kubectl get nodes "${test_node}") - -if [[${#nodes[@]} -eq 0]]; then - echo "โŒ new instance was not found in the cluster" - fail_and_exit 2 -conditions=$(kubectl get nodes -o jsonpath='{.status.conditions}') -launched=0 - -for i in $conditions; do - if [[$i.type == "Ready" && $i.status == "True"]]; then - echo "โœ… Verified the new instance in ready in the cluster!" - launched=1 - fi -done - -if [[$launched -eq 0]]; then - echo "โŒ new instance" - -ASG_TERMINATE_EVENT=$(cat < /dev/null; then - echo "โœ… Verified the worker node was cordoned!" - cordoned=1 - fi - - if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "โœ… Verified the regular-pod-test pod was evicted!" - echo "โœ… ASG Lifecycle SQS Test Passed $CLUSTER_NAME! โœ…" - exit 0 - fi - echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" - sleep $TAINT_CHECK_SLEEP -done - -if [[ $cordoned -eq 0 ]]; then - echo "โŒ Worker node was not cordoned" -else - echo "โŒ regular-pod-test was not evicted" -fi - -fail_and_exit 1 From 000aa2c630534e598fcd0a3de46913ae6ba2786e Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 19 Sep 2023 12:29:41 -0500 Subject: [PATCH 07/27] Created bash script for testing ASG lifecycle hook completion Add functionality for NTH to catch and complete ASG launch lifecycle hooks. Created acceptance test script to test ASG launch lifecycle hook completion --- test/e2e/asg-launch-lifecycle-sqs-test | 355 +++++++++++++++++++++ test/eks-cluster-test/node_group-spec.yaml | 15 + 2 files changed, 370 insertions(+) create mode 100755 test/e2e/asg-launch-lifecycle-sqs-test create mode 100644 test/eks-cluster-test/node_group-spec.yaml diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test new file mode 100755 index 00000000..db7013f1 --- /dev/null +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -0,0 +1,355 @@ +#!/bin/bash +set -euo pipefail + +REGION="us-west-2" +CLUSTER_NAME="nth-eks-cluster-test" + +node_group_name="nth-eks-cluster-test-spot-ng" +sqs_queue_name="nth-sqs-test" +sns_topic_name="nth-sns-test" +auto_scaling_role_name="AWSServiceRoleForAutoScaling_nth-test" +auto_scaling_policy_arn="arn:aws:iam::aws:policy/aws-service-role/AutoScalingServiceRolePolicy" +fis_role_name="nth-test-fis-role" +fis_template_name="nth-fis-test" +fis_policy_arn="arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access" +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" +NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" +account_id=$(aws sts get-caller-identity | jq -r '.Account') +nth_label="Use-Case=NTH" + + +## Queue Policy +QUEUE_POLICY=$(cat < /tmp/queue-attributes.json +{ +"MessageRetentionPeriod": "300", +"Policy": "$(echo $QUEUE_POLICY | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", +"SqsManagedSseEnabled": "true" +} +EOF + +queue_url=$(aws sqs create-queue --queue-name "${sqs_queue_name}" --attributes file:///tmp/queue-attributes.json | jq -r .QueueUrl) +} + +function provision_sqs_queue { + queue_exists=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name) + if [[ -z 
$queue_exists ]]; then + echo "๐Ÿฅ‘ Provisioning SQS Queue" + create_queue + else + echo "๐Ÿฅ‘ $sqs_queue_name already exists; continuing with test run" + queue_url=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name | jq -r '.QueueUrls | .[0]') + fi + + echo "Queue URL: $queue_url" + sqs_arn=$(aws sqs get-queue-attributes --queue-url=$queue_url --attribute-names=QueueArn | jq -r .Attributes.QueueArn) +} + +function provision_sns_topic { + topic_exists=$(aws sns list-topics | grep "$sns_topic_name" || :) + if [[ -z $topic_exists ]]; then + echo "๐Ÿฅ‘ Provisioning SNS Topic" + sns_arn=$(aws sns create-topic --name $sns_topic_name | jq -r .TopicArn) + else + echo "๐Ÿฅ‘ $sns_topic_name already exists; continuing with test run" + sns_arn=$(aws sns list-topics | jq -r '.Topics | .[] | .TopicArn' | grep "nth-sns-test") + fi + echo "SNS ARN: $sns_arn" +} + +function subscribe_sqs_to_sns { + num_subscriptions=$(aws sns list-subscriptions-by-topic --topic-arn $sns_arn | jq '.Subscriptions | length') + if [[ $num_subscriptions -eq 0 ]]; then + echo "๐Ÿฅ‘ Subscribing $sns_topic_name to $sqs_queue_name" + aws sns subscribe --topic-arn $sns_arn --protocol sqs --notification-endpoint $sqs_arn + else + echo "๐Ÿฅ‘ $sns_topic_name already subscribed to $sqs_queue_name; continuing with test run" + fi +} + +function install_helm { + anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set image.pullPolicy="Always" + --set nodeSelector."${nth_label}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=foo + --set awsSecretAccessKey=bar + --set awsRegion="${REGION}" + --set checkTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set queueURL="${queue_url}" + --wait + ) + + set -x + helm "${anth_helm_args[@]}" + set +x + + sleep 15 +} + +function provision_node_group { + node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) + if [[ -z $node_group_exists ]]; then + echo "๐Ÿฅ‘ Provisioning Spot Node Group" + else + echo "๐Ÿฅ‘ Re-initializing $node_group_name for testing purposes" + eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + echo "" + + node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) + echo -n "Node group Deleting." + while [[ -n $node_group_exists ]]; do + echo -n "." 
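+        # Poll until eksctl no longer reports the node group, then recreate it below so each
+        # test run starts from a freshly provisioned ASG (per the "Re-initializing ... for
+        # testing purposes" message above). Comment added for clarity; not part of the original patch.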
+ node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) + sleep 10 + done + echo "" + sleep 20 + # asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') + fi + + eksctl create nodegroup --config-file=$NODE_GROUP_CONFIG_FILE + update_ASG + + instance_ids=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_name | jq -r '.AutoScalingGroups | .[0].Instances | .[].InstanceId') + instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[] | .Instances | .[].InstanceId, .[].PrivateDnsName]') + + nth_node_id=$(jq -r '.[0]' <<< $instance_data) + nth_node_ip=$(jq -r '.[1]' <<< $instance_data) + termination_node_id=$(jq -r '.[2]' <<< $instance_data) + termination_node_ip=$(jq -r '.[3]' <<< $instance_data) + + kubectl label nodes $nth_node_ip $nth_label +} + +function update_ASG { + asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') + echo "Auto Scaling Group: $asg_name" + + echo "๐Ÿฅ‘ Setting Capacity Rebalance" + aws autoscaling update-auto-scaling-group --auto-scaling-group-name $asg_name --capacity-rebalance + echo "๐Ÿฅ‘ Tagging ASG" + aws autoscaling create-or-update-tags --tags ResourceId=$asg_name,ResourceType=auto-scaling-group,Key=aws-node-termination-handler/managed,Value=,PropagateAtLaunch=true + + create_auto_scaling_role + echo "๐Ÿฅ‘ Creating Lifecycle Hooks" + aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Launch-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_LAUNCHING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="ABANDON" + aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Terminate-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_TERMINATING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="CONTINUE" +} + +function create_auto_scaling_role { + auto_scaling_role_exists=$(aws iam get-role --role-name=$auto_scaling_role_name | grep "$auto_scaling_role_name" || :) + if [[ -z $auto_scaling_role_exists ]]; then + echo "๐Ÿฅ‘ Creating Auto Scaling Role" + auto_scaling_role_arn=$(aws iam create-service-linked-role --aws-service-name autoscaling.amazonaws.com --custom-suffix "nth-test" | jq -r '.Role.Arn') + sleep 10 + else + echo "๐Ÿฅ‘ $auto_scaling_role_name already exists; continuing with test run" + auto_scaling_role_arn=$(aws iam get-role --role-name=$auto_scaling_role_name | jq -r '.Role.Arn') + fi +} + +function create_FIS_role { + fis_role_exists=$(aws iam get-role --role-name $fis_role_name | grep "$fis_role_name" || :) + if [[ -z $fis_role_exists ]]; then + cat << EOF > /tmp/fis-role-trust-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "fis.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] +} +EOF + echo "๐Ÿฅ‘ Creating FIS Role" + fis_role_arn=$(aws iam create-role --role-name $fis_role_name --assume-role-policy-document file:///tmp/fis-role-trust-policy.json | jq -r '.Role.Arn') + aws iam attach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn + sleep 10 + else + echo "๐Ÿฅ‘ $fis_role_name already exists; continuing with test run" + fis_role_arn=$(aws iam get-role --role-name=$fis_role_name 
| jq -r '.Role.Arn') + fi +} + +function create_experiment_template { + experiment_exists=$(aws fis list-experiment-templates | grep "$fis_template_name" || :) + if [[ -z $experiment_exists ]]; then + create_FIS_role + cat << EOF > /tmp/fis-experiment-template.json +{ + "description": "Test Spot Instance interruptions", + "targets": { + "oneSpotInstance": { + "resourceType": "aws:ec2:spot-instance", + "resourceTags": { + "Name": "interruptMe" + }, + "filters": [ + { + "path": "State.Name", + "values": [ + "running" + ] + } + ], + "selectionMode": "COUNT(1)" + } + }, + "actions": { + "interruptSpotInstance": { + "actionId": "aws:ec2:send-spot-instance-interruptions", + "parameters": { + "durationBeforeInterruption": "PT2M" + }, + "targets": { + "SpotInstances": "oneSpotInstance" + } + } + }, + "stopConditions": [ + { + "source": "none" + } + ], + "roleArn": "$fis_role_arn", + "tags": { + "Name": "$fis_template_name" + } +} +EOF + echo "๐Ÿฅ‘ Creating experiment template" + template_id=$(aws fis create-experiment-template --cli-input-json file:///tmp/fis-experiment-template.json | jq -r .experimentTemplate.id) + echo "Template_ID: $template_id" + else + template_id=$(aws fis list-experiment-templates | jq -r --arg template_name $fis_template_name '.experimentTemplates | .[] | select(.tags | has("Name")) | select(.tags.Name | contains($template_name)) | .id') + echo "๐Ÿฅ‘ $fis_template_name already exists; continuing with test run" + fi +} + +function start_FIS_experiment { + create_experiment_template + echo "๐Ÿฅ‘ Starting Experiment" + experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') +} + +function create_tags { + echo "๐Ÿฅ‘ Creating instance tags" + instance_id_string=$(tr '\n' ' ' <<< ${instance_ids}) + eval 'aws ec2 create-tags --resources'" $instance_id_string "'--tags 'Key="aws-node-termination-handler/managed",Value='' + aws ec2 create-tags --resources "${termination_node_id}" --tags Key=Name,Value=interruptMe +} + +function is_new_instance { + is_new_instance="" + if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + is_new_instance=false + else + is_new_instance="" + fi +} + +function get_launch_activity { + launch_activity="" + while [[ -z $launch_activity ]]; do + sleep 5 + activities=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name) + activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode]' <<< $activities) + num_activities=$(jq -r 'length' <<< $activities_details) + for i in $(seq 0 3 $((--num_activities))); do + id=$(jq -r .[$i] <<< $activities_details) + description=$(jq -r .[$((++i))] <<< $activities_details) + status=$(jq -r .[$((i+=2))] <<< $activities_details) + activity_instance=${description##*:} + is_new_instance $activity_instance + if [[ $description =~ .*"Launching".* && -z $is_new_instance ]]; then + launch_activity=$id + echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" + break + fi + done + done +} + +function test_launch_lifecycle { + echo -n "๐Ÿฅ‘ Waiting for launch hook completion." 
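+    # The launch scaling activity stays in progress until either NTH completes the
+    # EC2_INSTANCE_LAUNCHING hook with CONTINUE (activity ends in "Success") or the
+    # 180s heartbeat configured in update_ASG expires and the hook's default result,
+    # ABANDON, is applied (activity ends in "Cancelled"). Comment added for clarity;
+    # not part of the original patch.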
+ while [[ true ]]; do + activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[] | .StatusCode') + if [[ $activity_status == "Success" ]]; then + echo "" + echo "โœ… Launch Lifecycle Successfully Completed โœ…" + # exit 0 + break + fi + + if [[ $activity_status == "Cancelled" ]]; then + echo "" + echo "โŒ Launch Lifecycle Failed โŒ" + # exit 1 + break + fi + echo -n "." + sleep 10 + done +} + +function clean_up { + echo "=====================================================================================================" + echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" + echo "=====================================================================================================" + helm uninstall nth-eks-cluster-test-acth -n kube-system + eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + aws sqs delete-queue --queue-url $queue_url + aws sns delete-topic --topic-arn $sns_arn + deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn + aws iam delete-role --role-name $fis_role_name + aws iam delete-service-linked-role --role-name $auto_scaling_role_name +} + +function main { + provision_sqs_queue + provision_sns_topic + subscribe_sqs_to_sns + provision_node_group + install_helm + create_tags + start_FIS_experiment + get_launch_activity + test_launch_lifecycle +} + +main +trap "clean_up" EXIT diff --git a/test/eks-cluster-test/node_group-spec.yaml b/test/eks-cluster-test/node_group-spec.yaml new file mode 100644 index 00000000..2fa39a78 --- /dev/null +++ b/test/eks-cluster-test/node_group-spec.yaml @@ -0,0 +1,15 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: nth-eks-cluster-test + region: us-west-2 +managedNodeGroups: + - name: nth-eks-cluster-test-spot-ng + instanceType: t3.medium + amiFamily: AmazonLinux2 + desiredCapacity: 2 + minSize: 2 + maxSize: 2 + spot: true +iam: + withOIDC: true \ No newline at end of file From 75ed706f8b99e5d5f290ae7360927e78b1b861d5 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 5 Oct 2023 16:42:15 -0500 Subject: [PATCH 08/27] E2E tests for ASG launch lifecycle hook completion is complete --- test/e2e/asg-launch-lifecycle-sqs-test | 341 ++++++++++++++++--------- 1 file changed, 216 insertions(+), 125 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index db7013f1..95d80a6f 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -7,6 +7,7 @@ CLUSTER_NAME="nth-eks-cluster-test" node_group_name="nth-eks-cluster-test-spot-ng" sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" +node_policy_name="nth-test-node-policy" auto_scaling_role_name="AWSServiceRoleForAutoScaling_nth-test" auto_scaling_policy_arn="arn:aws:iam::aws:policy/aws-service-role/AutoScalingServiceRolePolicy" fis_role_name="nth-test-fis-role" @@ -17,9 +18,10 @@ NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" account_id=$(aws sts get-caller-identity | jq -r '.Account') nth_label="Use-Case=NTH" +##### JSON FILES ##### -## Queue Policy -QUEUE_POLICY=$(cat < /tmp/queue-attributes.json +cat << EOF > /tmp/sqs-subscription-policy.json +{ + "Policy": 
"{\"Version\":\"2012-10-17\",\"Id\":\"MyQueuePolicy\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"Service\":[\"events.amazonaws.com\",\"sqs.amazonaws.com\"]},\"Action\":\"sqs:SendMessage\",\"Resource\":\"arn:aws:sqs:${REGION}:${account_id}:${sqs_queue_name}\"},{\"Sid\":\"topic-subscription-arn:aws:sns:${REGION}:${account_id}:${sns_topic_name}\",\"Effect\":\"Allow\",\"Principal\":{\"AWS\":\"*\"},\"Action\":\"SQS:SendMessage\",\"Resource\":\"arn:aws:sqs:${REGION}:${account_id}:${sqs_queue_name}\",\"Condition\":{\"ArnLike\":{\"aws:SourceArn\":\"arn:aws:sns:${REGION}:${account_id}:${sns_topic_name}\"}}}]}" +} +EOF + +cat << EOF > /tmp/queue-attributes.json { "MessageRetentionPeriod": "300", -"Policy": "$(echo $QUEUE_POLICY | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", +"Policy": "$(echo $sqs_queue_policy | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", "SqsManagedSseEnabled": "true" } EOF -queue_url=$(aws sqs create-queue --queue-name "${sqs_queue_name}" --attributes file:///tmp/queue-attributes.json | jq -r .QueueUrl) +### NODEGROUP ### +cat << EOF > /tmp/nth-nodegroup-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:CompleteLifecycleAction", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeTags", + "ec2:DescribeInstances", + "sqs:DeleteMessage", + "sqs:ReceiveMessage" + ], + "Resource": "*" + } + ] } +EOF +### FIS ### +cat << EOF > /tmp/fis-role-trust-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "fis.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] +} +EOF + +function create_FIS_Template_JSON { +cat << EOF > /tmp/fis-experiment-template.json +{ + "description": "Test Spot Instance interruptions", + "targets": { + "oneSpotInstance": { + "resourceType": "aws:ec2:spot-instance", + "resourceTags": { + "Name": "interruptMe" + }, + "filters": [ + { + "path": "State.Name", + "values": [ + "running" + ] + } + ], + "selectionMode": "COUNT(1)" + } + }, + "actions": { + "interruptSpotInstance": { + "actionId": "aws:ec2:send-spot-instance-interruptions", + "parameters": { + "durationBeforeInterruption": "PT2M" + }, + "targets": { + "SpotInstances": "oneSpotInstance" + } + } + }, + "stopConditions": [ + { + "source": "none" + } + ], + "roleArn": "$fis_role_arn", + "tags": { + "Name": "$fis_template_name" + } +} +EOF +} + + +##### SETUP ##### + +### SQS ### function provision_sqs_queue { queue_exists=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name) if [[ -z $queue_exists ]]; then echo "๐Ÿฅ‘ Provisioning SQS Queue" - create_queue + queue_url=$(aws sqs create-queue --queue-name "${sqs_queue_name}" --attributes file:///tmp/queue-attributes.json | jq -r .QueueUrl) else echo "๐Ÿฅ‘ $sqs_queue_name already exists; continuing with test run" queue_url=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name | jq -r '.QueueUrls | .[0]') fi - - echo "Queue URL: $queue_url" sqs_arn=$(aws sqs get-queue-attributes --queue-url=$queue_url --attribute-names=QueueArn | jq -r .Attributes.QueueArn) + aws sqs set-queue-attributes --queue-url $queue_url --attributes file:///tmp/sqs-subscription-policy.json } +### SNS ### function provision_sns_topic { topic_exists=$(aws sns list-topics | grep "$sns_topic_name" || :) if [[ -z $topic_exists ]]; then @@ -70,56 +162,28 @@ function provision_sns_topic { sns_arn=$(aws sns create-topic --name $sns_topic_name | jq -r .TopicArn) else echo "๐Ÿฅ‘ $sns_topic_name already exists; continuing with test run" - 
sns_arn=$(aws sns list-topics | jq -r '.Topics | .[] | .TopicArn' | grep "nth-sns-test") + sns_arn=$(aws sns list-topics | jq -r '.Topics | .[].TopicArn' | grep "$sns_topic_name") fi - echo "SNS ARN: $sns_arn" } function subscribe_sqs_to_sns { num_subscriptions=$(aws sns list-subscriptions-by-topic --topic-arn $sns_arn | jq '.Subscriptions | length') if [[ $num_subscriptions -eq 0 ]]; then echo "๐Ÿฅ‘ Subscribing $sns_topic_name to $sqs_queue_name" - aws sns subscribe --topic-arn $sns_arn --protocol sqs --notification-endpoint $sqs_arn + subscription_arn=$(aws sns subscribe --topic-arn $sns_arn --protocol sqs --notification-endpoint $sqs_arn | jq -r .SubscriptionArn) else echo "๐Ÿฅ‘ $sns_topic_name already subscribed to $sqs_queue_name; continuing with test run" + subscription_arn=$(aws sns list-subscriptions-by-topic --topic-arn $sns_arn | jq -r '.Subscriptions | .[0].SubscriptionArn') fi } -function install_helm { - anth_helm_args=( - upgrade - --install - --namespace kube-system - "$CLUSTER_NAME-acth" - "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" - --set image.pullPolicy="Always" - --set nodeSelector."${nth_label}" - --set tolerations[0].operator=Exists - --set awsAccessKeyID=foo - --set awsSecretAccessKey=bar - --set awsRegion="${REGION}" - --set checkTagBeforeDraining=false - --set enableSqsTerminationDraining=true - --set queueURL="${queue_url}" - --wait - ) - - set -x - helm "${anth_helm_args[@]}" - set +x - - sleep 15 -} - +### NODEGROUP ### function provision_node_group { + create_node_policy node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - if [[ -z $node_group_exists ]]; then - echo "๐Ÿฅ‘ Provisioning Spot Node Group" - else - echo "๐Ÿฅ‘ Re-initializing $node_group_name for testing purposes" - eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + if [[ -n $node_group_exists ]]; then + get_node_role_name + delete_node_group echo "" node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) @@ -131,26 +195,51 @@ function provision_node_group { done echo "" sleep 20 - # asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') fi + echo "๐Ÿฅ‘ Provisioning Spot Node Group" eksctl create nodegroup --config-file=$NODE_GROUP_CONFIG_FILE + + echo "๐Ÿฅ‘ Attaching Node policy to Node role" + get_node_role_name + aws iam attach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn + update_ASG + set_node_data + kubectl label nodes $nth_node_ip $nth_label +} + +function create_node_policy { + node_policy_exists=$(aws iam list-policies | grep "$node_policy_name" || :) + if [[ -z $node_policy_exists ]]; then + echo "๐Ÿฅ‘ Creating Node policy" + node_policy_arn=$(aws iam create-policy --policy-name $node_policy_name --policy-document file:///tmp/nth-nodegroup-policy.json | jq -r .Policy.Arn) + else + echo "๐Ÿฅ‘ $node_policy_name already exists; continuing with test run" + node_policy_arn=$(aws iam list-policies | jq -r --arg policy_name $node_policy_name '.Policies | .[] | select(.PolicyName | contains($policy_name)) | .Arn') + fi + + sleep 10 +} + +function get_node_role_name { + node_role_arn=$(aws eks describe-nodegroup --cluster-name $CLUSTER_NAME --nodegroup-name $node_group_name | jq -r .nodegroup.nodeRole) + split_node_role_arn=($(tr '/' ' ' <<< $node_role_arn)) + 
node_role_name=${split_node_role_arn[1]} +} +function set_node_data { instance_ids=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_name | jq -r '.AutoScalingGroups | .[0].Instances | .[].InstanceId') - instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[] | .Instances | .[].InstanceId, .[].PrivateDnsName]') + instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[].Instances | .[].InstanceId, .[].PrivateDnsName]') nth_node_id=$(jq -r '.[0]' <<< $instance_data) nth_node_ip=$(jq -r '.[1]' <<< $instance_data) termination_node_id=$(jq -r '.[2]' <<< $instance_data) termination_node_ip=$(jq -r '.[3]' <<< $instance_data) - - kubectl label nodes $nth_node_ip $nth_label } function update_ASG { asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') - echo "Auto Scaling Group: $asg_name" echo "๐Ÿฅ‘ Setting Capacity Rebalance" aws autoscaling update-auto-scaling-group --auto-scaling-group-name $asg_name --capacity-rebalance @@ -175,25 +264,40 @@ function create_auto_scaling_role { fi } +### HELM ### +function install_helm { + + anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set image.pullPolicy="Always" + --set nodeSelector."${nth_label}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=$(aws --profile default configure get aws_access_key_id) + --set awsSecretAccessKey=$(aws --profile default configure get aws_secret_access_key) + --set awsRegion="${REGION}" + --set checkTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set queueURL="${queue_url}" + --wait + ) + + set -x + helm "${anth_helm_args[@]}" + set +x + + sleep 15 +} + +### FIS ### function create_FIS_role { fis_role_exists=$(aws iam get-role --role-name $fis_role_name | grep "$fis_role_name" || :) if [[ -z $fis_role_exists ]]; then - cat << EOF > /tmp/fis-role-trust-policy.json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": [ - "fis.amazonaws.com" - ] - }, - "Action": "sts:AssumeRole" - } - ] -} -EOF echo "๐Ÿฅ‘ Creating FIS Role" fis_role_arn=$(aws iam create-role --role-name $fis_role_name --assume-role-policy-document file:///tmp/fis-role-trust-policy.json | jq -r '.Role.Arn') aws iam attach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn @@ -207,64 +311,15 @@ EOF function create_experiment_template { experiment_exists=$(aws fis list-experiment-templates | grep "$fis_template_name" || :) if [[ -z $experiment_exists ]]; then - create_FIS_role - cat << EOF > /tmp/fis-experiment-template.json -{ - "description": "Test Spot Instance interruptions", - "targets": { - "oneSpotInstance": { - "resourceType": "aws:ec2:spot-instance", - "resourceTags": { - "Name": "interruptMe" - }, - "filters": [ - { - "path": "State.Name", - "values": [ - "running" - ] - } - ], - "selectionMode": "COUNT(1)" - } - }, - "actions": { - "interruptSpotInstance": { - "actionId": "aws:ec2:send-spot-instance-interruptions", - "parameters": { - "durationBeforeInterruption": "PT2M" - }, - "targets": { - "SpotInstances": "oneSpotInstance" - } - } - }, - "stopConditions": [ - { - "source": "none" - } - ], - "roleArn": "$fis_role_arn", - "tags": { - 
"Name": "$fis_template_name" - } -} -EOF + create_FIS_Template_JSON echo "๐Ÿฅ‘ Creating experiment template" template_id=$(aws fis create-experiment-template --cli-input-json file:///tmp/fis-experiment-template.json | jq -r .experimentTemplate.id) - echo "Template_ID: $template_id" else template_id=$(aws fis list-experiment-templates | jq -r --arg template_name $fis_template_name '.experimentTemplates | .[] | select(.tags | has("Name")) | select(.tags.Name | contains($template_name)) | .id') echo "๐Ÿฅ‘ $fis_template_name already exists; continuing with test run" fi } -function start_FIS_experiment { - create_experiment_template - echo "๐Ÿฅ‘ Starting Experiment" - experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') -} - function create_tags { echo "๐Ÿฅ‘ Creating instance tags" instance_id_string=$(tr '\n' ' ' <<< ${instance_ids}) @@ -272,6 +327,16 @@ function create_tags { aws ec2 create-tags --resources "${termination_node_id}" --tags Key=Name,Value=interruptMe } +function start_FIS_experiment { + create_tags + create_FIS_role + create_experiment_template + echo "๐Ÿฅ‘ Starting Experiment" + experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') +} + + +##### TESTING ##### function is_new_instance { is_new_instance="" if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then @@ -304,20 +369,21 @@ function get_launch_activity { } function test_launch_lifecycle { + aws sqs receive-message --queue-url $queue_url echo -n "๐Ÿฅ‘ Waiting for launch hook completion." while [[ true ]]; do - activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[] | .StatusCode') - if [[ $activity_status == "Success" ]]; then + activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[].StatusCode') + if [[ $activity_status == "Successful" ]]; then echo "" echo "โœ… Launch Lifecycle Successfully Completed โœ…" - # exit 0 + exit_policy="exit 0" break fi if [[ $activity_status == "Cancelled" ]]; then echo "" echo "โŒ Launch Lifecycle Failed โŒ" - # exit 1 + exit_policy="exit 1" break fi echo -n "." 
@@ -325,31 +391,56 @@ function test_launch_lifecycle { done } + +##### CLEAN UP ##### function clean_up { echo "=====================================================================================================" echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" - helm uninstall nth-eks-cluster-test-acth -n kube-system - eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + echo "๐Ÿฅ‘ Uninstalling NTH helm chart" + helm uninstall "$CLUSTER_NAME-acth" -n kube-system + delete_node_group + echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" + aws sns unsubscribe --subscription-arn $subscription_arn + echo "๐Ÿฅ‘ Deleting SQS queue" aws sqs delete-queue --queue-url $queue_url + echo "๐Ÿฅ‘ Deleting SNS topic" aws sns delete-topic --topic-arn $sns_arn + echo "๐Ÿฅ‘ Deleting FIS experiment template" deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + echo "๐Ÿฅ‘ Detaching FIS role policy" aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn + echo "๐Ÿฅ‘ Deleting FIS role" aws iam delete-role --role-name $fis_role_name + echo "๐Ÿฅ‘ Deleting autoscaling role" aws iam delete-service-linked-role --role-name $auto_scaling_role_name + echo "๐Ÿฅ‘ Deleting Node role policy" + aws iam delete-policy --policy-arn $node_policy_arn +} + +function delete_node_group { + echo "Node Role Name: $node_role_name" + node_policy_exists=$(aws iam list-attached-role-policies --role-name $node_role_name | grep "$node_policy_name" || :) + echo $node_policy_exists + if [[ -n $node_policy_exists ]]; then + echo "๐Ÿฅ‘ Detaching NTH Node Group policy" + aws iam detach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn + fi + echo "๐Ÿฅ‘ Deleting NTH Node Group" + eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve } function main { provision_sqs_queue - provision_sns_topic + provision_sns_topic subscribe_sqs_to_sns provision_node_group install_helm - create_tags start_FIS_experiment get_launch_activity test_launch_lifecycle + trap "clean_up" EXIT + eval $exit_policy } main -trap "clean_up" EXIT From aed12d161313d5d9e7e0693547759ce0f1d3841f Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 1 Nov 2023 12:46:43 -0500 Subject: [PATCH 09/27] Added bash script to run test files. 
Revised method names and placemetns in ASG launch lifecycle calls --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 21 ++++++++------------- pkg/monitor/sqsevent/sqs-monitor.go | 13 ++++++++++--- test/e2e/asg-launch-lifecycle-sqs-test | 8 +++----- test/eks-cluster-test/run-test | 1 + test/k8s-local-cluster-test/run-test | 14 ++++++++++++++ 5 files changed, 36 insertions(+), 21 deletions(-) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index e03ba9b9..dea1ed85 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -101,11 +101,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m } interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { - errs := m.deleteMessages([]*sqs.Message{message}) - if errs != nil { - return errs[0] - } - return nil + return m.deleteMessage(message) } interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { @@ -119,10 +115,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } -func (m SQSMonitor) logAndDeleteLifecycle(lifecycleDetail *LifecycleDetail, message *sqs.Message) error { - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) +func (m SQSMonitor) deleteMessage(message *sqs.Message) error { errs := m.deleteMessages([]*sqs.Message{message}) if errs != nil { return errs[0] @@ -142,7 +135,7 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* } // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { +func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { @@ -158,12 +151,14 @@ func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent, message } _, err = m.continueLifecycleAction(lifecycleDetail) - if err != nil { - return ignore{skip{fmt.Errorf("completing ASG launch lifecyle: %w", err)}} + return ignore{skip{fmt.Errorf("continuing ASG launch lifecyle: %w", err)}} } - err = m.logAndDeleteLifecycle(lifecycleDetail, message) + log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", + lifecycleDetail.LifecycleHookName, + lifecycleDetail.EC2InstanceID) + err = m.deleteMessage(message) return err } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 7e7691d6..9805c701 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -60,6 +60,7 @@ type InterruptionEventWrapper struct { Err error } +// Used to skip processing an error, but acknowledge an error occured during a termination event type skip struct { err error } @@ -72,6 +73,7 @@ func (s skip) Unwrap() error { return s.err } +// Used to completely ignore an error. 
Used when processing a non-terminating event type ignore struct { err error } @@ -148,6 +150,10 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(*message.Body), &lifecycleEventMessage) + if err != nil { + log.Err(err).Msg("processing JSON message of lifecycle event from ASG") + return eventBridgeEvent, err + } err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) switch { @@ -182,13 +188,14 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} interruptionEvent := &monitor.InterruptionEvent{} - lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) + var err error switch eventBridgeEvent.Source { case "aws.autoscaling": + lifecycleEvent := LifecycleDetail{} + err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - err = m.asgCompleteLaunchLifecycle(eventBridgeEvent, message) + err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) interruptionEvent = nil } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 95d80a6f..2452a299 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -338,12 +338,11 @@ function start_FIS_experiment { ##### TESTING ##### function is_new_instance { - is_new_instance="" + is_new_instance=true if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then is_new_instance=false - else - is_new_instance="" fi + echo $is_new_instance } function get_launch_activity { @@ -358,8 +357,7 @@ function get_launch_activity { description=$(jq -r .[$((++i))] <<< $activities_details) status=$(jq -r .[$((i+=2))] <<< $activities_details) activity_instance=${description##*:} - is_new_instance $activity_instance - if [[ $description =~ .*"Launching".* && -z $is_new_instance ]]; then + if [[ $description =~ .*"Launching".* && is_new_instance $activity_instance ]]; then launch_activity=$id echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" break diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test index 9d6a7f77..beebf0dd 100755 --- a/test/eks-cluster-test/run-test +++ b/test/eks-cluster-test/run-test @@ -194,6 +194,7 @@ function reset_cluster { if [[ -z ${assertion_scripts+x} ]]; then assertion_scripts=( + "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" "$SCRIPTPATH/../e2e/cordon-only-test" "$SCRIPTPATH/../e2e/imds-v2-test" "$SCRIPTPATH/../e2e/maintenance-event-cancellation-test" diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index b21b2c64..4951805c 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -24,6 +24,10 @@ WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local" ASSERTION_SCRIPTS=$(find "$SCRIPTPATH/../e2e" -type f | sort) +SCRIPT_BLACKLIST=( + "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" +) + 
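The skip and ignore wrappers documented in this patch are plain error types; what the monitor ultimately does with a queue message depends on which wrapper errors.As can extract from the returned error. The sketch below shows that dispatch in isolation: handleResult is a hypothetical helper standing in for the bookkeeping done in processInterruptionEvents, and the wrapper definitions simply mirror the ones in sqs-monitor.go.

// Sketch: distinguishing skip and ignore wrappers with errors.As.
package sqssketch

import (
	"errors"
	"fmt"
)

type skip struct{ err error }

func (s skip) Error() string { return s.err.Error() }
func (s skip) Unwrap() error { return s.err }

type ignore struct{ err error }

func (i ignore) Error() string { return i.err.Error() }
func (i ignore) Unwrap() error { return i.err }

// handleResult (hypothetical) decides what to do with a processed message:
// ignore means a non-terminating event that needs no action, skip means the
// message should be dropped, and anything else is a processing failure.
func handleResult(err error) string {
	var ig ignore
	var sk skip
	switch {
	case err == nil:
		return "processed"
	case errors.As(err, &ig): // check ignore first: errors may be wrapped as ignore{skip{...}}
		return "ignored (non-terminating event)"
	case errors.As(err, &sk):
		return "dropped"
	default:
		return fmt.Sprintf("failed: %v", err)
	}
}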
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } function relpath() { @@ -271,8 +275,18 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ ## Mark worker2 only for Critical Add-Ons like dns kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite +function is_blacklisted { + is_blacklisted=false + if [[ $SCRIPT_BLACKLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + is_blacklisted=true + fi + return $is_blacklisted +} + i=0 for assert_script in $ASSERTION_SCRIPTS; do + if[[ is_blacklisted $assert_script ]]; then continue; fi + reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") IMDS_PORT=$((i + 1338)) From 6838bcc25b20933101275a7d514ae9bdb9176488 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 21 Nov 2023 13:38:23 -0600 Subject: [PATCH 10/27] Refactored the unmarshalling of the SQS message --- pkg/monitor/sqsevent/sqs-monitor.go | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 9805c701..37a7e6ef 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -144,17 +144,26 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } -// processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent -func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { - eventBridgeEvent := EventBridgeEvent{} +func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(*message.Body), &lifecycleEventMessage) + err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) if err != nil { - log.Err(err).Msg("processing JSON message of lifecycle event from ASG") - return eventBridgeEvent, err + // log.Err(err).Msg("processing JSON message of lifecycle event from ASG") + return lifecycleEvent, err + } + if lifecycleEventMessage.Message != nil { + err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) + } else { + err = json.Unmarshal([]byte(fmt.Sprintf("%v", *messageBody)), &lifecycleEvent) } - err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) + return lifecycleEvent, err +} + +// processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent +func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + eventBridgeEvent := EventBridgeEvent{} + lifecycleEvent, err := messageToLifecycleEvent(message.Body) switch { case err != nil: From 3ee0880cee76f8b18eafee4f53a8515f2b2826e4 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 21 Nov 2023 15:18:01 -0600 Subject: [PATCH 11/27] Refactor the creation and usage of the K8s client --- cmd/node-termination-handler.go | 16 ++++++++++++++-- pkg/monitor/sqsevent/asg-lifecycle-event.go | 18 ++++-------------- pkg/monitor/sqsevent/sqs-monitor.go | 2 ++ pkg/node/node.go | 17 +++-------------- pkg/node/node_test.go | 10 ++++++++-- pkg/observability/k8s-events.go | 13 +------------ 6 files changed, 32 insertions(+), 44 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 9f7dcaf1..d4a491bc 100644 --- a/cmd/node-termination-handler.go +++ 
b/cmd/node-termination-handler.go @@ -45,6 +45,8 @@ import ( "github.com/rs/zerolog/log" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" ) const ( @@ -97,7 +99,16 @@ func main() { nthConfig.Print() log.Fatal().Err(err).Msg("Webhook validation failed,") } - node, err := node.New(nthConfig) + + clusterConfig, err := rest.InClusterConfig() + if err != nil { + log.Fatal().Err(err).Msgf("retreiving cluster config: %v", err) + } + clientset, err := kubernetes.NewForConfig(clusterConfig) + if err != nil { + log.Fatal().Err(err).Msgf("creating new clientset with config: %v", err) + } + node, err := node.New(nthConfig, clientset) if err != nil { nthConfig.Print() log.Fatal().Err(err).Msg("Unable to instantiate a node for various kubernetes node functions,") @@ -137,7 +148,7 @@ func main() { log.Fatal().Msgf("Unable to find the AWS region to process queue events.") } - recorder, err := observability.InitK8sEventRecorder(nthConfig.EmitKubernetesEvents, nthConfig.NodeName, nthConfig.EnableSQSTerminationDraining, nodeMetadata, nthConfig.KubernetesEventsExtraAnnotations) + recorder, err := observability.InitK8sEventRecorder(nthConfig.EmitKubernetesEvents, nthConfig.NodeName, nthConfig.EnableSQSTerminationDraining, nodeMetadata, nthConfig.KubernetesEventsExtraAnnotations, clientset) if err != nil { nthConfig.Print() log.Fatal().Err(err).Msg("Unable to create Kubernetes event recorder,") @@ -204,6 +215,7 @@ func main() { ASG: autoscaling.New(sess), EC2: ec2.New(sess), BeforeCompleteLifecycleAction: func() { <-time.After(completeLifecycleActionDelay) }, + K8sClientset: clientset, } monitoringFns[sqsEvents] = sqsMonitor } diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index dea1ed85..50493bf7 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -28,7 +28,6 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" ) /* Example SQS ASG Lifecycle Termination Event Message: @@ -146,7 +145,7 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message return ignore{skip{fmt.Errorf("message is an ASG test notification")}} } - if !isNodeReady(lifecycleDetail) { + if !isNodeReady(lifecycleDetail, m.K8sClientset) { return ignore{skip{fmt.Errorf("new ASG instance has not connected to cluster")}} } @@ -163,8 +162,8 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message } // If the Node, new EC2 instance, is ready in the K8s cluster -func isNodeReady(lifecycleDetail *LifecycleDetail) bool { - nodes, err := getNodes() +func isNodeReady(lifecycleDetail *LifecycleDetail, clientset *kubernetes.Clientset) bool { + nodes, err := getNodes(clientset) if err != nil { log.Err(fmt.Errorf("getting nodes from cluster: %w", err)) return false @@ -189,16 +188,7 @@ func isNodeReady(lifecycleDetail *LifecycleDetail) bool { } // Gets Nodes connected to K8s cluster -func getNodes() (*v1.NodeList, error) { - clusterConfig, err := rest.InClusterConfig() - if err != nil { - return nil, fmt.Errorf("retreiving cluster config: %w", err) - } - // creates the clientset - clientset, err := kubernetes.NewForConfig(clusterConfig) - if err != nil { - return nil, fmt.Errorf("creating new clientset with config: %w", err) - } +func getNodes(clientset *kubernetes.Clientset) (*v1.NodeList, error) { nodes, err := 
clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 37a7e6ef..8bdd2a0a 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -28,6 +28,7 @@ import ( "github.com/aws/aws-sdk-go/service/ec2/ec2iface" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/sqs/sqsiface" + "k8s.io/client-go/kubernetes" "github.com/rs/zerolog/log" @@ -52,6 +53,7 @@ type SQSMonitor struct { CheckIfManaged bool ManagedTag string BeforeCompleteLifecycleAction func() + K8sClientset *kubernetes.Clientset } // InterruptionEventWrapper is a convenience wrapper for associating an interruption event with its error, if any diff --git a/pkg/node/node.go b/pkg/node/node.go index ffd04bb5..2b62e768 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -31,7 +31,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" "k8s.io/kubectl/pkg/drain" ) @@ -84,8 +83,8 @@ type Node struct { } // New will construct a node struct to perform various node function through the kubernetes api server -func New(nthConfig config.Config) (*Node, error) { - drainHelper, err := getDrainHelper(nthConfig) +func New(nthConfig config.Config, clientset *kubernetes.Clientset) (*Node, error) { + drainHelper, err := getDrainHelper(nthConfig, clientset) if err != nil { return nil, err } @@ -634,7 +633,7 @@ func (n Node) fetchAllPods(nodeName string) (*corev1.PodList, error) { }) } -func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { +func getDrainHelper(nthConfig config.Config, clientset *kubernetes.Clientset) (*drain.Helper, error) { drainHelper := &drain.Helper{ Ctx: context.TODO(), Client: &kubernetes.Clientset{}, @@ -652,17 +651,7 @@ func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { return drainHelper, nil } - clusterConfig, err := rest.InClusterConfig() - if err != nil { - return nil, err - } - // creates the clientset - clientset, err := kubernetes.NewForConfig(clusterConfig) - if err != nil { - return nil, err - } drainHelper.Client = clientset - return drainHelper, nil } diff --git a/pkg/node/node_test.go b/pkg/node/node_test.go index 93496872..e9837e6c 100644 --- a/pkg/node/node_test.go +++ b/pkg/node/node_test.go @@ -63,8 +63,13 @@ func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node { return tNode } +func getNewNode(nthConfig config.Config, client *fake.Clientset) (*node.Node, error) { + drainHelper := getDrainHelper(client) + return node.NewWithValues(nthConfig, drainHelper, uptime.Uptime) +} + func TestDryRun(t *testing.T) { - tNode, err := node.New(config.Config{DryRun: true}) + tNode, err := getNewNode(config.Config{DryRun: true}, fake.NewSimpleClientset()) h.Ok(t, err) fakeRecorder := record.NewFakeRecorder(recorderBufferSize) @@ -103,7 +108,8 @@ func TestDryRun(t *testing.T) { } func TestNewFailure(t *testing.T) { - _, err := node.New(config.Config{}) + client := fake.NewSimpleClientset() + _, err := getNewNode(config.Config{}, client) h.Assert(t, true, "Failed to return error when creating new Node.", err != nil) } diff --git a/pkg/observability/k8s-events.go b/pkg/observability/k8s-events.go index a3da3778..6b7caf25 100644 --- a/pkg/observability/k8s-events.go +++ b/pkg/observability/k8s-events.go @@ -27,7 +27,6 @@ import ( "k8s.io/client-go/kubernetes" 
"k8s.io/client-go/kubernetes/scheme" typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" ) @@ -80,7 +79,7 @@ type K8sEventRecorder struct { } // InitK8sEventRecorder creates a Kubernetes event recorder -func InitK8sEventRecorder(enabled bool, nodeName string, sqsMode bool, nodeMetadata ec2metadata.NodeMetadata, extraAnnotationsStr string) (K8sEventRecorder, error) { +func InitK8sEventRecorder(enabled bool, nodeName string, sqsMode bool, nodeMetadata ec2metadata.NodeMetadata, extraAnnotationsStr string, clientSet *kubernetes.Clientset) (K8sEventRecorder, error) { if !enabled { return K8sEventRecorder{}, nil } @@ -107,16 +106,6 @@ func InitK8sEventRecorder(enabled bool, nodeName string, sqsMode bool, nodeMetad } } - config, err := rest.InClusterConfig() - if err != nil { - return K8sEventRecorder{}, err - } - - clientSet, err := kubernetes.NewForConfig(config) - if err != nil { - return K8sEventRecorder{}, err - } - broadcaster := record.NewBroadcaster() broadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: clientSet.CoreV1().Events("")}) From 470bc5fdef59a9846799f2278b35647a8f257d72 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 1 Dec 2023 10:04:57 -0600 Subject: [PATCH 12/27] Updated run-test with inclusive terminology --- test/k8s-local-cluster-test/run-test | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 4951805c..992e1fcc 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -23,8 +23,7 @@ AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local"} ASSERTION_SCRIPTS=$(find "$SCRIPTPATH/../e2e" -type f | sort) - -SCRIPT_BLACKLIST=( +SCRIPT_DENYLIST=( "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" ) @@ -275,17 +274,17 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ ## Mark worker2 only for Critical Add-Ons like dns kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite -function is_blacklisted { - is_blacklisted=false - if [[ $SCRIPT_BLACKLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_blacklisted=true +function is_denylisted { + is_denylisted=false + if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + is_denylisted=true fi - return $is_blacklisted + return $is_denylisted } i=0 for assert_script in $ASSERTION_SCRIPTS; do - if[[ is_blacklisted $assert_script ]]; then continue; fi + if[[ $(is_denylisted $assert_script) ]]; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From 96127141fccb91602b134ab3adcce6a61dcd40ae Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 1 Dec 2023 10:09:34 -0600 Subject: [PATCH 13/27] Fix boolean logic in bash scripts --- test/e2e/asg-launch-lifecycle-sqs-test | 10 ++++++---- test/k8s-local-cluster-test/run-test | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 2452a299..f92e0a3a 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -338,11 +338,10 @@ function start_FIS_experiment { ##### TESTING ##### function is_new_instance { - is_new_instance=true + is_new="true" if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - 
is_new_instance=false + is_new="false" fi - echo $is_new_instance } function get_launch_activity { @@ -357,7 +356,8 @@ function get_launch_activity { description=$(jq -r .[$((++i))] <<< $activities_details) status=$(jq -r .[$((i+=2))] <<< $activities_details) activity_instance=${description##*:} - if [[ $description =~ .*"Launching".* && is_new_instance $activity_instance ]]; then + is_new_instance $activity_instance + if [[ $description =~ .*"Launching".* && $is_new == "true" ]]; then launch_activity=$id echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" break @@ -395,6 +395,8 @@ function clean_up { echo "=====================================================================================================" echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" + pod_id=$(get_nth_worker_pod || :) + kubectl logs $pod_id --namespace kube-system || : echo "๐Ÿฅ‘ Uninstalling NTH helm chart" helm uninstall "$CLUSTER_NAME-acth" -n kube-system delete_node_group diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 992e1fcc..e9e5388b 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -275,16 +275,16 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite function is_denylisted { - is_denylisted=false + is_denied="false" if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_denylisted=true + is_denied="true" fi - return $is_denylisted } i=0 for assert_script in $ASSERTION_SCRIPTS; do - if[[ $(is_denylisted $assert_script) ]]; then continue; fi + is_denylisted $assert_script + if[[ $is_denied == "true" ]]; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From 7fbfe35545aaca5e3050791af52ff6dd15257788 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 1 Dec 2023 10:40:03 -0600 Subject: [PATCH 14/27] Refactored the processing of ASG Launch Lifecycle events as interruption events. 
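Before the hunks, the readiness check this refactor hinges on is easier to read in one piece. The sketch below is a simplified, self-contained version of the lookup added to cmd/node-termination-handler.go: the clientset is assumed to exist, the helper name isInstanceReady is illustrative, and the patch itself splits the same logic across isNodeReady, getNodesWithInstanceFromLabel, and getNodesWithInstanceFromProviderID.

// Sketch: find the node backing an EC2 instance (by eksctl label, then by
// ProviderID, format aws:///<az>/<instance-id>) and report whether it is Ready.
package nodereadiness

import (
	"context"
	"fmt"
	"strings"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func isInstanceReady(ctx context.Context, clientset kubernetes.Interface, instanceID string) (bool, error) {
	// Cheapest lookup first: the eksctl-managed instance-id label.
	byLabel, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{
		LabelSelector: fmt.Sprintf("alpha.eksctl.io/instance-id=%s", instanceID),
	})
	if err != nil {
		return false, err
	}
	candidates := byLabel.Items
	if len(candidates) == 0 {
		// Fall back to scanning every node's ProviderID for the instance ID.
		all, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return false, err
		}
		for _, n := range all.Items {
			if strings.Contains(n.Spec.ProviderID, instanceID) {
				candidates = append(candidates, n)
			}
		}
	}
	for _, n := range candidates {
		for _, cond := range n.Status.Conditions {
			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
				return true, nil
			}
		}
	}
	return false, nil
}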
--- cmd/node-termination-handler.go | 139 ++++++++++++++++++-- pkg/monitor/sqsevent/asg-lifecycle-event.go | 82 ++++-------- pkg/monitor/sqsevent/sqs-monitor.go | 5 +- pkg/monitor/types.go | 2 + 4 files changed, 154 insertions(+), 74 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index d4a491bc..1fdfd02f 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -43,7 +43,11 @@ import ( "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog" "github.com/rs/zerolog/log" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -215,7 +219,6 @@ func main() { ASG: autoscaling.New(sess), EC2: ec2.New(sess), BeforeCompleteLifecycleAction: func() { <-time.After(completeLifecycleActionDelay) }, - K8sClientset: clientset, } monitoringFns[sqsEvents] = sqsMonitor } @@ -269,7 +272,7 @@ func main() { event.InProgress = true wg.Add(1) recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description) - go drainOrCordonIfNecessary(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, &wg) + go processInterruptionEventFunctions(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) default: log.Warn().Msg("all workers busy, waiting") break EventLoop @@ -341,21 +344,39 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } -func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, wg *sync.WaitGroup) { +func processInterruptionEventFunctions(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { defer wg.Done() - nodeFound := true - nodeName := drainEvent.NodeName + processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) + drainOrCordonIfNecessary(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder) + <-interruptionEventStore.Workers +} - if nthConfig.UseProviderId { - newNodeName, err := node.GetNodeNameFromProviderID(drainEvent.ProviderID) +func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) { + if drainEvent.Kind != monitor.ASGLaunchLifecycleKind { + return + } - if err != nil { - log.Err(err).Msgf("Unable to get node name for node with ProviderID '%s' using original AWS event node name ", drainEvent.ProviderID) - } else { - nodeName = newNodeName - } + if !isNodeReady(drainEvent.InstanceID, clientset) { + log.Error().Msgf("new ASG instance, %s, has not connected to cluster", drainEvent.InstanceID) + 
interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + return } + nodeName := getNodeName(drainEvent, node, nthConfig) + + if drainEvent.PostDrainTask != nil { + runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) + } +} + +func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { + if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { + return + } + + nodeFound := true + nodeName := getNodeName(drainEvent, node, nthConfig) + nodeLabels, err := node.GetNodeLabels(nodeName) if err != nil { log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) @@ -395,7 +416,99 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto if (err == nil || (!nodeFound && nthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) } - <-interruptionEventStore.Workers +} + +func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config) string { + nodeName := drainEvent.NodeName + if nthConfig.UseProviderId { + newNodeName, err := node.GetNodeNameFromProviderID(drainEvent.ProviderID) + + if err != nil { + log.Err(err).Msgf("Unable to get node name for node with ProviderID '%s' using original AWS event node name ", drainEvent.ProviderID) + } else { + nodeName = newNodeName + } + } + return nodeName +} + +func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { + nodes, err := getNodesWithInstanceID(instanceID, clientset) + if err != nil { + log.Err(fmt.Errorf("getting nodes with instance ID: %w", err)) + return false + } + + if len(nodes) == 0 { + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", instanceID)) + return false + } + + for _, node := range nodes { + conditions := node.Status.Conditions + for _, condition := range conditions { + if condition.Type != "Ready" { + continue + } + if condition.Status != "True" { + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) + return false + } + } + } + log.Info().Msgf("new ASG instance, %s, is found and ready in cluster", instanceID) + return true +} + +// Gets Nodes connected to K8s cluster +func getNodesWithInstanceID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { + nodes, err := getNodesWithInstanceFromLabel(instanceID, clientset) + if err != nil { + return nil, err + } + if len(nodes) != 0 { + return nodes, nil + } + + nodes, err = getNodesWithInstanceFromProviderID(instanceID, clientset) + if err != nil { + return nil, err + } + return nodes, nil +} + +func getNodesWithInstanceFromLabel(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { + instanceIDReq, err := labels.NewRequirement("alpha.eksctl.io/instance-id", selection.Equals, []string{instanceID}) + if err != nil { + return nil, fmt.Errorf("bad label requirement: %w", err) + } + selector := labels.NewSelector().Add(*instanceIDReq) + options := metav1.ListOptions{LabelSelector: selector.String()} + return getNodes(options, clientset) +} + +func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { + nodes, err := getNodes(metav1.ListOptions{}, clientset) + if err != nil { + return nil, err + } + + var filteredNodes []v1.Node + for 
_, node := range nodes { + if !strings.Contains(node.Spec.ProviderID, instanceID) { + continue + } + filteredNodes = append(filteredNodes, node) + } + return filteredNodes, nil +} + +func getNodes(options metav1.ListOptions, clientset *kubernetes.Clientset) ([]v1.Node, error) { + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), options) + if err != nil { + return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) + } + return nodes.Items, err } func runPreDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 50493bf7..d232cf07 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -14,10 +14,8 @@ package sqsevent import ( - "context" "encoding/json" "fmt" - "strings" "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-node-termination-handler/pkg/node" @@ -25,9 +23,6 @@ import ( "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog/log" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" ) /* Example SQS ASG Lifecycle Termination Event Message: @@ -134,72 +129,45 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* } // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { +func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) + return nil, fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { - return ignore{skip{fmt.Errorf("message is an ASG test notification")}} + return nil, ignore{skip{fmt.Errorf("message is an ASG test notification")}} } - if !isNodeReady(lifecycleDetail, m.K8sClientset) { - return ignore{skip{fmt.Errorf("new ASG instance has not connected to cluster")}} - } - - _, err = m.continueLifecycleAction(lifecycleDetail) + nodeInfo, err := m.getNodeInfo(lifecycleDetail.EC2InstanceID) if err != nil { - return ignore{skip{fmt.Errorf("continuing ASG launch lifecyle: %w", err)}} + return nil, err } - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) - err = m.deleteMessage(message) - return err -} - -// If the Node, new EC2 instance, is ready in the K8s cluster -func isNodeReady(lifecycleDetail *LifecycleDetail, clientset *kubernetes.Clientset) bool { - nodes, err := getNodes(clientset) - if err != nil { - log.Err(fmt.Errorf("getting nodes from cluster: %w", err)) - return false + interruptionEvent := monitor.InterruptionEvent{ + EventID: fmt.Sprintf("asg-lifecycle-term-%x", event.ID), + Kind: monitor.ASGLaunchLifecycleKind, + Monitor: SQSMonitorKind, + AutoScalingGroupName: lifecycleDetail.AutoScalingGroupName, + StartTime: event.getTime(), + NodeName: nodeInfo.Name, + IsManaged: nodeInfo.IsManaged, + InstanceID: 
lifecycleDetail.EC2InstanceID, + ProviderID: nodeInfo.ProviderID, + Description: fmt.Sprintf("ASG Lifecycle Launch event received. Instance will be interrupted at %s \n", event.getTime()), } - for _, node := range nodes.Items { - instanceID := getInstanceID(node) - if instanceID != lifecycleDetail.EC2InstanceID { - continue - } - - conditions := node.Status.Conditions - for _, condition := range conditions { - if condition.Type == "Ready" && condition.Status == "True" { - return true - } + interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { + _, err = m.continueLifecycleAction(lifecycleDetail) + if err != nil { + return fmt.Errorf("continuing ASG launch lifecyle: %w", err) } - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) - } - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", lifecycleDetail.EC2InstanceID)) - return false -} - -// Gets Nodes connected to K8s cluster -func getNodes(clientset *kubernetes.Clientset) (*v1.NodeList, error) { - nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) + log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", + lifecycleDetail.LifecycleHookName, + lifecycleDetail.EC2InstanceID) + return m.deleteMessage(message) } - return nodes, err -} -// Gets EC2 InstanceID from ProviderID, format: aws:///$az/$instanceid -func getInstanceID(node v1.Node) string { - providerID := node.Spec.ProviderID - providerIDSplit := strings.Split(providerID, "/") - instanceID := providerIDSplit[len(providerIDSplit)-1] - return instanceID + return &interruptionEvent, err } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 8bdd2a0a..e51ede20 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -28,7 +28,6 @@ import ( "github.com/aws/aws-sdk-go/service/ec2/ec2iface" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/sqs/sqsiface" - "k8s.io/client-go/kubernetes" "github.com/rs/zerolog/log" @@ -53,7 +52,6 @@ type SQSMonitor struct { CheckIfManaged bool ManagedTag string BeforeCompleteLifecycleAction func() - K8sClientset *kubernetes.Clientset } // InterruptionEventWrapper is a convenience wrapper for associating an interruption event with its error, if any @@ -206,8 +204,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) - interruptionEvent = nil + interruptionEvent, err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } diff --git a/pkg/monitor/types.go b/pkg/monitor/types.go index c3c587d2..93d56625 100644 --- a/pkg/monitor/types.go +++ b/pkg/monitor/types.go @@ -31,6 +31,8 @@ const ( StateChangeKind = "STATE_CHANGE" // ASGLifecycleKind is a const to define an ASG Lifecycle kind of interruption event ASGLifecycleKind = "ASG_LIFECYCLE" + // ASGLifecycleKind is a const to define an ASG Launch Lifecycle kind of interruption event + ASGLaunchLifecycleKind = 
"ASG_LAUNCH_LIFECYCLE" // SQSTerminateKind is a const to define an SQS termination kind of interruption event SQSTerminateKind = "SQS_TERMINATE" ) From fee8077469e275339fea3c0f8e1667b92b972a60 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 5 Dec 2023 15:31:17 -0600 Subject: [PATCH 15/27] Revise log messages and formatting --- cmd/node-termination-handler.go | 15 ++++++++++----- pkg/monitor/sqsevent/asg-lifecycle-event.go | 13 ++++++++----- pkg/monitor/sqsevent/sqs-monitor.go | 17 +++++++++-------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 1fdfd02f..9057d6db 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -106,7 +106,7 @@ func main() { clusterConfig, err := rest.InClusterConfig() if err != nil { - log.Fatal().Err(err).Msgf("retreiving cluster config: %v", err) + log.Fatal().Err(err).Msgf("retreiving cluster config") } clientset, err := kubernetes.NewForConfig(clusterConfig) if err != nil { @@ -344,6 +344,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } +// TODO rename to processInterruptionEvent RENAME func processInterruptionEventFunctions(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { defer wg.Done() processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) @@ -351,17 +352,19 @@ func processInterruptionEventFunctions(interruptionEventStore *interruptionevent <-interruptionEventStore.Workers } +// TODO move function and helpers to new package pkg/interruptioneventhandler/asg/launch func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) { if drainEvent.Kind != monitor.ASGLaunchLifecycleKind { return } if !isNodeReady(drainEvent.InstanceID, clientset) { - log.Error().Msgf("new ASG instance, %s, has not connected to cluster", drainEvent.InstanceID) + log.Error().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) return } + log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") nodeName := getNodeName(drainEvent, node, nthConfig) if drainEvent.PostDrainTask != nil { @@ -370,6 +373,7 @@ func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventsto } func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { + //TODO Use allow list instead of denylist LOGIC if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { return } @@ -418,6 +422,7 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto } } +// TODO Restructure indentation LOGIC func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config) 
string { nodeName := drainEvent.NodeName if nthConfig.UseProviderId { @@ -432,6 +437,7 @@ func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfi return nodeName } +// TODO make method return error func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { nodes, err := getNodesWithInstanceID(instanceID, clientset) if err != nil { @@ -440,23 +446,21 @@ func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { } if len(nodes) == 0 { - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", instanceID)) return false } for _, node := range nodes { conditions := node.Status.Conditions for _, condition := range conditions { + //TODO combine if statements LOGIC if condition.Type != "Ready" { continue } if condition.Status != "True" { - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) return false } } } - log.Info().Msgf("new ASG instance, %s, is found and ready in cluster", instanceID) return true } @@ -503,6 +507,7 @@ func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes return filteredNodes, nil } +// TODO Remove method func getNodes(options metav1.ListOptions, clientset *kubernetes.Clientset) ([]v1.Node, error) { nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), options) if err != nil { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index d232cf07..2e53bd39 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -128,12 +128,17 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* }) } +// TODO Rename to createAsgInstanceLaunchEvent RENAME // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { + if message == nil || event == nil { + return nil, fmt.Errorf("event message is nil for ASG Instance Launch Event creation") + } + lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return nil, fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) + return nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { @@ -161,11 +166,9 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { _, err = m.continueLifecycleAction(lifecycleDetail) if err != nil { - return fmt.Errorf("continuing ASG launch lifecyle: %w", err) + return fmt.Errorf("continuing ASG launch lifecycle: %w", err) } - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) + log.Info().Str("lifecycleHookName", lifecycleDetail.LifecycleHookName).Str("instanceID", lifecycleDetail.EC2InstanceID).Msg("Completed ASG Lifecycle Hook") return m.deleteMessage(message) } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index e51ede20..fef06113 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -73,6 +73,7 @@ func (s skip) Unwrap() error { return s.err } +// TODO REMOVE // Used to 
completely ignore an error. Used when processing a non-terminating event type ignore struct { err error @@ -144,14 +145,15 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } +// TODO Rename to parseLifecycleEvent, rename messageBody to message RENAME func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) if err != nil { - // log.Err(err).Msg("processing JSON message of lifecycle event from ASG") return lifecycleEvent, err } + //TODO add comment about why Sprintf is needed COMMENT if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { @@ -162,13 +164,14 @@ func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} + //TODO nil-check the pointer and pass in a string instead of a pointer LOGIC lifecycleEvent, err := messageToLifecycleEvent(message.Body) switch { case err != nil: - log.Err(err).Msg("only lifecycle events from ASG to SQS are supported outside EventBridge") - return eventBridgeEvent, err + return eventBridgeEvent, fmt.Errorf("parsing lifecycle event messsage from ASG: %w", err) case lifecycleEvent.Event == TEST_NOTIFICATION || lifecycleEvent.LifecycleTransition == TEST_NOTIFICATION: err := fmt.Errorf("message is a test notification") @@ -179,17 +182,13 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": - log.Err(err).Msg("only lifecycle termination events from ASG to SQS are supported outside EventBridge") - err = fmt.Errorf("unsupported message type (%s)", message.String()) - return eventBridgeEvent, err + return eventBridgeEvent, fmt.Errorf("unsupported message type (%s) while parsing lifecycle event messsage from ASG", message.String()) } eventBridgeEvent.Source = "aws.autoscaling" eventBridgeEvent.Time = lifecycleEvent.Time eventBridgeEvent.ID = lifecycleEvent.RequestID eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) - - log.Debug().Msg("processing lifecycle event from ASG") return eventBridgeEvent, err } @@ -200,9 +199,11 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, var err error switch eventBridgeEvent.Source { + //TODO add comment for other cases case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) + //TODO handle err != nil if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { interruptionEvent, err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { From 45869afd0d5ec500914ab73b8a1643a923fa5399 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 5 Dec 2023 16:01:33 -0600 Subject: [PATCH 16/27] Removed ignore errors and getNodes method. 
Changed method names --- cmd/node-termination-handler.go | 34 +++++++++------------ pkg/monitor/sqsevent/asg-lifecycle-event.go | 5 ++- pkg/monitor/sqsevent/sqs-monitor.go | 33 +++++--------------- 3 files changed, 23 insertions(+), 49 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 9057d6db..09c7e7d1 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -272,7 +272,7 @@ func main() { event.InProgress = true wg.Add(1) recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description) - go processInterruptionEventFunctions(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) + go processInterruptionEvent(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) default: log.Warn().Msg("all workers busy, waiting") break EventLoop @@ -344,8 +344,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } -// TODO rename to processInterruptionEvent RENAME -func processInterruptionEventFunctions(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { +func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { defer wg.Done() processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) drainOrCordonIfNecessary(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder) @@ -373,7 +372,7 @@ func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventsto } func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - //TODO Use allow list instead of denylist LOGIC + // TODO Use allow list instead of denylist LOGIC if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { return } @@ -452,7 +451,7 @@ func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { for _, node := range nodes { conditions := node.Status.Conditions for _, condition := range conditions { - //TODO combine if statements LOGIC + // TODO combine if statements LOGIC if condition.Type != "Ready" { continue } @@ -482,23 +481,27 @@ func getNodesWithInstanceID(instanceID string, clientset *kubernetes.Clientset) } func getNodesWithInstanceFromLabel(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - instanceIDReq, err := labels.NewRequirement("alpha.eksctl.io/instance-id", selection.Equals, []string{instanceID}) + instanceIDLabel := "alpha.eksctl.io/instance-id" + instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) if err != nil { return nil, fmt.Errorf("bad label requirement: %w", err) } selector := labels.NewSelector().Add(*instanceIDReq) - options := 
metav1.ListOptions{LabelSelector: selector.String()} - return getNodes(options, clientset) + nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) + } + return nodeList.Items, nil } func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - nodes, err := getNodes(metav1.ListOptions{}, clientset) + nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { - return nil, err + return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) } var filteredNodes []v1.Node - for _, node := range nodes { + for _, node := range nodeList.Items { if !strings.Contains(node.Spec.ProviderID, instanceID) { continue } @@ -507,15 +510,6 @@ func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes return filteredNodes, nil } -// TODO Remove method -func getNodes(options metav1.ListOptions, clientset *kubernetes.Clientset) ([]v1.Node, error) { - nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), options) - if err != nil { - return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) - } - return nodes.Items, err -} - func runPreDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { err := drainEvent.PreDrainTask(*drainEvent, node) if err != nil { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 2e53bd39..0db51eab 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -128,9 +128,8 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* }) } -// TODO Rename to createAsgInstanceLaunchEvent RENAME // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { +func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { if message == nil || event == nil { return nil, fmt.Errorf("event message is nil for ASG Instance Launch Event creation") } @@ -142,7 +141,7 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { - return nil, ignore{skip{fmt.Errorf("message is an ASG test notification")}} + return nil, skip{fmt.Errorf("message is an ASG test notification")} } nodeInfo, err := m.getNodeInfo(lifecycleDetail.EC2InstanceID) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index fef06113..d62fde76 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -73,20 +73,6 @@ func (s skip) Unwrap() error { return s.err } -// TODO REMOVE -// Used to completely ignore an error. 
Used when processing a non-terminating event -type ignore struct { - err error -} - -func (i ignore) Error() string { - return i.err.Error() -} - -func (i ignore) Unwrap() error { - return i.err -} - // Kind denotes the kind of monitor func (m SQSMonitor) Kind() string { return SQSMonitorKind @@ -145,15 +131,14 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } -// TODO Rename to parseLifecycleEvent, rename messageBody to message RENAME -func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { +func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) if err != nil { return lifecycleEvent, err } - //TODO add comment about why Sprintf is needed COMMENT + // TODO add comment about why Sprintf is needed COMMENT if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { @@ -166,8 +151,8 @@ func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} - //TODO nil-check the pointer and pass in a string instead of a pointer LOGIC - lifecycleEvent, err := messageToLifecycleEvent(message.Body) + // TODO nil-check the pointer and pass in a string instead of a pointer LOGIC + lifecycleEvent, err := parseLifecycleEvent(message.Body) switch { case err != nil: @@ -199,13 +184,13 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, var err error switch eventBridgeEvent.Source { - //TODO add comment for other cases + // TODO add comment for other cases case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) - //TODO handle err != nil + // TODO handle err != nil if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - interruptionEvent, err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) + interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } @@ -236,13 +221,9 @@ func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []Interr dropMessageSuggestionCount := 0 failedInterruptionEventsCount := 0 var skipErr skip - var ignoreErr ignore for _, eventWrapper := range interruptionEventWrappers { switch { - case errors.As(eventWrapper.Err, &ignoreErr): - log.Warn().Err(ignoreErr).Msg("ASG launch cycle not continued") - case errors.As(eventWrapper.Err, &skipErr): log.Warn().Err(skipErr).Msg("dropping event") dropMessageSuggestionCount++ From 10f0f34d075268a811e177b483a2208153a3db77 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 5 Dec 2023 16:43:33 -0600 Subject: [PATCH 17/27] Refactor error handling, and add helpful comments --- cmd/node-termination-handler.go | 17 ++++++++--------- pkg/monitor/sqsevent/asg-lifecycle-event.go | 2 +- pkg/monitor/sqsevent/sqs-monitor.go | 17 +++++++++++++---- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go 
index 09c7e7d1..9619aad3 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -357,8 +357,9 @@ func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventsto return } - if !isNodeReady(drainEvent.InstanceID, clientset) { - log.Error().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") + isNodeReady, err := isNodeReady(drainEvent.InstanceID, clientset) + if err != nil || !isNodeReady { + log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) return } @@ -436,16 +437,14 @@ func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfi return nodeName } -// TODO make method return error -func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { +func isNodeReady(instanceID string, clientset *kubernetes.Clientset) (bool, error) { nodes, err := getNodesWithInstanceID(instanceID, clientset) if err != nil { - log.Err(fmt.Errorf("getting nodes with instance ID: %w", err)) - return false + return false, fmt.Errorf("getting nodes with instance ID: %w", err) } if len(nodes) == 0 { - return false + return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) } for _, node := range nodes { @@ -456,11 +455,11 @@ func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { continue } if condition.Status != "True" { - return false + return false, fmt.Errorf("ec2 instance, %s, found, but not ready in cluster", instanceID) } } } - return true + return true, nil } // Gets Nodes connected to K8s cluster diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 0db51eab..3a9dd390 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -137,7 +137,7 @@ func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, messag lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) + return nil, fmt.Errorf("unmarshaling message, %s, from ASG launch lifecycle event: %w", *message.MessageId, err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index d62fde76..d155ecf3 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -138,7 +138,7 @@ func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { if err != nil { return lifecycleEvent, err } - // TODO add comment about why Sprintf is needed COMMENT + // Converts escaped JSON object to string, to lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { @@ -151,7 +151,10 @@ func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} - // TODO nil-check the pointer and pass in a string instead of a pointer LOGIC + + if message == nil { + return eventBridgeEvent, fmt.Errorf("ASG event message is nil") + } 
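	// Illustrative example (field values are placeholders): the queue body handled by
	// parseLifecycleEvent below can arrive in two shapes -- the raw lifecycle JSON, or an
	// envelope whose "Message" field carries the same JSON as an escaped string, e.g. when
	// the notification is relayed through an SNS topic:
	//   {"Message": "{\"LifecycleTransition\":\"autoscaling:EC2_INSTANCE_LAUNCHING\",\"EC2InstanceId\":\"i-0123456789abcdef0\", ...}"}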
lifecycleEvent, err := parseLifecycleEvent(message.Body) switch { @@ -183,12 +186,18 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, interruptionEvent := &monitor.InterruptionEvent{} var err error + if message == nil || eventBridgeEvent == nil { + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("event message is nil")}) + } + switch eventBridgeEvent.Source { - // TODO add comment for other cases + // LifecycleTransitions other than LAUNCHING or TERMINATING will result in the interruptionEvent being uninitialized case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) - // TODO handle err != nil + if err != nil { + interruptionEvent, err = nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) + } if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { From 1ab824bf73f6a2683b0b9d40376a1967781e09e9 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 7 Dec 2023 10:46:41 -0600 Subject: [PATCH 18/27] Refactored interruption event handling into a seperate package with distinct handlers for different interruption event Kinds --- cmd/node-termination-handler.go | 240 +----------------- pkg/interruptionevent/asg/launch/handler.go | 144 +++++++++++ pkg/interruptionevent/draincordon/handler.go | 139 ++++++++++ .../internal/common/handler.go | 76 ++++++ pkg/monitor/sqsevent/sqs-monitor.go | 8 +- 5 files changed, 377 insertions(+), 230 deletions(-) create mode 100644 pkg/interruptionevent/asg/launch/handler.go create mode 100644 pkg/interruptionevent/draincordon/handler.go create mode 100644 pkg/interruptionevent/internal/common/handler.go diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 9619aad3..6fd0c745 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -25,6 +25,8 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/config" "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/asg/launch" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/draincordon" "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" "github.com/aws/aws-node-termination-handler/pkg/logging" "github.com/aws/aws-node-termination-handler/pkg/monitor" @@ -43,11 +45,6 @@ import ( "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog" "github.com/rs/zerolog/log" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -62,6 +59,10 @@ const ( duplicateErrThreshold = 3 ) +type interruptionEventHandler interface { + HandleEvent(*monitor.InterruptionEvent) +} + func main() { // Zerolog uses json formatting by default, so change that to a human-readable format instead log.Logger = log.Output(logging.RoutingLevelWriter{ @@ -258,6 +259,9 @@ func main() { var wg sync.WaitGroup + asgLaunchHandler := launch.New(interruptionEventStore, *node, nthConfig, metrics, recorder, clientset) + drainCordonHander := draincordon.New(interruptionEventStore, 
*node, nthConfig, nodeMetadata, metrics, recorder) + for range time.NewTicker(1 * time.Second).C { select { case <-signalChan: @@ -272,7 +276,7 @@ func main() { event.InProgress = true wg.Add(1) recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description) - go processInterruptionEvent(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) + go processInterruptionEvent(interruptionEventStore, event, []interruptionEventHandler{asgLaunchHandler, drainCordonHander}, &wg) default: log.Warn().Msg("all workers busy, waiting") break EventLoop @@ -344,228 +348,12 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } -func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { +func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, event *monitor.InterruptionEvent, eventHandlers []interruptionEventHandler, wg *sync.WaitGroup) { defer wg.Done() - processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) - drainOrCordonIfNecessary(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder) - <-interruptionEventStore.Workers -} - -// TODO move function and helpers to new package pkg/interruptioneventhandler/asg/launch -func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) { - if drainEvent.Kind != monitor.ASGLaunchLifecycleKind { - return + for _, eventHandler := range eventHandlers { + eventHandler.HandleEvent(event) } - - isNodeReady, err := isNodeReady(drainEvent.InstanceID, clientset) - if err != nil || !isNodeReady { - log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") - interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return - } - - log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") - nodeName := getNodeName(drainEvent, node, nthConfig) - - if drainEvent.PostDrainTask != nil { - runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) - } -} - -func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - // TODO Use allow list instead of denylist LOGIC - if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { - return - } - - nodeFound := true - nodeName := getNodeName(drainEvent, node, nthConfig) - - nodeLabels, err := node.GetNodeLabels(nodeName) - if err != nil { - log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) - nodeFound = false - } - drainEvent.NodeLabels = nodeLabels - if drainEvent.PreDrainTask != nil { - runPreDrainTask(node, nodeName, drainEvent, metrics, recorder) - } - - podNameList, err := 
node.FetchPodNameList(nodeName) - if err != nil { - log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) - } - drainEvent.Pods = podNameList - err = node.LogPods(podNameList, nodeName) - if err != nil { - log.Err(err).Msg("There was a problem while trying to log all pod names on the node") - } - - if nthConfig.CordonOnly || (!nthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !nthConfig.EnableRebalanceDraining) { - err = cordonNode(node, nodeName, drainEvent, metrics, recorder) - } else { - err = cordonAndDrainNode(node, nodeName, drainEvent, metrics, recorder, nthConfig.EnableSQSTerminationDraining) - } - - if nthConfig.WebhookURL != "" { - webhook.Post(nodeMetadata, drainEvent, nthConfig) - } - - if err != nil { - interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - } else { - interruptionEventStore.MarkAllAsProcessed(nodeName) - } - - if (err == nil || (!nodeFound && nthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { - runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) - } -} - -// TODO Restructure indentation LOGIC -func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config) string { - nodeName := drainEvent.NodeName - if nthConfig.UseProviderId { - newNodeName, err := node.GetNodeNameFromProviderID(drainEvent.ProviderID) - - if err != nil { - log.Err(err).Msgf("Unable to get node name for node with ProviderID '%s' using original AWS event node name ", drainEvent.ProviderID) - } else { - nodeName = newNodeName - } - } - return nodeName -} - -func isNodeReady(instanceID string, clientset *kubernetes.Clientset) (bool, error) { - nodes, err := getNodesWithInstanceID(instanceID, clientset) - if err != nil { - return false, fmt.Errorf("getting nodes with instance ID: %w", err) - } - - if len(nodes) == 0 { - return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) - } - - for _, node := range nodes { - conditions := node.Status.Conditions - for _, condition := range conditions { - // TODO combine if statements LOGIC - if condition.Type != "Ready" { - continue - } - if condition.Status != "True" { - return false, fmt.Errorf("ec2 instance, %s, found, but not ready in cluster", instanceID) - } - } - } - return true, nil -} - -// Gets Nodes connected to K8s cluster -func getNodesWithInstanceID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - nodes, err := getNodesWithInstanceFromLabel(instanceID, clientset) - if err != nil { - return nil, err - } - if len(nodes) != 0 { - return nodes, nil - } - - nodes, err = getNodesWithInstanceFromProviderID(instanceID, clientset) - if err != nil { - return nil, err - } - return nodes, nil -} - -func getNodesWithInstanceFromLabel(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - instanceIDLabel := "alpha.eksctl.io/instance-id" - instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) - if err != nil { - return nil, fmt.Errorf("bad label requirement: %w", err) - } - selector := labels.NewSelector().Add(*instanceIDReq) - nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) - if err != nil { - return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) - } - return nodeList.Items, nil -} - -func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { 
- nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) - } - - var filteredNodes []v1.Node - for _, node := range nodeList.Items { - if !strings.Contains(node.Spec.ProviderID, instanceID) { - continue - } - filteredNodes = append(filteredNodes, node) - } - return filteredNodes, nil -} - -func runPreDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - err := drainEvent.PreDrainTask(*drainEvent, node) - if err != nil { - log.Err(err).Msg("There was a problem executing the pre-drain task") - recorder.Emit(nodeName, observability.Warning, observability.PreDrainErrReason, observability.PreDrainErrMsgFmt, err.Error()) - } else { - recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg) - } - metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err) -} - -func cordonNode(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) error { - err := node.Cordon(nodeName, drainEvent.Description) - if err != nil { - if errors.IsNotFound(err) { - log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) - } else { - log.Err(err).Msg("There was a problem while trying to cordon the node") - recorder.Emit(nodeName, observability.Warning, observability.CordonErrReason, observability.CordonErrMsgFmt, err.Error()) - } - return err - } else { - log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned") - metrics.NodeActionsInc("cordon", nodeName, drainEvent.EventID, err) - recorder.Emit(nodeName, observability.Normal, observability.CordonReason, observability.CordonMsg) - } - return nil -} - -func cordonAndDrainNode(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder, sqsTerminationDraining bool) error { - err := node.CordonAndDrain(nodeName, drainEvent.Description, recorder.EventRecorder) - if err != nil { - if errors.IsNotFound(err) { - log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) - } else { - log.Err(err).Msg("There was a problem while trying to cordon and drain the node") - metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) - recorder.Emit(nodeName, observability.Warning, observability.CordonAndDrainErrReason, observability.CordonAndDrainErrMsgFmt, err.Error()) - } - return err - } else { - log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned and drained") - metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) - recorder.Emit(nodeName, observability.Normal, observability.CordonAndDrainReason, observability.CordonAndDrainMsg) - } - return nil -} - -func runPostDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - err := drainEvent.PostDrainTask(*drainEvent, node) - if err != nil { - log.Err(err).Msg("There was a problem executing the post-drain task") - recorder.Emit(nodeName, observability.Warning, observability.PostDrainErrReason, observability.PostDrainErrMsgFmt, err.Error()) - } else { - recorder.Emit(nodeName, observability.Normal, observability.PostDrainReason, 
observability.PostDrainMsg) - } - metrics.NodeActionsInc("post-drain", nodeName, drainEvent.EventID, err) + <-interruptionEventStore.Workers } func getRegionFromQueueURL(queueURL string) string { diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go new file mode 100644 index 00000000..46d917a8 --- /dev/null +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -0,0 +1,144 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License + +package launch + +import ( + "context" + "fmt" + "strings" + + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/internal/common" + "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "github.com/aws/aws-node-termination-handler/pkg/observability" + "github.com/rs/zerolog/log" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" + "k8s.io/client-go/kubernetes" +) + +type Handler struct { + commonHandler *common.Handler + clientset *kubernetes.Clientset +} + +func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, nthConfig config.Config, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) *Handler { + commonHandler := &common.Handler{ + InterruptionEventStore: interruptionEventStore, + Node: node, + NthConfig: nthConfig, + Metrics: metrics, + Recorder: recorder, + } + + return &Handler{ + commonHandler: commonHandler, + clientset: clientset, + } +} + +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { + if !common.IsAllowedKind(drainEvent.Kind, monitor.ASGLaunchLifecycleKind) { + return + } + + isNodeReady, err := h.isNodeReady(drainEvent.InstanceID) + if err != nil || !isNodeReady { + log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") + h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + return + } + + log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") + nodeName, err := h.commonHandler.GetNodeName(drainEvent) + if err != nil { + log.Error().Err(err).Msg("unable to retrieve node name for ASG event processing") + } + + if drainEvent.PostDrainTask != nil { + h.commonHandler.RunPostDrainTask(nodeName, drainEvent) + } +} + +func (h *Handler) isNodeReady(instanceID string) (bool, error) { + nodes, err := h.getNodesWithInstanceID(instanceID) + if err != nil { + return false, fmt.Errorf("getting nodes with instance ID: %w", err) + } + + if len(nodes) == 0 { + return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) + } + + for _, node := range nodes { + conditions := node.Status.Conditions + for _, condition := range conditions { 
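+			// Kubelet readiness is surfaced as a NodeCondition of type "Ready"; any status other
+			// than "True" (i.e. "False" or "Unknown") means the node cannot schedule pods yet.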
+ if condition.Type == "Ready" && condition.Status != "True" { + return false, fmt.Errorf("EC2 instance, %s, found, but not ready in cluster", instanceID) + } + } + } + return true, nil +} + +// Gets Nodes connected to K8s cluster +func (h *Handler) getNodesWithInstanceID(instanceID string) ([]v1.Node, error) { + nodes, err := h.getNodesWithInstanceFromLabel(instanceID) + if err != nil { + return nil, err + } + if len(nodes) != 0 { + return nodes, nil + } + + nodes, err = h.getNodesWithInstanceFromProviderID(instanceID) + if err != nil { + return nil, err + } + return nodes, nil +} + +func (h *Handler) getNodesWithInstanceFromLabel(instanceID string) ([]v1.Node, error) { + instanceIDLabel := "alpha.eksctl.io/instance-id" + instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) + if err != nil { + return nil, fmt.Errorf("bad label requirement: %w", err) + } + selector := labels.NewSelector().Add(*instanceIDReq) + nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) + } + return nodeList.Items, nil +} + +func (h *Handler) getNodesWithInstanceFromProviderID(instanceID string) ([]v1.Node, error) { + nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) + } + + var filteredNodes []v1.Node + for _, node := range nodeList.Items { + if !strings.Contains(node.Spec.ProviderID, instanceID) { + continue + } + filteredNodes = append(filteredNodes, node) + } + return filteredNodes, nil +} diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go new file mode 100644 index 00000000..32ea8cae --- /dev/null +++ b/pkg/interruptionevent/draincordon/handler.go @@ -0,0 +1,139 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. 
See the License for the specific language governing +// permissions and limitations under the License + +package draincordon + +import ( + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/internal/common" + "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "github.com/aws/aws-node-termination-handler/pkg/observability" + "github.com/aws/aws-node-termination-handler/pkg/webhook" + "github.com/rs/zerolog/log" + "k8s.io/apimachinery/pkg/api/errors" +) + +var allowedKinds = []string{monitor.ASGLifecycleKind, monitor.RebalanceRecommendationKind, monitor.SQSTerminateKind, monitor.ScheduledEventKind, + monitor.SpotITNKind, monitor.StateChangeKind} + +type Handler struct { + commonHandler *common.Handler + nodeMetadata ec2metadata.NodeMetadata +} + +func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) *Handler { + commonHandler := &common.Handler{ + InterruptionEventStore: interruptionEventStore, + Node: node, + NthConfig: nthConfig, + Metrics: metrics, + Recorder: recorder, + } + + return &Handler{ + commonHandler: commonHandler, + nodeMetadata: nodeMetadata, + } +} + +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { + if !common.IsAllowedKind(drainEvent.Kind, allowedKinds...) { + return + } + + nodeFound := true + nodeName, err := h.commonHandler.GetNodeName(drainEvent) + if err != nil { + log.Error().Err(err).Msg("unable to retrieve node name for draining or cordoning") + } + + nodeLabels, err := h.commonHandler.Node.GetNodeLabels(nodeName) + if err != nil { + log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) + nodeFound = false + } + drainEvent.NodeLabels = nodeLabels + if drainEvent.PreDrainTask != nil { + h.commonHandler.RunPreDrainTask(nodeName, drainEvent) + } + + podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName) + if err != nil { + log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) + } + drainEvent.Pods = podNameList + err = h.commonHandler.Node.LogPods(podNameList, nodeName) + if err != nil { + log.Err(err).Msg("There was a problem while trying to log all pod names on the node") + } + + if h.commonHandler.NthConfig.CordonOnly || (!h.commonHandler.NthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !h.commonHandler.NthConfig.EnableRebalanceDraining) { + err = h.cordonNode(nodeName, drainEvent) + } else { + err = h.cordonAndDrainNode(nodeName, drainEvent) + } + + if h.commonHandler.NthConfig.WebhookURL != "" { + webhook.Post(h.nodeMetadata, drainEvent, h.commonHandler.NthConfig) + } + + if err != nil { + h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + } else { + h.commonHandler.InterruptionEventStore.MarkAllAsProcessed(nodeName) + } + + if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { + h.commonHandler.RunPostDrainTask(nodeName, drainEvent) + } +} + +func (h *Handler) cordonNode(nodeName string, drainEvent *monitor.InterruptionEvent) error { + err := h.commonHandler.Node.Cordon(nodeName, drainEvent.Description) + if err != 
nil { + if errors.IsNotFound(err) { + log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) + } else { + log.Err(err).Msg("There was a problem while trying to cordon the node") + h.commonHandler.Recorder.Emit(nodeName, observability.Warning, observability.CordonErrReason, observability.CordonErrMsgFmt, err.Error()) + } + return err + } else { + log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned") + h.commonHandler.Metrics.NodeActionsInc("cordon", nodeName, drainEvent.EventID, err) + h.commonHandler.Recorder.Emit(nodeName, observability.Normal, observability.CordonReason, observability.CordonMsg) + } + return nil +} + +func (h *Handler) cordonAndDrainNode(nodeName string, drainEvent *monitor.InterruptionEvent) error { + err := h.commonHandler.Node.CordonAndDrain(nodeName, drainEvent.Description, h.commonHandler.Recorder.EventRecorder) + if err != nil { + if errors.IsNotFound(err) { + log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) + } else { + log.Err(err).Msg("There was a problem while trying to cordon and drain the node") + h.commonHandler.Metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) + h.commonHandler.Recorder.Emit(nodeName, observability.Warning, observability.CordonAndDrainErrReason, observability.CordonAndDrainErrMsgFmt, err.Error()) + } + return err + } else { + log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned and drained") + h.commonHandler.Metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) + h.commonHandler.Recorder.Emit(nodeName, observability.Normal, observability.CordonAndDrainReason, observability.CordonAndDrainMsg) + } + return nil +} diff --git a/pkg/interruptionevent/internal/common/handler.go b/pkg/interruptionevent/internal/common/handler.go new file mode 100644 index 00000000..0c58366a --- /dev/null +++ b/pkg/interruptionevent/internal/common/handler.go @@ -0,0 +1,76 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. 
See the License for the specific language governing +// permissions and limitations under the License + +package common + +import ( + "fmt" + + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "github.com/aws/aws-node-termination-handler/pkg/observability" + "github.com/rs/zerolog/log" +) + +type Handler struct { + InterruptionEventStore *interruptioneventstore.Store + Node node.Node + NthConfig config.Config + Metrics observability.Metrics + Recorder observability.K8sEventRecorder +} + +func (h *Handler) GetNodeName(drainEvent *monitor.InterruptionEvent) (string, error) { + if !h.NthConfig.UseProviderId { + return drainEvent.NodeName, nil + } + + nodeName, err := h.Node.GetNodeNameFromProviderID(drainEvent.ProviderID) + if err != nil { + return "", fmt.Errorf("parse node name from providerID=%q: %w", drainEvent.ProviderID, err) + } + return nodeName, nil +} + +func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { + err := drainEvent.PreDrainTask(*drainEvent, h.Node) + if err != nil { + log.Err(err).Msg("There was a problem executing the pre-drain task") + h.Recorder.Emit(nodeName, observability.Warning, observability.PreDrainErrReason, observability.PreDrainErrMsgFmt, err.Error()) + } else { + h.Recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg) + } + h.Metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err) +} + +func (h *Handler) RunPostDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { + err := drainEvent.PostDrainTask(*drainEvent, h.Node) + if err != nil { + log.Err(err).Msg("There was a problem executing the post-drain task") + h.Recorder.Emit(nodeName, observability.Warning, observability.PostDrainErrReason, observability.PostDrainErrMsgFmt, err.Error()) + } else { + h.Recorder.Emit(nodeName, observability.Normal, observability.PostDrainReason, observability.PostDrainMsg) + } + h.Metrics.NodeActionsInc("post-drain", nodeName, drainEvent.EventID, err) +} + +func IsAllowedKind(kind string, allowedKinds ...string) bool { + for _, allowedKind := range allowedKinds { + if kind == allowedKind { + return true + } + } + return false +} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index d155ecf3..2e4f2f52 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -131,10 +131,10 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } -func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { +func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) + err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { return lifecycleEvent, err } @@ -142,7 +142,7 @@ func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { - err = json.Unmarshal([]byte(fmt.Sprintf("%v", *messageBody)), &lifecycleEvent) + err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) } return lifecycleEvent, 
err } @@ -155,7 +155,7 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri if message == nil { return eventBridgeEvent, fmt.Errorf("ASG event message is nil") } - lifecycleEvent, err := parseLifecycleEvent(message.Body) + lifecycleEvent, err := parseLifecycleEvent(*message.Body) switch { case err != nil: From 9832d2b157c7d1a34cc1e49925b5a77b7ef6e354 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 8 Dec 2023 17:52:52 -0600 Subject: [PATCH 19/27] Revised formatting, logging, and error message issues --- pkg/interruptionevent/asg/launch/handler.go | 19 +++++++++++-------- pkg/interruptionevent/draincordon/handler.go | 10 ++++++++-- pkg/monitor/sqsevent/sqs-monitor.go | 6 ++++-- pkg/node/node_test.go | 6 +++--- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go index 46d917a8..a7a51188 100644 --- a/pkg/interruptionevent/asg/launch/handler.go +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -32,6 +32,8 @@ import ( "k8s.io/client-go/kubernetes" ) +const instanceIDLabel = "alpha.eksctl.io/instance-id" + type Handler struct { commonHandler *common.Handler clientset *kubernetes.Clientset @@ -64,7 +66,6 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { return } - log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { log.Error().Err(err).Msg("unable to retrieve node name for ASG event processing") @@ -78,21 +79,24 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { func (h *Handler) isNodeReady(instanceID string) (bool, error) { nodes, err := h.getNodesWithInstanceID(instanceID) if err != nil { - return false, fmt.Errorf("getting nodes with instance ID: %w", err) + return false, fmt.Errorf("find node(s) with instanceId=%s: %w", instanceID, err) } if len(nodes) == 0 { - return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) + log.Warn().Str("instanceID", instanceID).Msg("EC2 instance not found in cluster") + return false, nil } for _, node := range nodes { conditions := node.Status.Conditions for _, condition := range conditions { if condition.Type == "Ready" && condition.Status != "True" { - return false, fmt.Errorf("EC2 instance, %s, found, but not ready in cluster", instanceID) + log.Warn().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") + return false, nil } } } + log.Info().Str("instanceID", instanceID).Msg("EC2 instance is found and ready in cluster") return true, nil } @@ -114,15 +118,14 @@ func (h *Handler) getNodesWithInstanceID(instanceID string) ([]v1.Node, error) { } func (h *Handler) getNodesWithInstanceFromLabel(instanceID string) ([]v1.Node, error) { - instanceIDLabel := "alpha.eksctl.io/instance-id" instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) if err != nil { - return nil, fmt.Errorf("bad label requirement: %w", err) + return nil, fmt.Errorf("construct node search requirement %s=%s: %w", instanceIDLabel, instanceID, err) } selector := labels.NewSelector().Add(*instanceIDReq) nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) if err != nil { - return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) + return nil, fmt.Errorf("list nodes using selector %q: %w", 
selector.String(), err) } return nodeList.Items, nil } @@ -130,7 +133,7 @@ func (h *Handler) getNodesWithInstanceFromLabel(instanceID string) ([]v1.Node, e func (h *Handler) getNodesWithInstanceFromProviderID(instanceID string) ([]v1.Node, error) { nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { - return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) + return nil, fmt.Errorf("list all nodes: %w", err) } var filteredNodes []v1.Node diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go index 32ea8cae..b9e7596e 100644 --- a/pkg/interruptionevent/draincordon/handler.go +++ b/pkg/interruptionevent/draincordon/handler.go @@ -26,8 +26,14 @@ import ( "k8s.io/apimachinery/pkg/api/errors" ) -var allowedKinds = []string{monitor.ASGLifecycleKind, monitor.RebalanceRecommendationKind, monitor.SQSTerminateKind, monitor.ScheduledEventKind, - monitor.SpotITNKind, monitor.StateChangeKind} +var allowedKinds = []string{ + monitor.ASGLifecycleKind, + monitor.RebalanceRecommendationKind, + monitor.SQSTerminateKind, + monitor.ScheduledEventKind, + monitor.SpotITNKind, + monitor.StateChangeKind, +} type Handler struct { commonHandler *common.Handler diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 2e4f2f52..1cdcc985 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -136,25 +136,27 @@ func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { - return lifecycleEvent, err + return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract Message field: %w", err) } // Converts escaped JSON object to string, to lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) + err = fmt.Errorf("unmarshalling Message field from SQS message body: %w", err) } else { err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) + err = fmt.Errorf("unmarshalling SQS message body: %w", err) } return lifecycleEvent, err } // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { - log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} if message == nil { return eventBridgeEvent, fmt.Errorf("ASG event message is nil") } + log.Debug().Str("messageBody", *message.Body).Str("messageID", *message.MessageId).Msg("processing lifecycle event from ASG") lifecycleEvent, err := parseLifecycleEvent(*message.Body) switch { diff --git a/pkg/node/node_test.go b/pkg/node/node_test.go index e9837e6c..945b98af 100644 --- a/pkg/node/node_test.go +++ b/pkg/node/node_test.go @@ -63,13 +63,13 @@ func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node { return tNode } -func getNewNode(nthConfig config.Config, client *fake.Clientset) (*node.Node, error) { +func newNode(nthConfig config.Config, client *fake.Clientset) (*node.Node, error) { drainHelper := getDrainHelper(client) return node.NewWithValues(nthConfig, drainHelper, uptime.Uptime) } func TestDryRun(t *testing.T) { - tNode, err := getNewNode(config.Config{DryRun: true}, fake.NewSimpleClientset()) + tNode, err := newNode(config.Config{DryRun: true}, 
fake.NewSimpleClientset()) h.Ok(t, err) fakeRecorder := record.NewFakeRecorder(recorderBufferSize) @@ -109,7 +109,7 @@ func TestDryRun(t *testing.T) { func TestNewFailure(t *testing.T) { client := fake.NewSimpleClientset() - _, err := getNewNode(config.Config{}, client) + _, err := newNode(config.Config{}, client) h.Assert(t, true, "Failed to return error when creating new Node.", err != nil) } From 307b6a8a1daf17d090aa812ab95981b83486d2fd Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Mon, 18 Dec 2023 23:10:07 -0600 Subject: [PATCH 20/27] Refactors for log and error handling for eventhandlers. Refacors for ASG launch lifecycle bash script Refactored interruption event handling into a seperate package with distinct handlers for different interruption event Kinds. Updated ASG launch lifecycle hook acceptance test and eks-cluster run-test --- cmd/node-termination-handler.go | 13 +- pkg/interruptionevent/asg/launch/handler.go | 24 ++- pkg/interruptionevent/draincordon/handler.go | 23 ++- pkg/monitor/sqsevent/asg-lifecycle-event.go | 8 +- pkg/monitor/sqsevent/sqs-monitor.go | 24 ++- test/e2e/asg-launch-lifecycle-sqs-test | 174 ++++++++++++------- test/eks-cluster-test/node_group-spec.yaml | 15 -- test/k8s-local-cluster-test/run-test | 7 +- 8 files changed, 176 insertions(+), 112 deletions(-) delete mode 100644 test/eks-cluster-test/node_group-spec.yaml diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 6fd0c745..cf8ee1ad 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -60,7 +60,7 @@ const ( ) type interruptionEventHandler interface { - HandleEvent(*monitor.InterruptionEvent) + HandleEvent(*monitor.InterruptionEvent) error } func main() { @@ -350,8 +350,17 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, event *monitor.InterruptionEvent, eventHandlers []interruptionEventHandler, wg *sync.WaitGroup) { defer wg.Done() + + if event == nil { + log.Error().Msg("processing nil interruption event") + } + + var err error for _, eventHandler := range eventHandlers { - eventHandler.HandleEvent(event) + err = eventHandler.HandleEvent(event) + if err != nil { + log.Error().Err(err).Interface("event", event).Msg("handling event") + } } <-interruptionEventStore.Workers } diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go index a7a51188..9a0cfc82 100644 --- a/pkg/interruptionevent/asg/launch/handler.go +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -54,26 +54,34 @@ func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, n } } -func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { + if drainEvent == nil { + return fmt.Errorf("handling nil event") + } + if !common.IsAllowedKind(drainEvent.Kind, monitor.ASGLaunchLifecycleKind) { - return + return nil } isNodeReady, err := h.isNodeReady(drainEvent.InstanceID) - if err != nil || !isNodeReady { - log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") + if err != nil { + h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s: %w", drainEvent.InstanceID, err) + } + if !isNodeReady { 
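+		// The instance was found but its node is not Ready yet, so the launch lifecycle hook is
+		// not completed for this event; the event is cancelled rather than marked processed.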
h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return + return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s", drainEvent.InstanceID) } nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - log.Error().Err(err).Msg("unable to retrieve node name for ASG event processing") + return fmt.Errorf("unable to retrieve node name for ASG event processing: %w", err) } if drainEvent.PostDrainTask != nil { h.commonHandler.RunPostDrainTask(nodeName, drainEvent) } + return nil } func (h *Handler) isNodeReady(instanceID string) (bool, error) { @@ -83,7 +91,7 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { } if len(nodes) == 0 { - log.Warn().Str("instanceID", instanceID).Msg("EC2 instance not found in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance not found in cluster") return false, nil } @@ -91,7 +99,7 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { conditions := node.Status.Conditions for _, condition := range conditions { if condition.Type == "Ready" && condition.Status != "True" { - log.Warn().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") return false, nil } } diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go index b9e7596e..be89eb37 100644 --- a/pkg/interruptionevent/draincordon/handler.go +++ b/pkg/interruptionevent/draincordon/handler.go @@ -14,6 +14,8 @@ package draincordon import ( + "fmt" + "github.com/aws/aws-node-termination-handler/pkg/config" "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/internal/common" @@ -55,35 +57,39 @@ func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, n } } -func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { if !common.IsAllowedKind(drainEvent.Kind, allowedKinds...) 
{ - return + return nil } nodeFound := true nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - log.Error().Err(err).Msg("unable to retrieve node name for draining or cordoning") + return fmt.Errorf("unable to retrieve node name for draining or cordoning: %w", err) } nodeLabels, err := h.commonHandler.Node.GetNodeLabels(nodeName) if err != nil { - log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) + log.Warn().Err(err).Msgf("Unable to fetch node labels for nodeName=%s", nodeName) nodeFound = false + } else { + drainEvent.NodeLabels = nodeLabels } - drainEvent.NodeLabels = nodeLabels + if drainEvent.PreDrainTask != nil { h.commonHandler.RunPreDrainTask(nodeName, drainEvent) } podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName) if err != nil { - log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) + log.Warn().Err(err).Msgf("Unable to fetch running pods for nodeName=%s", nodeName) + } else { + drainEvent.Pods = podNameList } - drainEvent.Pods = podNameList + err = h.commonHandler.Node.LogPods(podNameList, nodeName) if err != nil { - log.Err(err).Msg("There was a problem while trying to log all pod names on the node") + log.Warn().Err(err).Msgf("There was a problem while trying to log all pod names on the node nodeName=%s", nodeName) } if h.commonHandler.NthConfig.CordonOnly || (!h.commonHandler.NthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !h.commonHandler.NthConfig.EnableRebalanceDraining) { @@ -105,6 +111,7 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { h.commonHandler.RunPostDrainTask(nodeName, drainEvent) } + return nil } func (h *Handler) cordonNode(nodeName string, drainEvent *monitor.InterruptionEvent) error { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 3a9dd390..1c56ba6a 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -130,8 +130,12 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { - if message == nil || event == nil { - return nil, fmt.Errorf("event message is nil for ASG Instance Launch Event creation") + if event == nil { + return nil, fmt.Errorf("EventBridgeEvent is nil for ASG Instance Launch Event creation") + } + + if message == nil { + return nil, fmt.Errorf("SQS message is nil for ASG Instance Launch Event creation") } lifecycleDetail := &LifecycleDetail{} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 1cdcc985..4aebea33 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -136,27 +136,31 @@ func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { - return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract Message field: %w", err) + return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract 'Message' field: %w", err) } // Converts escaped JSON object to string, to 
lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) - err = fmt.Errorf("unmarshalling Message field from SQS message body: %w", err) + if err != nil { + err = fmt.Errorf("unmarshalling 'Message' field from SQS message body: %w", err) + } } else { err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) - err = fmt.Errorf("unmarshalling SQS message body: %w", err) + if err != nil { + err = fmt.Errorf("unmarshalling SQS message body: %w", err) + } } return lifecycleEvent, err } // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + log.Debug().Interface("message", message).Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} if message == nil { return eventBridgeEvent, fmt.Errorf("ASG event message is nil") } - log.Debug().Str("messageBody", *message.Body).Str("messageID", *message.MessageId).Msg("processing lifecycle event from ASG") lifecycleEvent, err := parseLifecycleEvent(*message.Body) switch { @@ -172,7 +176,7 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": - return eventBridgeEvent, fmt.Errorf("unsupported message type (%s) while parsing lifecycle event messsage from ASG", message.String()) + return eventBridgeEvent, fmt.Errorf("unsupported lifecycle transition while parsing lifecycle event messsage from ASG lifecycleTransition=%s", lifecycleEvent.LifecycleTransition) } eventBridgeEvent.Source = "aws.autoscaling" @@ -188,12 +192,16 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, interruptionEvent := &monitor.InterruptionEvent{} var err error - if message == nil || eventBridgeEvent == nil { - return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("event message is nil")}) + if eventBridgeEvent == nil { + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("EventBridgeEvent is nil for EventBridgeEvent processing")}) + } + if message == nil { + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("SQS message is nil for EventBridgeEvent processing")}) } switch eventBridgeEvent.Source { - // LifecycleTransitions other than LAUNCHING or TERMINATING will result in the interruptionEvent being uninitialized + /* LifecycleTransitions other than LAUNCHING and TERMINATING are invalid values. 
These values result in uninitialized interruptionEvents, whose + messages are later dropped */ case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index f92e0a3a..3988ab88 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -1,10 +1,7 @@ #!/bin/bash set -euo pipefail -REGION="us-west-2" -CLUSTER_NAME="nth-eks-cluster-test" - -node_group_name="nth-eks-cluster-test-spot-ng" +node_group_name="spot-ng" sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" node_policy_name="nth-test-node-policy" @@ -47,9 +44,9 @@ EOF cat << EOF > /tmp/queue-attributes.json { -"MessageRetentionPeriod": "300", -"Policy": "$(echo $sqs_queue_policy | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", -"SqsManagedSseEnabled": "true" + "MessageRetentionPeriod": "300", + "Policy": "$(echo $sqs_queue_policy | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", + "SqsManagedSseEnabled": "true" } EOF @@ -140,6 +137,15 @@ EOF ##### SETUP ##### +function validate_aws_account { + if [[ -n "$account_id" ]]; then + echo "๐Ÿฅ‘ AWS Account ID: $account_id" + else + echo "โŒ Failed to retrieve AWS Account ID โŒ" + exit 1 + fi +} + ### SQS ### function provision_sqs_queue { queue_exists=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name) @@ -178,27 +184,8 @@ function subscribe_sqs_to_sns { } ### NODEGROUP ### -function provision_node_group { +function update_node_group { create_node_policy - node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - if [[ -n $node_group_exists ]]; then - get_node_role_name - delete_node_group - echo "" - - node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - echo -n "Node group Deleting." - while [[ -n $node_group_exists ]]; do - echo -n "." - node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - sleep 10 - done - echo "" - sleep 20 - fi - - echo "๐Ÿฅ‘ Provisioning Spot Node Group" - eksctl create nodegroup --config-file=$NODE_GROUP_CONFIG_FILE echo "๐Ÿฅ‘ Attaching Node policy to Node role" get_node_role_name @@ -332,7 +319,8 @@ function start_FIS_experiment { create_FIS_role create_experiment_template echo "๐Ÿฅ‘ Starting Experiment" - experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') + experiment_start_time=$(date +%s) + experiment=$(aws fis start-experiment --experiment-template-id $template_id) } @@ -344,20 +332,51 @@ function is_new_instance { fi } +function convert_date_to_epoch_seconds { + IFS='T' read -r date_part time_part <<< "$1" + IFS='-' read -r year month day <<< "$date_part" + IFS=':' read -r hour minute second_fractional <<< "$time_part" + IFS='.' 
read -r second fraction <<< "$second_fractional" + IFS=':' read -r offset_hours offset_minutes <<< "${time_part:16:5}" + + if [[ $time_part =~ .*"-".* ]]; then + offset_hours=$((offset_hours * -1)) + offset_minutes=$((offset_minutes * -1)) + fi + + total_days=$(((year - 1970) * 365 + (year - 1970)/4)) + for ((i = 1; i < month; i++)); do + total_days=$((total_days + $(cal $i $year | awk 'NF {DAYS = $NF} END {print DAYS}'))) + done + total_days=$((total_days + day - 1)) + total_seconds=$((total_days * 86400 + (hour + offset_hours) * 3600 + (minute + offset_minutes) * 60 + second)) +} + function get_launch_activity { + max_duration=$((5 * 60)) + start_time=$(date +%s) + launch_activity="" - while [[ -z $launch_activity ]]; do + while [[ -z $launch_activity ]]; do + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [[ $elapsed_time -ge $max_duration ]]; then + echo "โŒ Failed to find a new launched instance. Timeout Reached โŒ" + exit 1 + fi + sleep 5 activities=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name) - activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode]' <<< $activities) + activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode, .StartTime]' <<< $activities) num_activities=$(jq -r 'length' <<< $activities_details) - for i in $(seq 0 3 $((--num_activities))); do + for i in $(seq 0 4 $((--num_activities))); do id=$(jq -r .[$i] <<< $activities_details) description=$(jq -r .[$((++i))] <<< $activities_details) status=$(jq -r .[$((i+=2))] <<< $activities_details) + start=$(jq -r .[$((i+=3))] <<< $activities_details) activity_instance=${description##*:} - is_new_instance $activity_instance - if [[ $description =~ .*"Launching".* && $is_new == "true" ]]; then + convert_date_to_epoch_seconds $start + if [[ $description =~ .*"Launching".* && $total_seconds -gt $experiment_start_time ]]; then launch_activity=$id echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" break @@ -369,20 +388,35 @@ function get_launch_activity { function test_launch_lifecycle { aws sqs receive-message --queue-url $queue_url echo -n "๐Ÿฅ‘ Waiting for launch hook completion." + + max_duration=$((4 * 60)) + start_time=$(date +%s) while [[ true ]]; do + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [[ $elapsed_time -ge $max_duration ]]; then + echo "" + echo "โŒ Launch Lifecycle not Completed. Timeout Reached โŒ" + exit 1 + fi + activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[].StatusCode') if [[ $activity_status == "Successful" ]]; then echo "" echo "โœ… Launch Lifecycle Successfully Completed โœ…" - exit_policy="exit 0" - break + exit 0 fi if [[ $activity_status == "Cancelled" ]]; then + echo "" + echo "โŒ Launch Lifecycle Cancelled โŒ" + exit 1 + fi + + if [[ $activity_status == "Failed" ]]; then echo "" echo "โŒ Launch Lifecycle Failed โŒ" - exit_policy="exit 1" - break + exit 1 fi echo -n "." 
sleep 10 @@ -395,27 +429,43 @@ function clean_up { echo "=====================================================================================================" echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" - pod_id=$(get_nth_worker_pod || :) - kubectl logs $pod_id --namespace kube-system || : + print_logs echo "๐Ÿฅ‘ Uninstalling NTH helm chart" helm uninstall "$CLUSTER_NAME-acth" -n kube-system delete_node_group - echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" - aws sns unsubscribe --subscription-arn $subscription_arn - echo "๐Ÿฅ‘ Deleting SQS queue" - aws sqs delete-queue --queue-url $queue_url - echo "๐Ÿฅ‘ Deleting SNS topic" - aws sns delete-topic --topic-arn $sns_arn - echo "๐Ÿฅ‘ Deleting FIS experiment template" - deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + if [[ -n $subscription_arn ]]; then + echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" + aws sns unsubscribe --subscription-arn $subscription_arn + fi + if [[ -n $queue_url ]]; then + echo "๐Ÿฅ‘ Deleting SQS queue" + aws sqs delete-queue --queue-url $queue_url + fi + if [[ -n $sns_arn ]]; then + echo "๐Ÿฅ‘ Deleting SNS topic" + aws sns delete-topic --topic-arn $sns_arn + fi + if [[ -n $template_id ]]; then + echo "๐Ÿฅ‘ Deleting FIS experiment template" + deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + fi echo "๐Ÿฅ‘ Detaching FIS role policy" aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn echo "๐Ÿฅ‘ Deleting FIS role" aws iam delete-role --role-name $fis_role_name echo "๐Ÿฅ‘ Deleting autoscaling role" aws iam delete-service-linked-role --role-name $auto_scaling_role_name - echo "๐Ÿฅ‘ Deleting Node role policy" - aws iam delete-policy --policy-arn $node_policy_arn + if [[ -n $node_policy_arn ]]; then + echo "๐Ÿฅ‘ Deleting Node role policy" + aws iam delete-policy --policy-arn $node_policy_arn + fi +} + +function print_logs { + pod_id=$(get_nth_worker_pod || :) + if [[ -n $pod_id ]]; then + kubectl logs $pod_id --namespace kube-system || : + fi } function delete_node_group { @@ -426,21 +476,15 @@ function delete_node_group { echo "๐Ÿฅ‘ Detaching NTH Node Group policy" aws iam detach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn fi - echo "๐Ÿฅ‘ Deleting NTH Node Group" - eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve -} - -function main { - provision_sqs_queue - provision_sns_topic - subscribe_sqs_to_sns - provision_node_group - install_helm - start_FIS_experiment - get_launch_activity - test_launch_lifecycle - trap "clean_up" EXIT - eval $exit_policy } -main +trap "clean_up" EXIT +validate_aws_account +provision_sqs_queue +provision_sns_topic +subscribe_sqs_to_sns +update_node_group +install_helm +start_FIS_experiment +get_launch_activity +test_launch_lifecycle diff --git a/test/eks-cluster-test/node_group-spec.yaml b/test/eks-cluster-test/node_group-spec.yaml deleted file mode 100644 index 2fa39a78..00000000 --- a/test/eks-cluster-test/node_group-spec.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: eksctl.io/v1alpha5 -kind: ClusterConfig -metadata: - name: nth-eks-cluster-test - region: us-west-2 -managedNodeGroups: - - name: nth-eks-cluster-test-spot-ng - instanceType: t3.medium - amiFamily: AmazonLinux2 - desiredCapacity: 2 - minSize: 2 - maxSize: 2 - spot: true -iam: - withOIDC: true \ No newline at end of file diff --git a/test/k8s-local-cluster-test/run-test 
b/test/k8s-local-cluster-test/run-test index e9e5388b..64839292 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -275,16 +275,15 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite function is_denylisted { - is_denied="false" if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_denied="true" + return 1 fi + return 0 } i=0 for assert_script in $ASSERTION_SCRIPTS; do - is_denylisted $assert_script - if[[ $is_denied == "true" ]]; then continue; fi + if [[ is_denylisted $assert_script ]]; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From b4ea84a12b3fb563e8d8b6fa72bf6e417a5b47fd Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 20 Dec 2023 14:13:58 -0600 Subject: [PATCH 21/27] Refactored error and logging messages and bash script tests for ASG launch lifecycle event --- cmd/node-termination-handler.go | 2 + pkg/interruptionevent/asg/launch/handler.go | 14 ++--- pkg/interruptionevent/draincordon/handler.go | 16 ++++-- pkg/monitor/sqsevent/asg-lifecycle-event.go | 4 +- pkg/monitor/sqsevent/sqs-monitor.go | 30 +++++----- test/e2e/asg-launch-lifecycle-sqs-test | 60 +++++++++++++------- test/k8s-local-cluster-test/run-test | 6 +- 7 files changed, 82 insertions(+), 50 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index cf8ee1ad..6145e0e5 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -353,6 +353,8 @@ func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Sto if event == nil { log.Error().Msg("processing nil interruption event") + <-interruptionEventStore.Workers + return } var err error diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go index 9a0cfc82..00df82c4 100644 --- a/pkg/interruptionevent/asg/launch/handler.go +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -56,7 +56,7 @@ func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, n func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { if drainEvent == nil { - return fmt.Errorf("handling nil event") + return fmt.Errorf("drainEvent is nil") } if !common.IsAllowedKind(drainEvent.Kind, monitor.ASGLaunchLifecycleKind) { @@ -66,16 +66,16 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { isNodeReady, err := h.isNodeReady(drainEvent.InstanceID) if err != nil { h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s: %w", drainEvent.InstanceID, err) + return fmt.Errorf("check if node (instanceID=%s) is present and ready: %w", drainEvent.InstanceID, err) } if !isNodeReady { h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s", drainEvent.InstanceID) + return nil } nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - return fmt.Errorf("unable to retrieve node name for ASG event processing: %w", err) + return fmt.Errorf("get node name for instanceID=%s: %w", drainEvent.InstanceID, err) } if drainEvent.PostDrainTask != nil { @@ -91,7 +91,7 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { } if len(nodes) == 0 { - log.Info().Str("instanceID", 
instanceID).Msg("EC2 instance not found in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance not found") return false, nil } @@ -99,12 +99,12 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { conditions := node.Status.Conditions for _, condition := range conditions { if condition.Type == "Ready" && condition.Status != "True" { - log.Info().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready") return false, nil } } } - log.Info().Str("instanceID", instanceID).Msg("EC2 instance is found and ready in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance is found and ready") return true, nil } diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go index be89eb37..0360a31c 100644 --- a/pkg/interruptionevent/draincordon/handler.go +++ b/pkg/interruptionevent/draincordon/handler.go @@ -65,12 +65,16 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { nodeFound := true nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - return fmt.Errorf("unable to retrieve node name for draining or cordoning: %w", err) + return fmt.Errorf("get node name for instanceID=%s: %w", drainEvent.InstanceID, err) } nodeLabels, err := h.commonHandler.Node.GetNodeLabels(nodeName) if err != nil { - log.Warn().Err(err).Msgf("Unable to fetch node labels for nodeName=%s", nodeName) + log.Warn(). + Err(err). + Interface("fallbackNodeLabels", drainEvent.NodeLabels). + Str("nodeName", nodeName). + Msg("Failed to get node labels. Proceeding with fallback labels") nodeFound = false } else { drainEvent.NodeLabels = nodeLabels @@ -82,14 +86,18 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName) if err != nil { - log.Warn().Err(err).Msgf("Unable to fetch running pods for nodeName=%s", nodeName) + log.Warn(). + Err(err). + Strs("fallbackPodNames", podNameList). + Str("nodeName", nodeName). + Msg("Failed to fetch pod names. 
Proceeding with fallback pod names") } else { drainEvent.Pods = podNameList } err = h.commonHandler.Node.LogPods(podNameList, nodeName) if err != nil { - log.Warn().Err(err).Msgf("There was a problem while trying to log all pod names on the node nodeName=%s", nodeName) + log.Warn().Err(err).Str("nodeName", nodeName).Msg("Failed to log pods") } if h.commonHandler.NthConfig.CordonOnly || (!h.commonHandler.NthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !h.commonHandler.NthConfig.EnableRebalanceDraining) { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 1c56ba6a..c1262519 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -131,11 +131,11 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { if event == nil { - return nil, fmt.Errorf("EventBridgeEvent is nil for ASG Instance Launch Event creation") + return nil, fmt.Errorf("event is nil") } if message == nil { - return nil, fmt.Errorf("SQS message is nil for ASG Instance Launch Event creation") + return nil, fmt.Errorf("message is nil") } lifecycleDetail := &LifecycleDetail{} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 4aebea33..18febbb1 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -38,7 +38,9 @@ const ( // SQSMonitorKind is a const to define this monitor kind SQSMonitorKind = "SQS_MONITOR" // ASGTagName is the name of the instance tag whose value is the AutoScaling group name - ASGTagName = "aws:autoscaling:groupName" + ASGTagName = "aws:autoscaling:groupName" + ASGTerminatingLifecycleTransition = "autoscaling:EC2_INSTANCE_TERMINATING" + ASGLaunchingLifecycleTransition = "autoscaling:EC2_INSTANCE_LAUNCHING" ) // SQSMonitor is a struct definition that knows how to process events from Amazon EventBridge @@ -136,18 +138,18 @@ func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { - return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract 'Message' field: %w", err) + return lifecycleEvent, fmt.Errorf("unmarshalling SQS message: %w", err) } // Converts escaped JSON object to string, to lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) if err != nil { - err = fmt.Errorf("unmarshalling 'Message' field from SQS message body: %w", err) + err = fmt.Errorf("unmarshalling message body from '.Message': %w", err) } } else { err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) if err != nil { - err = fmt.Errorf("unmarshalling SQS message body: %w", err) + err = fmt.Errorf("unmarshalling message body: %w", err) } } return lifecycleEvent, err @@ -174,9 +176,9 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri } return eventBridgeEvent, skip{err} - case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && - lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": - return eventBridgeEvent, 
fmt.Errorf("unsupported lifecycle transition while parsing lifecycle event messsage from ASG lifecycleTransition=%s", lifecycleEvent.LifecycleTransition) + case lifecycleEvent.LifecycleTransition != ASGTerminatingLifecycleTransition && + lifecycleEvent.LifecycleTransition != ASGLaunchingLifecycleTransition: + return eventBridgeEvent, fmt.Errorf("lifecycle transition must be %s or %s. Got %s", ASGTerminatingLifecycleTransition, ASGLaunchingLifecycleTransition, lifecycleEvent.LifecycleTransition) } eventBridgeEvent.Source = "aws.autoscaling" @@ -193,27 +195,27 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, var err error if eventBridgeEvent == nil { - return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("EventBridgeEvent is nil for EventBridgeEvent processing")}) + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("eventBridgeEvent is nil")}) } if message == nil { - return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("SQS message is nil for EventBridgeEvent processing")}) + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("message is nil")}) } switch eventBridgeEvent.Source { - /* LifecycleTransitions other than LAUNCHING and TERMINATING are invalid values. These values result in uninitialized interruptionEvents, whose - messages are later dropped */ case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if err != nil { interruptionEvent, err = nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) } - if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { + if lifecycleEvent.LifecycleTransition == ASGLaunchingLifecycleTransition { interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) - } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) + } else if lifecycleEvent.LifecycleTransition == ASGTerminatingLifecycleTransition { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) } - return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) + return interruptionEventWrappers case "aws.ec2": if eventBridgeEvent.DetailType == "EC2 Instance State-change Notification" { diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 3988ab88..4472f015 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -14,6 +14,7 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" account_id=$(aws sts get-caller-identity | jq -r '.Account') nth_label="Use-Case=NTH" +heartbeat_timeout=$((3 * 60)) ##### JSON FILES ##### @@ -235,8 +236,22 @@ function update_ASG { create_auto_scaling_role echo "๐Ÿฅ‘ Creating Lifecycle Hooks" - aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Launch-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_LAUNCHING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="ABANDON" 
- aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Terminate-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_TERMINATING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="CONTINUE" + aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name "Launch-LC-Hook" \ + --auto-scaling-group-name $asg_name \ + --lifecycle-transition="autoscaling:EC2_INSTANCE_LAUNCHING" \ + --heartbeat-timeout=$heartbeat_timeout \ + --notification-target-arn=$sns_arn \ + --role-arn=$auto_scaling_role_arn \ + --default-result="ABANDON" + aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name "Terminate-LC-Hook" \ + --auto-scaling-group-name $asg_name \ + --lifecycle-transition="autoscaling:EC2_INSTANCE_TERMINATING" \ + --heartbeat-timeout=$heartbeat_timeout \ + --notification-target-arn=$sns_arn \ + --role-arn=$auto_scaling_role_arn \ + --default-result="CONTINUE" } function create_auto_scaling_role { @@ -339,6 +354,11 @@ function convert_date_to_epoch_seconds { IFS='.' read -r second fraction <<< "$second_fractional" IFS=':' read -r offset_hours offset_minutes <<< "${time_part:16:5}" + # Convert time strings to base-10 integers + year=$((10#$year + 0)); month=$((10#$month + 0)); day=$((10#$day + 0)) + hour=$((10#$hour + 0)); minute=$((10#$minute + 0)); second=$((10#$second + 0)) + offset_hours=$((10#$offset_hours + 0)); offset_minutes=$((10#$offset_minutes + 0)) + if [[ $time_part =~ .*"-".* ]]; then offset_hours=$((offset_hours * -1)) offset_minutes=$((offset_minutes * -1)) @@ -389,12 +409,11 @@ function test_launch_lifecycle { aws sqs receive-message --queue-url $queue_url echo -n "๐Ÿฅ‘ Waiting for launch hook completion." - max_duration=$((4 * 60)) start_time=$(date +%s) while [[ true ]]; do current_time=$(date +%s) elapsed_time=$((current_time - start_time)) - if [[ $elapsed_time -ge $max_duration ]]; then + if [[ $elapsed_time -ge $heartbeat_timeout ]]; then echo "" echo "โŒ Launch Lifecycle not Completed. Timeout Reached โŒ" exit 1 @@ -405,17 +424,9 @@ function test_launch_lifecycle { echo "" echo "โœ… Launch Lifecycle Successfully Completed โœ…" exit 0 - fi - - if [[ $activity_status == "Cancelled" ]]; then + elif [[ $activity_status == "Cancelled" || $activity_status == "Failed" ]]; then echo "" - echo "โŒ Launch Lifecycle Cancelled โŒ" - exit 1 - fi - - if [[ $activity_status == "Failed" ]]; then - echo "" - echo "โŒ Launch Lifecycle Failed โŒ" + echo "โŒ Launch Lifecycle $activity_status โŒ" exit 1 fi echo -n "." @@ -430,9 +441,8 @@ function clean_up { echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" print_logs - echo "๐Ÿฅ‘ Uninstalling NTH helm chart" - helm uninstall "$CLUSTER_NAME-acth" -n kube-system - delete_node_group + uninstall_helm + delete_node_group_policy if [[ -n $subscription_arn ]]; then echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" aws sns unsubscribe --subscription-arn $subscription_arn @@ -465,13 +475,23 @@ function print_logs { pod_id=$(get_nth_worker_pod || :) if [[ -n $pod_id ]]; then kubectl logs $pod_id --namespace kube-system || : + else + echo "โŒ Failed to get pod ID. 
Unable to print logs โŒ" + fi +} + +function uninstall_helm { + helm_exists=$(helm ls -A | grep "$CLUSTER_NAME-acth") + if [[ -n $helm_exists ]]; then + echo "๐Ÿฅ‘ Uninstalling NTH helm chart" + helm uninstall "$CLUSTER_NAME-acth" -n kube-system fi } -function delete_node_group { - echo "Node Role Name: $node_role_name" +function delete_node_group_policy { + if [[ -z $node_role_name || -z $node_policy_name ]]; then return; fi + node_policy_exists=$(aws iam list-attached-role-policies --role-name $node_role_name | grep "$node_policy_name" || :) - echo $node_policy_exists if [[ -n $node_policy_exists ]]; then echo "๐Ÿฅ‘ Detaching NTH Node Group policy" aws iam detach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 64839292..34385364 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -276,14 +276,14 @@ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule function is_denylisted { if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - return 1 + return 0 fi - return 0 + return 1 } i=0 for assert_script in $ASSERTION_SCRIPTS; do - if [[ is_denylisted $assert_script ]]; then continue; fi + if is_denylisted $assert_script; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From f52c7b57474f6389c9fa84a3bfeb43381e6f978c Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 21 Dec 2023 13:42:43 -0600 Subject: [PATCH 22/27] Update ReadME for ASG launch lifecycle hook changes --- README.md | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3f23b4bd..d609bb72 100644 --- a/README.md +++ b/README.md @@ -218,8 +218,9 @@ You'll need the following AWS infrastructure components: 1. Amazon Simple Queue Service (SQS) Queue 2. AutoScaling Group Termination Lifecycle Hook -3. Amazon EventBridge Rule -4. IAM Role for the aws-node-termination-handler Queue Processing Pods +3. AutoScaling Group Launch Lifecycle Hook (optional) +4. Amazon EventBridge Rule +5. IAM Role for the aws-node-termination-handler Queue Processing Pods #### 1. Create an SQS Queue: @@ -294,7 +295,37 @@ aws autoscaling put-lifecycle-hook \ --role-arn ``` -#### 3. Tag the Instances: +#### 3. Create an ASG Launch Lifecycle Hook (optional): + +If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, then a new instance will be launched upon another's termination. The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. 
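+
+Note: the EventBridge rules created later in this README cover termination-related events only. If you route this launch hook through EventBridge, you will also need a rule that matches the launch lifecycle action and targets the SQS queue from Step 1. The following is only a sketch: the rule name is illustrative, and the queue ARN is the same placeholder used elsewhere in this README. The detail type matches the `EC2 Instance-launch Lifecycle Action` events that ASG emits for launch lifecycle hooks.
+
+```
+aws events put-rule \
+  --name MyK8sASGLaunchRule \
+  --event-pattern "{\"source\":[\"aws.autoscaling\"],\"detail-type\":[\"EC2 Instance-launch Lifecycle Action\"]}"
+
+aws events put-targets --rule MyK8sASGLaunchRule \
+  --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue"
+```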
+ +Here is the AWS CLI command to create a launch lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 +``` + +If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 \ + --notification-target-arn \ + --role-arn +``` + +The hook will be completed by NTH upon the instance's verified connection as a node. If not, the ABANDON default result will cause the instance to be terminated, and a new one to replace it repeating the same verification process. + +#### 4. Tag the Instances: By default the aws-node-termination-handler will only manage terminations for instances tagged with `key=aws-node-termination-handler/managed`. The value of the key does not matter. @@ -320,7 +351,7 @@ You can also control what resources NTH manages by adding the resource ARNs to y Take a look at the docs on how to [create rules that only manage certain ASGs](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html), and read about all the [supported ASG events](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-event-reference.html). -#### 4. Create Amazon EventBridge Rules +#### 5. Create Amazon EventBridge Rules You may skip this step if sending events from ASG to SQS directly. @@ -367,7 +398,7 @@ aws events put-targets --rule MyK8sScheduledChangeRule \ --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue" ``` -#### 5. Create an IAM Role for the Pods +#### 6. Create an IAM Role for the Pods There are many different ways to allow the aws-node-termination-handler pods to assume a role: From ebf8e43b16fe4092cbc0ac7dd74a100709b80266 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 26 Dec 2023 10:57:49 -0600 Subject: [PATCH 23/27] Fixed changes for README update --- README.md | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index d609bb72..8121be62 100644 --- a/README.md +++ b/README.md @@ -218,9 +218,11 @@ You'll need the following AWS infrastructure components: 1. Amazon Simple Queue Service (SQS) Queue 2. AutoScaling Group Termination Lifecycle Hook -3. AutoScaling Group Launch Lifecycle Hook (optional) -4. Amazon EventBridge Rule -5. IAM Role for the aws-node-termination-handler Queue Processing Pods +3. Amazon EventBridge Rule +4. IAM Role for the aws-node-termination-handler Queue Processing Pods + +Optional AWS infrastructure components: +1. AutoScaling Group Launch Lifecycle Hook #### 1. Create an SQS Queue: @@ -271,7 +273,9 @@ There are some caveats when using [server side encryption with SQS](https://docs #### 2. 
Create an ASG Termination Lifecycle Hook: -Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: +##### 2.1. Send Notification via EventBridge + +This will configure ASG to send termination notifications to EventBridge. ``` aws autoscaling put-lifecycle-hook \ @@ -282,7 +286,9 @@ aws autoscaling put-lifecycle-hook \ --heartbeat-timeout=300 ``` -If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: +##### 2.2. Send notifications directly to SQS + +This will configure ASG to send termination notifications directly to an SQS queue monitored by NTH. ``` aws autoscaling put-lifecycle-hook \ @@ -291,39 +297,43 @@ aws autoscaling put-lifecycle-hook \ --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ --default-result=CONTINUE \ --heartbeat-timeout=300 \ - --notification-target-arn \ + --notification-target-arn \ --role-arn ``` -#### 3. Create an ASG Launch Lifecycle Hook (optional): +#### 3. Handle ASG Instance Launch Lifecycle Notifications (optional): -If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, then a new instance will be launched upon another's termination. The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. +If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, a new instance will be launched before another's termination. The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. -Here is the AWS CLI command to create a launch lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: +##### 3.1. Send Notification via EventBridge + +This will configure ASG to send launch notifications to EventBridge. ``` aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-term-launch-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ --auto-scaling-group-name=my-k8s-asg \ --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ --default-result="ABANDON" \ --heartbeat-timeout=300 ``` -If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: +##### 3.2. Send notifications directly to SQS + +This will configure ASG to send launch notifications directly to an SQS queue monitored by NTH. ``` aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-term-launch-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ --auto-scaling-group-name=my-k8s-asg \ --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ --default-result="ABANDON" \ --heartbeat-timeout=300 \ - --notification-target-arn \ + --notification-target-arn \ --role-arn ``` -The hook will be completed by NTH upon the instance's verified connection as a node. If not, the ABANDON default result will cause the instance to be terminated, and a new one to replace it repeating the same verification process. 
+When NTH receives a launch notification, it will periodically check for a node backed by the EC2 instance to join the cluster and for the node to have a status of 'ready.' Once a node becomes ready, NTH will complete the lifecycle hook, prompting the ASG to proceed with terminating the previous instance. If the lifecycle hook is not completed before the timeout, the ASG will take the default action. If the default action is 'ABANDON,' the new instance will be terminated, and the notification process will be repeated with another new instance. #### 4. Tag the Instances: From d16182d2befd4b4453e24a9dc5755374beab3fff Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 2 Jan 2024 11:56:21 -0600 Subject: [PATCH 24/27] Use ASG launch event in SQS testing --- pkg/monitor/sqsevent/sqs-monitor.go | 1 + pkg/monitor/sqsevent/sqs-monitor_test.go | 32 ++++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 18febbb1..7dea9308 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -207,6 +207,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if err != nil { interruptionEvent, err = nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) } if lifecycleEvent.LifecycleTransition == ASGLaunchingLifecycleTransition { interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) diff --git a/pkg/monitor/sqsevent/sqs-monitor_test.go b/pkg/monitor/sqsevent/sqs-monitor_test.go index 61199ed2..8e827377 100644 --- a/pkg/monitor/sqsevent/sqs-monitor_test.go +++ b/pkg/monitor/sqsevent/sqs-monitor_test.go @@ -67,6 +67,26 @@ var asgLifecycleEvent = sqsevent.EventBridgeEvent{ }`), } +var asgLaunchLifecycleEvent = sqsevent.EventBridgeEvent{ + Version: "0", + ID: "83c632dd-0145-1ab0-ae93-a756ebf429b5", + DetailType: "EC2 Instance-launch Lifecycle Action", + Source: "aws.autoscaling", + Account: "123456789012", + Time: "2020-07-01T22:30:58Z", + Region: "us-east-1", + Resources: []string{ + "arn:aws:autoscaling:us-east-1:123456789012:autoScalingGroup:c4c64181-52c1-dd3f-20bb-f4a0965a09db:autoScalingGroupName/nth-test1", + }, + Detail: []byte(`{ + "LifecycleActionToken": "524632c5-3333-d52d-3992-d9633ec24ed7", + "AutoScalingGroupName": "nth-test1", + "LifecycleHookName": "node-termination-handler-launch", + "EC2InstanceId": "i-0a68bf5ef13e21b52", + "LifecycleTransition": "autoscaling:EC2_INSTANCE_LAUNCHING" + }`), +} + var asgLifecycleEventFromSQS = sqsevent.LifecycleDetail{ LifecycleHookName: "test-nth-asg-to-sqs", RequestID: "3775fac9-93c3-7ead-8713-159816566000", @@ -352,7 +372,7 @@ func TestMonitor_DrainTasks(t *testing.T) { } func TestMonitor_DrainTasks_Delay(t *testing.T) { - msg, err := getSQSMessageFromEvent(asgLifecycleEvent) + msg, err := getSQSMessageFromEvent(asgLaunchLifecycleEvent) h.Ok(t, err) sqsMock := h.MockedSQS{ @@ -384,13 +404,12 @@ func TestMonitor_DrainTasks_Delay(t *testing.T) { err = sqsMonitor.Monitor() h.Ok(t, err) - t.Run(asgLifecycleEvent.DetailType, func(st *testing.T) { + t.Run(asgLaunchLifecycleEvent.DetailType, func(st *testing.T) { result := <-drainChan - h.Equals(st, monitor.ASGLifecycleKind, result.Kind) + h.Equals(st, monitor.ASGLaunchLifecycleKind, 
result.Kind) h.Equals(st, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(st, result.NodeName, dnsNodeName) h.Assert(st, result.PostDrainTask != nil, "PostDrainTask should have been set") - h.Assert(st, result.PreDrainTask != nil, "PreDrainTask should have been set") err := result.PostDrainTask(result, node.Node{}) h.Ok(st, err) h.Assert(st, hookCalled, "BeforeCompleteLifecycleAction hook not called") @@ -457,7 +476,7 @@ func TestMonitor_DrainTasks_Errors(t *testing.T) { } func TestMonitor_DrainTasksASGFailure(t *testing.T) { - msg, err := getSQSMessageFromEvent(asgLifecycleEvent) + msg, err := getSQSMessageFromEvent(asgLaunchLifecycleEvent) h.Ok(t, err) messages := []*sqs.Message{ &msg, @@ -492,11 +511,10 @@ func TestMonitor_DrainTasksASGFailure(t *testing.T) { select { case result := <-drainChan: - h.Equals(t, monitor.ASGLifecycleKind, result.Kind) + h.Equals(t, monitor.ASGLaunchLifecycleKind, result.Kind) h.Equals(t, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(t, result.NodeName, dnsNodeName) h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") - h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") err = result.PostDrainTask(result, node.Node{}) h.Nok(t, err) default: From 8a9428172e437214d98f002884bbee1811b58f48 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 3 Jan 2024 16:09:34 -0600 Subject: [PATCH 25/27] Revise formatting for updated README --- README.md | 83 ++++++++++++-------------- test/e2e/asg-launch-lifecycle-sqs-test | 2 +- 2 files changed, 39 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 8121be62..0a906375 100644 --- a/README.md +++ b/README.md @@ -218,8 +218,9 @@ You'll need the following AWS infrastructure components: 1. Amazon Simple Queue Service (SQS) Queue 2. AutoScaling Group Termination Lifecycle Hook -3. Amazon EventBridge Rule -4. IAM Role for the aws-node-termination-handler Queue Processing Pods +3. Instance Tagging +4. Amazon EventBridge Rule +5. IAM Role for the aws-node-termination-handler Queue Processing Pods Optional AWS infrastructure components: 1. AutoScaling Group Launch Lifecycle Hook @@ -273,9 +274,7 @@ There are some caveats when using [server side encryption with SQS](https://docs #### 2. Create an ASG Termination Lifecycle Hook: -##### 2.1. Send Notification via EventBridge - -This will configure ASG to send termination notifications to EventBridge. +Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: ``` aws autoscaling put-lifecycle-hook \ @@ -286,9 +285,7 @@ aws autoscaling put-lifecycle-hook \ --heartbeat-timeout=300 ``` -##### 2.2. Send notifications directly to SQS - -This will configure ASG to send termination notifications directly to an SQS queue monitored by NTH. +If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: ``` aws autoscaling put-lifecycle-hook \ @@ -301,41 +298,7 @@ aws autoscaling put-lifecycle-hook \ --role-arn ``` -#### 3. Handle ASG Instance Launch Lifecycle Notifications (optional): - -If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, a new instance will be launched before another's termination. 
The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. - -##### 3.1. Send Notification via EventBridge - -This will configure ASG to send launch notifications to EventBridge. - -``` -aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-launch-hook \ - --auto-scaling-group-name=my-k8s-asg \ - --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ - --default-result="ABANDON" \ - --heartbeat-timeout=300 -``` - -##### 3.2. Send notifications directly to SQS - -This will configure ASG to send launch notifications directly to an SQS queue monitored by NTH. - -``` -aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-launch-hook \ - --auto-scaling-group-name=my-k8s-asg \ - --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ - --default-result="ABANDON" \ - --heartbeat-timeout=300 \ - --notification-target-arn \ - --role-arn -``` - -When NTH receives a launch notification, it will periodically check for a node backed by the EC2 instance to join the cluster and for the node to have a status of 'ready.' Once a node becomes ready, NTH will complete the lifecycle hook, prompting the ASG to proceed with terminating the previous instance. If the lifecycle hook is not completed before the timeout, the ASG will take the default action. If the default action is 'ABANDON,' the new instance will be terminated, and the notification process will be repeated with another new instance. - -#### 4. Tag the Instances: +#### 3. Tag the Instances: By default the aws-node-termination-handler will only manage terminations for instances tagged with `key=aws-node-termination-handler/managed`. The value of the key does not matter. @@ -361,7 +324,7 @@ You can also control what resources NTH manages by adding the resource ARNs to y Take a look at the docs on how to [create rules that only manage certain ASGs](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html), and read about all the [supported ASG events](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-event-reference.html). -#### 5. Create Amazon EventBridge Rules +#### 4. Create Amazon EventBridge Rules You may skip this step if sending events from ASG to SQS directly. @@ -408,7 +371,7 @@ aws events put-targets --rule MyK8sScheduledChangeRule \ --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue" ``` -#### 6. Create an IAM Role for the Pods +#### 5. Create an IAM Role for the Pods There are many different ways to allow the aws-node-termination-handler pods to assume a role: @@ -439,6 +402,36 @@ IAM Policy for aws-node-termination-handler Deployment: } ``` +#### 1. Handle ASG Instance Launch Lifecycle Notifications (optional): + +NTH can monitor for new instances launched by an ASG and notify the ASG when the instance is available in the EKS cluster. + +NTH will need to receive notifications of new instance launches within the ASG. 
We can add a lifecycle hook to the ASG that will send instance launch notifications via EventBridge: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 +``` + +Alternatively, ASG can send the instance launch notification directly to an SQS Queue: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 \ + --notification-target-arn \ + --role-arn +``` + +When NTH receives a launch notification, it will periodically check for a node backed by the EC2 instance to join the cluster and for the node to have a status of 'ready.' Once a node becomes ready, NTH will complete the lifecycle hook, prompting the ASG to proceed with terminating the previous instance. If the lifecycle hook is not completed before the timeout, the ASG will take the default action. If the default action is 'ABANDON', the new instance will be terminated, and the notification process will be repeated with another new instance. + ### Installation #### Pod Security Admission diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 4472f015..2df04cc0 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -415,7 +415,7 @@ function test_launch_lifecycle { elapsed_time=$((current_time - start_time)) if [[ $elapsed_time -ge $heartbeat_timeout ]]; then echo "" - echo "โŒ Launch Lifecycle not Completed. Timeout Reached โŒ" + echo "โŒ Timeout Reached โŒ" exit 1 fi From cd91851a6478e5f4085f10f0f60f59fd56353115 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 5 Jan 2024 10:24:38 -0600 Subject: [PATCH 26/27] Revised ASG Launch Lifecycle Assertion test to adhere to shellcheck. 
Updated test timeouts to match Assertionscript standards --- test/e2e/asg-launch-lifecycle-sqs-test | 121 ++++++++++++------------- test/k8s-local-cluster-test/run-test | 2 +- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 2df04cc0..05f80e0d 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -6,15 +6,16 @@ sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" node_policy_name="nth-test-node-policy" auto_scaling_role_name="AWSServiceRoleForAutoScaling_nth-test" -auto_scaling_policy_arn="arn:aws:iam::aws:policy/aws-service-role/AutoScalingServiceRolePolicy" fis_role_name="nth-test-fis-role" fis_template_name="nth-fis-test" fis_policy_arn="arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" account_id=$(aws sts get-caller-identity | jq -r '.Account') nth_label="Use-Case=NTH" heartbeat_timeout=$((3 * 60)) +LAUNCH_CHECK_CYCLES=15 +LAUNCH_ACTIVITY_CHECK_SLEEP=15 +LAUNCH_STATUS_CHECK_SLEEP=$((heartbeat_timeout / LAUNCH_CHECK_CYCLES)) ##### JSON FILES ##### @@ -212,18 +213,16 @@ function create_node_policy { function get_node_role_name { node_role_arn=$(aws eks describe-nodegroup --cluster-name $CLUSTER_NAME --nodegroup-name $node_group_name | jq -r .nodegroup.nodeRole) - split_node_role_arn=($(tr '/' ' ' <<< $node_role_arn)) - node_role_name=${split_node_role_arn[1]} + IFS="/" read -r -a node_role_arn_array <<< "$node_role_arn" + node_role_name=${node_role_arn_array[1]} } function set_node_data { instance_ids=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_name | jq -r '.AutoScalingGroups | .[0].Instances | .[].InstanceId') instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[].Instances | .[].InstanceId, .[].PrivateDnsName]') - nth_node_id=$(jq -r '.[0]' <<< $instance_data) nth_node_ip=$(jq -r '.[1]' <<< $instance_data) termination_node_id=$(jq -r '.[2]' <<< $instance_data) - termination_node_ip=$(jq -r '.[3]' <<< $instance_data) } function update_ASG { @@ -255,19 +254,20 @@ function update_ASG { } function create_auto_scaling_role { - auto_scaling_role_exists=$(aws iam get-role --role-name=$auto_scaling_role_name | grep "$auto_scaling_role_name" || :) + auto_scaling_role_exists=$(aws iam get-role --role-name=$auto_scaling_role_name 2> /dev/null | grep "$auto_scaling_role_name" || :) if [[ -z $auto_scaling_role_exists ]]; then echo "๐Ÿฅ‘ Creating Auto Scaling Role" auto_scaling_role_arn=$(aws iam create-service-linked-role --aws-service-name autoscaling.amazonaws.com --custom-suffix "nth-test" | jq -r '.Role.Arn') sleep 10 else echo "๐Ÿฅ‘ $auto_scaling_role_name already exists; continuing with test run" - auto_scaling_role_arn=$(aws iam get-role --role-name=$auto_scaling_role_name | jq -r '.Role.Arn') + auto_scaling_role_arn=$(aws iam get-role --role-name=$auto_scaling_role_name 2> /dev/null | jq -r '.Role.Arn') fi } ### HELM ### function install_helm { + get_aws_credentials anth_helm_args=( upgrade @@ -280,8 +280,8 @@ function install_helm { --set image.pullPolicy="Always" --set nodeSelector."${nth_label}" --set tolerations[0].operator=Exists - --set awsAccessKeyID=$(aws --profile default configure get aws_access_key_id) - --set awsSecretAccessKey=$(aws --profile default configure get aws_secret_access_key) + --set 
awsAccessKeyID="$aws_access_key_id" + --set awsSecretAccessKey="$aws_secret_access_key" --set awsRegion="${REGION}" --set checkTagBeforeDraining=false --set enableSqsTerminationDraining=true @@ -296,9 +296,24 @@ function install_helm { sleep 15 } +function get_aws_credentials { + echo "๐Ÿฅ‘ Retrieving AWS Credentials" + aws_access_key_id=$(aws --profile default configure get aws_access_key_id 2> /dev/null) + if [[ -z $aws_access_key_id ]]; then + echo "โŒ Failed to retrieve AWS Access Key โŒ" + exit 1 + fi + + aws_secret_access_key=$(aws --profile default configure get aws_secret_access_key 2> /dev/null) + if [[ -z $aws_access_key_id ]]; then + echo "โŒ Failed to retrieve AWS Secret Access Key โŒ" + exit 1 + fi +} + ### FIS ### function create_FIS_role { - fis_role_exists=$(aws iam get-role --role-name $fis_role_name | grep "$fis_role_name" || :) + fis_role_exists=$(aws iam get-role --role-name $fis_role_name 2> /dev/null | grep "$fis_role_name" || :) if [[ -z $fis_role_exists ]]; then echo "๐Ÿฅ‘ Creating FIS Role" fis_role_arn=$(aws iam create-role --role-name $fis_role_name --assume-role-policy-document file:///tmp/fis-role-trust-policy.json | jq -r '.Role.Arn') @@ -306,7 +321,7 @@ function create_FIS_role { sleep 10 else echo "๐Ÿฅ‘ $fis_role_name already exists; continuing with test run" - fis_role_arn=$(aws iam get-role --role-name=$fis_role_name | jq -r '.Role.Arn') + fis_role_arn=$(aws iam get-role --role-name=$fis_role_name 2> /dev/null | jq -r '.Role.Arn') fi } @@ -335,28 +350,21 @@ function start_FIS_experiment { create_experiment_template echo "๐Ÿฅ‘ Starting Experiment" experiment_start_time=$(date +%s) - experiment=$(aws fis start-experiment --experiment-template-id $template_id) + aws fis start-experiment --experiment-template-id $template_id > /dev/null } ##### TESTING ##### -function is_new_instance { - is_new="true" - if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_new="false" - fi -} - function convert_date_to_epoch_seconds { IFS='T' read -r date_part time_part <<< "$1" IFS='-' read -r year month day <<< "$date_part" IFS=':' read -r hour minute second_fractional <<< "$time_part" - IFS='.' read -r second fraction <<< "$second_fractional" + IFS='.' 
read -r -a seconds_array <<< "$second_fractional" IFS=':' read -r offset_hours offset_minutes <<< "${time_part:16:5}" # Convert time strings to base-10 integers year=$((10#$year + 0)); month=$((10#$month + 0)); day=$((10#$day + 0)) - hour=$((10#$hour + 0)); minute=$((10#$minute + 0)); second=$((10#$second + 0)) + hour=$((10#$hour + 0)); minute=$((10#$minute + 0)); second=$((10#${seconds_array[0]} + 0)) offset_hours=$((10#$offset_hours + 0)); offset_minutes=$((10#$offset_minutes + 0)) if [[ $time_part =~ .*"-".* ]]; then @@ -365,60 +373,47 @@ function convert_date_to_epoch_seconds { fi total_days=$(((year - 1970) * 365 + (year - 1970)/4)) - for ((i = 1; i < month; i++)); do - total_days=$((total_days + $(cal $i $year | awk 'NF {DAYS = $NF} END {print DAYS}'))) + for ((k = 1; k < month; k++)); do + total_days=$((total_days + $(cal $k $year | awk 'NF {DAYS = $NF} END {print DAYS}'))) done total_days=$((total_days + day - 1)) total_seconds=$((total_days * 86400 + (hour + offset_hours) * 3600 + (minute + offset_minutes) * 60 + second)) } function get_launch_activity { - max_duration=$((5 * 60)) - start_time=$(date +%s) - + echo "๐Ÿฅ‘ Finding launch activity " launch_activity="" - while [[ -z $launch_activity ]]; do - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - if [[ $elapsed_time -ge $max_duration ]]; then - echo "โŒ Failed to find a new launched instance. Timeout Reached โŒ" - exit 1 - fi - - sleep 5 + for i in $(seq 1 $LAUNCH_CHECK_CYCLES); do activities=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name) - activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode, .StartTime]' <<< $activities) + activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StartTime]' <<< $activities) num_activities=$(jq -r 'length' <<< $activities_details) - for i in $(seq 0 4 $((--num_activities))); do - id=$(jq -r .[$i] <<< $activities_details) - description=$(jq -r .[$((++i))] <<< $activities_details) - status=$(jq -r .[$((i+=2))] <<< $activities_details) - start=$(jq -r .[$((i+=3))] <<< $activities_details) + for j in $(seq 0 3 $((--num_activities))); do + id=$(jq -r .[$j] <<< $activities_details) + description=$(jq -r .[$((++j))] <<< $activities_details) + start=$(jq -r .[$((j+=2))] <<< $activities_details) activity_instance=${description##*:} convert_date_to_epoch_seconds $start if [[ $description =~ .*"Launching".* && $total_seconds -gt $experiment_start_time ]]; then launch_activity=$id - echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" - break + break 2 fi done + + echo "Setup Loop $i/$LAUNCH_CHECK_CYCLES, sleeping for $LAUNCH_ACTIVITY_CHECK_SLEEP seconds" + sleep $LAUNCH_ACTIVITY_CHECK_SLEEP done + + if [[ -n $launch_activity ]]; then + echo "โœ… Launch Activity found for instance $activity_instance" + else + echo "โŒ Failed to find a new launched instance โŒ" + exit 1 + fi } function test_launch_lifecycle { - aws sqs receive-message --queue-url $queue_url - echo -n "๐Ÿฅ‘ Waiting for launch hook completion." 
- - start_time=$(date +%s) - while [[ true ]]; do - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - if [[ $elapsed_time -ge $heartbeat_timeout ]]; then - echo "" - echo "โŒ Timeout Reached โŒ" - exit 1 - fi - + echo "๐Ÿฅ‘ Verifying launch hook completion " + for i in $(seq 1 $LAUNCH_CHECK_CYCLES); do activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[].StatusCode') if [[ $activity_status == "Successful" ]]; then echo "" @@ -429,9 +424,13 @@ function test_launch_lifecycle { echo "โŒ Launch Lifecycle $activity_status โŒ" exit 1 fi - echo -n "." - sleep 10 + + echo "Assertion Loop $i/$LAUNCH_CHECK_CYCLES, sleeping for $LAUNCH_STATUS_CHECK_SLEEP seconds" + sleep $LAUNCH_STATUS_CHECK_SLEEP done + + echo "โŒ Failed to verify launch hook completion โŒ" + exit 1 } @@ -457,14 +456,14 @@ function clean_up { fi if [[ -n $template_id ]]; then echo "๐Ÿฅ‘ Deleting FIS experiment template" - deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + aws fis delete-experiment-template --id $template_id --no-paginate > /dev/null fi echo "๐Ÿฅ‘ Detaching FIS role policy" aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn echo "๐Ÿฅ‘ Deleting FIS role" aws iam delete-role --role-name $fis_role_name echo "๐Ÿฅ‘ Deleting autoscaling role" - aws iam delete-service-linked-role --role-name $auto_scaling_role_name + aws iam delete-service-linked-role --role-name $auto_scaling_role_name > /dev/null if [[ -n $node_policy_arn ]]; then echo "๐Ÿฅ‘ Deleting Node role policy" aws iam delete-policy --policy-arn $node_policy_arn diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 34385364..be1e243c 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -275,7 +275,7 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite function is_denylisted { - if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + if [[ ${SCRIPT_DENYLIST[*]} =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then return 0 fi return 1 From a8197a4efad2d837b3eb1b93bd403170011b719c Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 18 Jan 2024 11:58:19 -0600 Subject: [PATCH 27/27] Update E22 EKS cluster test with ASG test script --- test/e2e/asg-launch-lifecycle-sqs-test | 2 +- test/eks-cluster-test/run-test | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 05f80e0d..42f21c98 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -node_group_name="spot-ng" +node_group_name="linux-ng" sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" node_policy_name="nth-test-node-policy" diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test index beebf0dd..949f68d1 100755 --- a/test/eks-cluster-test/run-test +++ b/test/eks-cluster-test/run-test @@ -194,7 +194,6 @@ function reset_cluster { if [[ -z ${assertion_scripts+x} ]]; then assertion_scripts=( - "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" "$SCRIPTPATH/../e2e/cordon-only-test" "$SCRIPTPATH/../e2e/imds-v2-test" "$SCRIPTPATH/../e2e/maintenance-event-cancellation-test" @@ -206,6 +205,8 @@ if 
[[ -z ${assertion_scripts+x} ]]; then #"$SCRIPTPATH/../e2e/webhook-http-proxy-test" #"$SCRIPTPATH/../e2e/webhook-secret-test" "$SCRIPTPATH/../e2e/webhook-test" + # This test terminates nodes in the cluster and needs to be run last + "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" ) fi
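A side note on the e2e script's timestamp handling: the `convert_date_to_epoch_seconds` helper above parses the ISO-8601 `StartTime` of a scaling activity by hand. Where GNU coreutils `date` is available (an assumption; the BSD `date` shipped with macOS lacks `-d`), the same conversion is a single call. The sketch below uses a sample timestamp taken from the EventBridge fixtures in this change set and is an alternative shown for illustration only, not a proposed edit to the test.

```
#!/bin/bash
# Sketch: convert an ISO-8601 timestamp (e.g. the StartTime returned by
# `aws autoscaling describe-scaling-activities`) to epoch seconds with GNU date.
# GNU date also accepts fractional seconds and numeric offsets such as -06:00.
start_time="2020-07-01T22:30:58Z"   # sample timestamp from the fixtures above
total_seconds=$(date -d "$start_time" +%s)
echo "$total_seconds"               # prints 1593642658
```

Hand-rolling the parse keeps the test script free of a GNU-specific dependency, which is presumably why it is done manually here.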