From ccbe23f6edd04e3844c8ccb53a05b422d77ebcb6 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 17 Aug 2023 15:56:35 -0500 Subject: [PATCH 01/27] Added check of ASG lifecycle hook informing of an EC2 instance launch before ASG termination event --- pkg/monitor/sqsevent/sqs-monitor.go | 48 ++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index e028506c..0d9eea1c 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -148,7 +148,8 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri } return eventBridgeEvent, skip{err} - case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING": + case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && + lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": log.Err(err).Msg("only lifecycle termination events from ASG to SQS are supported outside EventBridge") err = fmt.Errorf("unsupported message type (%s)", message.String()) return eventBridgeEvent, err @@ -157,12 +158,49 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri eventBridgeEvent.Source = "aws.autoscaling" eventBridgeEvent.Time = lifecycleEvent.Time eventBridgeEvent.ID = lifecycleEvent.RequestID + eventBridgeEvent.DetailType = lifecycleEvent.LifecycleTransition eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) log.Debug().Msg("processing lifecycle termination event from ASG") return eventBridgeEvent, err } +// Receives and processes SQS messages to check for ASG lifecycle hook informing of an EC2 instance launch +func (m SQSMonitor) newASGInstanceLifeCycleReceived() (bool, error) { + newInstanceCreated := false + messages, err := m.receiveQueueMessages(m.QueueURL) + if err != nil { + log.Err(err).Msg("Error receiveing SQS queue messages.") + return false, err + } + + failedEventBridgeEvents := 0 + for _, message := range messages { + eventBridgeEvent, err := m.processSQSMessage(message) + if err != nil { + var s skip + if errors.As(err, &s) { + log.Warn().Err(s).Msg("skip processing SQS message") + } else { + log.Err(err).Msg("error processing SQS message") + } + continue + } + + if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { + log.Info().Msg("New EC2 instance created by ASG") + newInstanceCreated = true + break + } + } + + if len(messages) > 0 && failedEventBridgeEvents == len(messages) { + err = fmt.Errorf("none of the waiting queue events could be processed") + } + + return newInstanceCreated, err +} + // processEventBridgeEvent processes an EventBridge event and returns interruption event wrappers func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} @@ -171,6 +209,14 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, switch eventBridgeEvent.Source { case "aws.autoscaling": + newInstanceCreated, err := m.newASGInstanceLifeCycleReceived() + for !newInstanceCreated { + if err != nil { + log.Err(err) + break + } + newInstanceCreated, err = m.newASGInstanceLifeCycleReceived() + } interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) From ed2a317e1b18a2da9181a6a1c2dfc6a244b9df7e Mon 
Sep 17 00:00:00 2001 From: Gavin Burris Date: Mon, 21 Aug 2023 14:38:29 -0500 Subject: [PATCH 02/27] Completes ASG launch lifecycle hook if new node is ready in cluster --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 92 +++++++++++++++++++-- pkg/monitor/sqsevent/sqs-monitor.go | 50 ++--------- 2 files changed, 90 insertions(+), 52 deletions(-) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 5c088030..77f59eb3 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -14,8 +14,10 @@ package sqsevent import ( + "context" "encoding/json" "fmt" + "strings" "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-node-termination-handler/pkg/node" @@ -24,6 +26,10 @@ import ( "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog/log" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" ) /* Example SQS ASG Lifecycle Termination Event Message: @@ -92,13 +98,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m } interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { - _, err := m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ - AutoScalingGroupName: &lifecycleDetail.AutoScalingGroupName, - LifecycleActionResult: aws.String("CONTINUE"), - LifecycleHookName: &lifecycleDetail.LifecycleHookName, - LifecycleActionToken: &lifecycleDetail.LifecycleActionToken, - InstanceId: &lifecycleDetail.EC2InstanceID, - }) + _, err := m.continueLifecycleAction(lifecycleDetail) if err != nil { if aerr, ok := err.(awserr.RequestFailure); ok && aerr.StatusCode() != 400 { return err @@ -124,3 +124,81 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } + +func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (*autoscaling.CompleteLifecycleActionOutput, error) { + return m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ + AutoScalingGroupName: &lifecycleDetail.AutoScalingGroupName, + LifecycleActionResult: aws.String("CONTINUE"), + LifecycleHookName: &lifecycleDetail.LifecycleHookName, + LifecycleActionToken: &lifecycleDetail.LifecycleActionToken, + InstanceId: &lifecycleDetail.EC2InstanceID, + }) +} + +func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent) error { + lifecycleDetail := &LifecycleDetail{} + err := json.Unmarshal(event.Detail, lifecycleDetail) + if err != nil { + return err + } + + if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { + return skip{fmt.Errorf("message is an ASG test notification")} + } + + if m.isNodeReady(lifecycleDetail) { + _, err = m.continueLifecycleAction(lifecycleDetail) + } else { + err = skip{fmt.Errorf("New ASG instance has not connected to cluster")} + } + return err +} + +// If the Node, new EC2 instance, is ready in the K8s cluster +func (m SQSMonitor) isNodeReady(lifecycleDetail *LifecycleDetail) bool { + nodes, err := m.getNodes() + if err != nil { + return false + } + + for _, node := range nodes.Items { + instanceID := m.getInstanceID(node) + if instanceID != lifecycleDetail.EC2InstanceID { + break + } + + conditions := node.Status.Conditions + for _, condition := range conditions { + if condition.Type == "Ready" && 
condition.Status == "True" { + return true + } + } + } + return false +} + +// Gets Nodes connected to K8s cluster +func (m SQSMonitor) getNodes() (*v1.NodeList, error) { + clusterConfig, err := rest.InClusterConfig() + if err != nil { + return nil, err + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(clusterConfig) + if err != nil { + return nil, err + } + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return nil, err + } + return nodes, err +} + +// Gets EC2 InstanceID from ProviderID, format: aws:///$az/$instanceid +func (m SQSMonitor) getInstanceID(node v1.Node) string { + providerID := node.Spec.ProviderID + providerIDSplit := strings.Split(providerID, "/") + instanceID := providerIDSplit[len(providerID)-1] + return instanceID +} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 0d9eea1c..0898eb50 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -161,46 +161,10 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri eventBridgeEvent.DetailType = lifecycleEvent.LifecycleTransition eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) - log.Debug().Msg("processing lifecycle termination event from ASG") + log.Debug().Msg("processing lifecycle event from ASG") return eventBridgeEvent, err } -// Receives and processes SQS messages to check for ASG lifecycle hook informing of an EC2 instance launch -func (m SQSMonitor) newASGInstanceLifeCycleReceived() (bool, error) { - newInstanceCreated := false - messages, err := m.receiveQueueMessages(m.QueueURL) - if err != nil { - log.Err(err).Msg("Error receiveing SQS queue messages.") - return false, err - } - - failedEventBridgeEvents := 0 - for _, message := range messages { - eventBridgeEvent, err := m.processSQSMessage(message) - if err != nil { - var s skip - if errors.As(err, &s) { - log.Warn().Err(s).Msg("skip processing SQS message") - } else { - log.Err(err).Msg("error processing SQS message") - } - continue - } - - if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { - log.Info().Msg("New EC2 instance created by ASG") - newInstanceCreated = true - break - } - } - - if len(messages) > 0 && failedEventBridgeEvents == len(messages) { - err = fmt.Errorf("none of the waiting queue events could be processed") - } - - return newInstanceCreated, err -} - // processEventBridgeEvent processes an EventBridge event and returns interruption event wrappers func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} @@ -209,15 +173,11 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, switch eventBridgeEvent.Source { case "aws.autoscaling": - newInstanceCreated, err := m.newASGInstanceLifeCycleReceived() - for !newInstanceCreated { - if err != nil { - log.Err(err) - break - } - newInstanceCreated, err = m.newASGInstanceLifeCycleReceived() + if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { + err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) + } else if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_TERMINATING" { + interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } - interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) return append(interruptionEventWrappers, 
InterruptionEventWrapper{interruptionEvent, err}) case "aws.ec2": From 4f47cb1a533b03eb7a0f34cac625eaab196313e9 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Mon, 21 Aug 2023 14:46:47 -0500 Subject: [PATCH 03/27] Avoid processing of interuption event for launching --- pkg/monitor/sqsevent/sqs-monitor.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 0898eb50..33c979a7 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -175,6 +175,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, case "aws.autoscaling": if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) + interruptionEvent = nil } else if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } From 2415c4c292b91d4faa146cce9d2f5c69cc4fedaf Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 23 Aug 2023 16:15:22 -0500 Subject: [PATCH 04/27] Fixed logic flow for how ASG hooks were checked. Added error logs and error messages --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 29 ++++++++++++--------- pkg/monitor/sqsevent/sqs-monitor.go | 8 +++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 77f59eb3..cf6ba327 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -125,6 +125,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } +// Continues the lifecycle hook thereby indicating a successful action occured func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (*autoscaling.CompleteLifecycleActionOutput, error) { return m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ AutoScalingGroupName: &lifecycleDetail.AutoScalingGroupName, @@ -135,36 +136,38 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* }) } +// Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent) error { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return err + return fmt.Errorf("unmarshing ASG lifecycle event: %w", err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { return skip{fmt.Errorf("message is an ASG test notification")} } - if m.isNodeReady(lifecycleDetail) { + if isNodeReady(lifecycleDetail) { _, err = m.continueLifecycleAction(lifecycleDetail) } else { - err = skip{fmt.Errorf("New ASG instance has not connected to cluster")} + err = skip{fmt.Errorf("new ASG instance has not connected to cluster")} } return err } // If the Node, new EC2 instance, is ready in the K8s cluster -func (m SQSMonitor) isNodeReady(lifecycleDetail *LifecycleDetail) bool { - nodes, err := m.getNodes() +func isNodeReady(lifecycleDetail *LifecycleDetail) bool { + nodes, err := getNodes() if err != nil { + log.Err(fmt.Errorf("getting nodes from cluster: %w", err)) return false } for _, node := range nodes.Items { - instanceID := m.getInstanceID(node) + instanceID := getInstanceID(node) if instanceID != 
lifecycleDetail.EC2InstanceID { - break + continue } conditions := node.Status.Conditions @@ -173,30 +176,32 @@ func (m SQSMonitor) isNodeReady(lifecycleDetail *LifecycleDetail) bool { return true } } + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) } + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", lifecycleDetail.EC2InstanceID)) return false } // Gets Nodes connected to K8s cluster -func (m SQSMonitor) getNodes() (*v1.NodeList, error) { +func getNodes() (*v1.NodeList, error) { clusterConfig, err := rest.InClusterConfig() if err != nil { - return nil, err + return nil, fmt.Errorf("retreiving cluster config: %w", err) } // creates the clientset clientset, err := kubernetes.NewForConfig(clusterConfig) if err != nil { - return nil, err + return nil, fmt.Errorf("creating new clientset with config: %w", err) } nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { - return nil, err + return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) } return nodes, err } // Gets EC2 InstanceID from ProviderID, format: aws:///$az/$instanceid -func (m SQSMonitor) getInstanceID(node v1.Node) string { +func getInstanceID(node v1.Node) string { providerID := node.Spec.ProviderID providerIDSplit := strings.Split(providerID, "/") instanceID := providerIDSplit[len(providerID)-1] diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 33c979a7..c11a7e32 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -158,7 +158,6 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri eventBridgeEvent.Source = "aws.autoscaling" eventBridgeEvent.Time = lifecycleEvent.Time eventBridgeEvent.ID = lifecycleEvent.RequestID - eventBridgeEvent.DetailType = lifecycleEvent.LifecycleTransition eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) log.Debug().Msg("processing lifecycle event from ASG") @@ -169,14 +168,15 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} interruptionEvent := &monitor.InterruptionEvent{} - var err error + lifecycleEvent := LifecycleDetail{} + err := json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) switch eventBridgeEvent.Source { case "aws.autoscaling": - if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_LAUNCHING" { + if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) interruptionEvent = nil - } else if eventBridgeEvent.DetailType == "autoscaling:EC2_INSTANCE_TERMINATING" { + } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) From a28edd69b47b0187f5022ff61676a9c4dc2071d2 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 25 Aug 2023 11:45:01 -0500 Subject: [PATCH 05/27] Created ASG launch lifecyle test script --- test/e2e/asg-launch-lifecycle-sqs-test | 197 ++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100755 test/e2e/asg-launch-lifecycle-sqs-test diff --git a/test/e2e/asg-launch-lifecycle-sqs-test 
b/test/e2e/asg-launch-lifecycle-sqs-test new file mode 100755 index 00000000..8751fe9a --- /dev/null +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -0,0 +1,197 @@ +#!/bin/bash +set -euo pipefail + +# Available env vars: +# $TMP_DIR +# $CLUSTER_NAME +# $KUBECONFIG +# $NODE_TERMINATION_HANDLER_DOCKER_REPO +# $NODE_TERMINATION_HANDLER_DOCKER_TAG +# $WEBHOOK_DOCKER_REPO +# $WEBHOOK_DOCKER_TAG +# $AEMM_URL +# $AEMM_VERSION + + +function fail_and_exit { + echo "โŒ ASG Lifecycle SQS Test failed $CLUSTER_NAME โŒ" + exit "${1:-1}" +} + +echo "Starting ASG Lifecycle SQS Test for Node Termination Handler" +START_TIME=$(date -u +"%Y-%m-%dT%TZ") + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +common_helm_args=() + +localstack_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-localstack" + "$SCRIPTPATH/../../config/helm/localstack/" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set defaultRegion="${AWS_REGION}" + --wait +) + +set -x +helm "${localstack_helm_args[@]}" +set +x + +sleep 10 + +RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'" +localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \ + -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \ + | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }') +echo "๐Ÿฅ‘ Using localstack pod ${localstack_pod}" +run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}") +private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName') +instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId') +echo "๐Ÿฅ‘ Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}" +set -x +CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}" +queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl) + +echo "๐Ÿฅ‘ Created SQS Queue ${queue_url}" + +anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=foo + --set awsSecretAccessKey=bar + --set awsRegion="${AWS_REGION}" + --set awsEndpoint="http://localstack.default" + --set checkTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set queueURL="${queue_url}" + --wait +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-emtp" + "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" + --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" + --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait +) +[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set 
webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x + +TAINT_CHECK_CYCLES=15 +TAINT_CHECK_SLEEP=15 + +DEPLOYED=0 + +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "โœ… Verified regular-pod-test pod was scheduled and started!" + DEPLOYED=1 + break + fi + echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $DEPLOYED -eq 0 ]]; then + echo "โŒ regular-pod-test pod deployment failed" + fail_and_exit 2 +fi + +test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" +nodes=$(kubectl get nodes "${test_node}") + +if [[${#nodes[@]} -eq 0]]; then + echo "โŒ new instance was not found in the cluster" + fail_and_exit 2 +conditions=$(kubectl get nodes -o jsonpath='{.status.conditions}') +launched=0 + +for i in $conditions; do + if [[$i.type == "Ready" && $i.status == "True"]]; then + echo "โœ… Verified the new instance in ready in the cluster!" + launched=1 + fi +done + +if [[$launched -eq 0]]; then + echo "โŒ new instance" + +ASG_TERMINATE_EVENT=$(cat < /dev/null; then + echo "โœ… Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "โœ… Verified the regular-pod-test pod was evicted!" + echo "โœ… ASG Lifecycle SQS Test Passed $CLUSTER_NAME! โœ…" + exit 0 + fi + echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $cordoned -eq 0 ]]; then + echo "โŒ Worker node was not cordoned" +else + echo "โŒ regular-pod-test was not evicted" +fi + +fail_and_exit 1 From 12995c253473704b2aaa4f55097620d5206ed16d Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 8 Sep 2023 15:32:40 -0500 Subject: [PATCH 06/27] ASG Launch Lifecyle can be completed. 
ASG Terminate Lifecycle hook is not completed by NTH to allow for Capacity Rebalance --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 47 +++-- pkg/monitor/sqsevent/sqs-monitor.go | 22 ++- test/e2e/asg-launch-lifecycle-sqs-test | 197 -------------------- 3 files changed, 49 insertions(+), 217 deletions(-) delete mode 100755 test/e2e/asg-launch-lifecycle-sqs-test diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index cf6ba327..e03ba9b9 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -22,7 +22,6 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-node-termination-handler/pkg/node" "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog/log" @@ -56,6 +55,10 @@ import ( const TEST_NOTIFICATION = "autoscaling:TEST_NOTIFICATION" +type LifecycleDetailMessage struct { + Message interface{} `json:"Message"` +} + // LifecycleDetail provides the ASG lifecycle event details type LifecycleDetail struct { LifecycleActionToken string `json:"LifecycleActionToken"` @@ -98,15 +101,6 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m } interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { - _, err := m.continueLifecycleAction(lifecycleDetail) - if err != nil { - if aerr, ok := err.(awserr.RequestFailure); ok && aerr.StatusCode() != 400 { - return err - } - } - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) errs := m.deleteMessages([]*sqs.Message{message}) if errs != nil { return errs[0] @@ -125,6 +119,17 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } +func (m SQSMonitor) logAndDeleteLifecycle(lifecycleDetail *LifecycleDetail, message *sqs.Message) error { + log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", + lifecycleDetail.LifecycleHookName, + lifecycleDetail.EC2InstanceID) + errs := m.deleteMessages([]*sqs.Message{message}) + if errs != nil { + return errs[0] + } + return nil +} + // Continues the lifecycle hook thereby indicating a successful action occured func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (*autoscaling.CompleteLifecycleActionOutput, error) { return m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{ @@ -137,22 +142,28 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* } // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent) error { +func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return fmt.Errorf("unmarshing ASG lifecycle event: %w", err) + return fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { - return skip{fmt.Errorf("message is an ASG test notification")} + return ignore{skip{fmt.Errorf("message is an ASG test notification")}} + } + + if !isNodeReady(lifecycleDetail) { + 
return ignore{skip{fmt.Errorf("new ASG instance has not connected to cluster")}} } - if isNodeReady(lifecycleDetail) { - _, err = m.continueLifecycleAction(lifecycleDetail) - } else { - err = skip{fmt.Errorf("new ASG instance has not connected to cluster")} + _, err = m.continueLifecycleAction(lifecycleDetail) + + if err != nil { + return ignore{skip{fmt.Errorf("completing ASG launch lifecyle: %w", err)}} } + + err = m.logAndDeleteLifecycle(lifecycleDetail, message) return err } @@ -204,6 +215,6 @@ func getNodes() (*v1.NodeList, error) { func getInstanceID(node v1.Node) string { providerID := node.Spec.ProviderID providerIDSplit := strings.Split(providerID, "/") - instanceID := providerIDSplit[len(providerID)-1] + instanceID := providerIDSplit[len(providerIDSplit)-1] return instanceID } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index c11a7e32..7e7691d6 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -72,6 +72,18 @@ func (s skip) Unwrap() error { return s.err } +type ignore struct { + err error +} + +func (i ignore) Error() string { + return i.err.Error() +} + +func (i ignore) Unwrap() error { + return i.err +} + // Kind denotes the kind of monitor func (m SQSMonitor) Kind() string { return SQSMonitorKind @@ -133,8 +145,10 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { eventBridgeEvent := EventBridgeEvent{} + lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(*message.Body), &lifecycleEvent) + err := json.Unmarshal([]byte(*message.Body), &lifecycleEventMessage) + err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) switch { case err != nil: @@ -174,7 +188,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, switch eventBridgeEvent.Source { case "aws.autoscaling": if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - err = m.asgCompleteLaunchLifecycle(eventBridgeEvent) + err = m.asgCompleteLaunchLifecycle(eventBridgeEvent, message) interruptionEvent = nil } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) @@ -206,9 +220,13 @@ func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []Interr dropMessageSuggestionCount := 0 failedInterruptionEventsCount := 0 var skipErr skip + var ignoreErr ignore for _, eventWrapper := range interruptionEventWrappers { switch { + case errors.As(eventWrapper.Err, &ignoreErr): + log.Warn().Err(ignoreErr).Msg("ASG launch cycle not continued") + case errors.As(eventWrapper.Err, &skipErr): log.Warn().Err(skipErr).Msg("dropping event") dropMessageSuggestionCount++ diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test deleted file mode 100755 index 8751fe9a..00000000 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Available env vars: -# $TMP_DIR -# $CLUSTER_NAME -# $KUBECONFIG -# $NODE_TERMINATION_HANDLER_DOCKER_REPO -# $NODE_TERMINATION_HANDLER_DOCKER_TAG -# $WEBHOOK_DOCKER_REPO -# $WEBHOOK_DOCKER_TAG -# $AEMM_URL -# 
$AEMM_VERSION - - -function fail_and_exit { - echo "โŒ ASG Lifecycle SQS Test failed $CLUSTER_NAME โŒ" - exit "${1:-1}" -} - -echo "Starting ASG Lifecycle SQS Test for Node Termination Handler" -START_TIME=$(date -u +"%Y-%m-%dT%TZ") - -SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" - -common_helm_args=() - -localstack_helm_args=( - upgrade - --install - --namespace default - "$CLUSTER_NAME-localstack" - "$SCRIPTPATH/../../config/helm/localstack/" - --set nodeSelector."${NTH_CONTROL_LABEL}" - --set defaultRegion="${AWS_REGION}" - --wait -) - -set -x -helm "${localstack_helm_args[@]}" -set +x - -sleep 10 - -RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'" -localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \ - -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \ - | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }') -echo "๐Ÿฅ‘ Using localstack pod ${localstack_pod}" -run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}") -private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName') -instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId') -echo "๐Ÿฅ‘ Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}" -set -x -CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}" -queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl) - -echo "๐Ÿฅ‘ Created SQS Queue ${queue_url}" - -anth_helm_args=( - upgrade - --install - --namespace kube-system - "$CLUSTER_NAME-acth" - "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" - --set nodeSelector."${NTH_CONTROL_LABEL}" - --set tolerations[0].operator=Exists - --set awsAccessKeyID=foo - --set awsSecretAccessKey=bar - --set awsRegion="${AWS_REGION}" - --set awsEndpoint="http://localstack.default" - --set checkTagBeforeDraining=false - --set enableSqsTerminationDraining=true - --set queueURL="${queue_url}" - --wait -) -[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && - anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") -[[ ${#common_helm_args[@]} -gt 0 ]] && - anth_helm_args+=("${common_helm_args[@]}") - -set -x -helm "${anth_helm_args[@]}" -set +x - -emtp_helm_args=( - upgrade - --install - --namespace default - "$CLUSTER_NAME-emtp" - "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" - --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" - --wait -) -[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && - emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") -[[ ${#common_helm_args[@]} -gt 0 ]] && - emtp_helm_args+=("${common_helm_args[@]}") - -set -x -helm "${emtp_helm_args[@]}" -set +x - -TAINT_CHECK_CYCLES=15 -TAINT_CHECK_SLEEP=15 - -DEPLOYED=0 - -for i in $(seq 1 $TAINT_CHECK_CYCLES); do - if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then - echo "โœ… Verified 
regular-pod-test pod was scheduled and started!" - DEPLOYED=1 - break - fi - echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" - sleep $TAINT_CHECK_SLEEP -done - -if [[ $DEPLOYED -eq 0 ]]; then - echo "โŒ regular-pod-test pod deployment failed" - fail_and_exit 2 -fi - -test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -nodes=$(kubectl get nodes "${test_node}") - -if [[${#nodes[@]} -eq 0]]; then - echo "โŒ new instance was not found in the cluster" - fail_and_exit 2 -conditions=$(kubectl get nodes -o jsonpath='{.status.conditions}') -launched=0 - -for i in $conditions; do - if [[$i.type == "Ready" && $i.status == "True"]]; then - echo "โœ… Verified the new instance in ready in the cluster!" - launched=1 - fi -done - -if [[$launched -eq 0]]; then - echo "โŒ new instance" - -ASG_TERMINATE_EVENT=$(cat < /dev/null; then - echo "โœ… Verified the worker node was cordoned!" - cordoned=1 - fi - - if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then - echo "โœ… Verified the regular-pod-test pod was evicted!" - echo "โœ… ASG Lifecycle SQS Test Passed $CLUSTER_NAME! โœ…" - exit 0 - fi - echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" - sleep $TAINT_CHECK_SLEEP -done - -if [[ $cordoned -eq 0 ]]; then - echo "โŒ Worker node was not cordoned" -else - echo "โŒ regular-pod-test was not evicted" -fi - -fail_and_exit 1 From 000aa2c630534e598fcd0a3de46913ae6ba2786e Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 19 Sep 2023 12:29:41 -0500 Subject: [PATCH 07/27] Created bash script for testing ASG lifecycle hook completion Add functionality for NTH to catch and complete ASG launch lifecycle hooks. Created acceptance test script to test ASG launch lifecycle hook completion --- test/e2e/asg-launch-lifecycle-sqs-test | 355 +++++++++++++++++++++ test/eks-cluster-test/node_group-spec.yaml | 15 + 2 files changed, 370 insertions(+) create mode 100755 test/e2e/asg-launch-lifecycle-sqs-test create mode 100644 test/eks-cluster-test/node_group-spec.yaml diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test new file mode 100755 index 00000000..db7013f1 --- /dev/null +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -0,0 +1,355 @@ +#!/bin/bash +set -euo pipefail + +REGION="us-west-2" +CLUSTER_NAME="nth-eks-cluster-test" + +node_group_name="nth-eks-cluster-test-spot-ng" +sqs_queue_name="nth-sqs-test" +sns_topic_name="nth-sns-test" +auto_scaling_role_name="AWSServiceRoleForAutoScaling_nth-test" +auto_scaling_policy_arn="arn:aws:iam::aws:policy/aws-service-role/AutoScalingServiceRolePolicy" +fis_role_name="nth-test-fis-role" +fis_template_name="nth-fis-test" +fis_policy_arn="arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access" +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" +NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" +account_id=$(aws sts get-caller-identity | jq -r '.Account') +nth_label="Use-Case=NTH" + + +## Queue Policy +QUEUE_POLICY=$(cat < /tmp/queue-attributes.json +{ +"MessageRetentionPeriod": "300", +"Policy": "$(echo $QUEUE_POLICY | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", +"SqsManagedSseEnabled": "true" +} +EOF + +queue_url=$(aws sqs create-queue --queue-name "${sqs_queue_name}" --attributes file:///tmp/queue-attributes.json | jq -r .QueueUrl) +} + +function provision_sqs_queue { + queue_exists=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name) + if [[ -z 
$queue_exists ]]; then + echo "๐Ÿฅ‘ Provisioning SQS Queue" + create_queue + else + echo "๐Ÿฅ‘ $sqs_queue_name already exists; continuing with test run" + queue_url=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name | jq -r '.QueueUrls | .[0]') + fi + + echo "Queue URL: $queue_url" + sqs_arn=$(aws sqs get-queue-attributes --queue-url=$queue_url --attribute-names=QueueArn | jq -r .Attributes.QueueArn) +} + +function provision_sns_topic { + topic_exists=$(aws sns list-topics | grep "$sns_topic_name" || :) + if [[ -z $topic_exists ]]; then + echo "๐Ÿฅ‘ Provisioning SNS Topic" + sns_arn=$(aws sns create-topic --name $sns_topic_name | jq -r .TopicArn) + else + echo "๐Ÿฅ‘ $sns_topic_name already exists; continuing with test run" + sns_arn=$(aws sns list-topics | jq -r '.Topics | .[] | .TopicArn' | grep "nth-sns-test") + fi + echo "SNS ARN: $sns_arn" +} + +function subscribe_sqs_to_sns { + num_subscriptions=$(aws sns list-subscriptions-by-topic --topic-arn $sns_arn | jq '.Subscriptions | length') + if [[ $num_subscriptions -eq 0 ]]; then + echo "๐Ÿฅ‘ Subscribing $sns_topic_name to $sqs_queue_name" + aws sns subscribe --topic-arn $sns_arn --protocol sqs --notification-endpoint $sqs_arn + else + echo "๐Ÿฅ‘ $sns_topic_name already subscribed to $sqs_queue_name; continuing with test run" + fi +} + +function install_helm { + anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set image.pullPolicy="Always" + --set nodeSelector."${nth_label}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=foo + --set awsSecretAccessKey=bar + --set awsRegion="${REGION}" + --set checkTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set queueURL="${queue_url}" + --wait + ) + + set -x + helm "${anth_helm_args[@]}" + set +x + + sleep 15 +} + +function provision_node_group { + node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) + if [[ -z $node_group_exists ]]; then + echo "๐Ÿฅ‘ Provisioning Spot Node Group" + else + echo "๐Ÿฅ‘ Re-initializing $node_group_name for testing purposes" + eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + echo "" + + node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) + echo -n "Node group Deleting." + while [[ -n $node_group_exists ]]; do + echo -n "." 
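+        # Poll until eksctl no longer reports the node group, then recreate it below so each
+        # test run starts from a freshly provisioned ASG (per the "Re-initializing ... for
+        # testing purposes" message above). Comment added for clarity; not part of the original patch.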
+ node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) + sleep 10 + done + echo "" + sleep 20 + # asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') + fi + + eksctl create nodegroup --config-file=$NODE_GROUP_CONFIG_FILE + update_ASG + + instance_ids=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_name | jq -r '.AutoScalingGroups | .[0].Instances | .[].InstanceId') + instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[] | .Instances | .[].InstanceId, .[].PrivateDnsName]') + + nth_node_id=$(jq -r '.[0]' <<< $instance_data) + nth_node_ip=$(jq -r '.[1]' <<< $instance_data) + termination_node_id=$(jq -r '.[2]' <<< $instance_data) + termination_node_ip=$(jq -r '.[3]' <<< $instance_data) + + kubectl label nodes $nth_node_ip $nth_label +} + +function update_ASG { + asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') + echo "Auto Scaling Group: $asg_name" + + echo "๐Ÿฅ‘ Setting Capacity Rebalance" + aws autoscaling update-auto-scaling-group --auto-scaling-group-name $asg_name --capacity-rebalance + echo "๐Ÿฅ‘ Tagging ASG" + aws autoscaling create-or-update-tags --tags ResourceId=$asg_name,ResourceType=auto-scaling-group,Key=aws-node-termination-handler/managed,Value=,PropagateAtLaunch=true + + create_auto_scaling_role + echo "๐Ÿฅ‘ Creating Lifecycle Hooks" + aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Launch-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_LAUNCHING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="ABANDON" + aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Terminate-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_TERMINATING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="CONTINUE" +} + +function create_auto_scaling_role { + auto_scaling_role_exists=$(aws iam get-role --role-name=$auto_scaling_role_name | grep "$auto_scaling_role_name" || :) + if [[ -z $auto_scaling_role_exists ]]; then + echo "๐Ÿฅ‘ Creating Auto Scaling Role" + auto_scaling_role_arn=$(aws iam create-service-linked-role --aws-service-name autoscaling.amazonaws.com --custom-suffix "nth-test" | jq -r '.Role.Arn') + sleep 10 + else + echo "๐Ÿฅ‘ $auto_scaling_role_name already exists; continuing with test run" + auto_scaling_role_arn=$(aws iam get-role --role-name=$auto_scaling_role_name | jq -r '.Role.Arn') + fi +} + +function create_FIS_role { + fis_role_exists=$(aws iam get-role --role-name $fis_role_name | grep "$fis_role_name" || :) + if [[ -z $fis_role_exists ]]; then + cat << EOF > /tmp/fis-role-trust-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "fis.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] +} +EOF + echo "๐Ÿฅ‘ Creating FIS Role" + fis_role_arn=$(aws iam create-role --role-name $fis_role_name --assume-role-policy-document file:///tmp/fis-role-trust-policy.json | jq -r '.Role.Arn') + aws iam attach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn + sleep 10 + else + echo "๐Ÿฅ‘ $fis_role_name already exists; continuing with test run" + fis_role_arn=$(aws iam get-role --role-name=$fis_role_name 
| jq -r '.Role.Arn') + fi +} + +function create_experiment_template { + experiment_exists=$(aws fis list-experiment-templates | grep "$fis_template_name" || :) + if [[ -z $experiment_exists ]]; then + create_FIS_role + cat << EOF > /tmp/fis-experiment-template.json +{ + "description": "Test Spot Instance interruptions", + "targets": { + "oneSpotInstance": { + "resourceType": "aws:ec2:spot-instance", + "resourceTags": { + "Name": "interruptMe" + }, + "filters": [ + { + "path": "State.Name", + "values": [ + "running" + ] + } + ], + "selectionMode": "COUNT(1)" + } + }, + "actions": { + "interruptSpotInstance": { + "actionId": "aws:ec2:send-spot-instance-interruptions", + "parameters": { + "durationBeforeInterruption": "PT2M" + }, + "targets": { + "SpotInstances": "oneSpotInstance" + } + } + }, + "stopConditions": [ + { + "source": "none" + } + ], + "roleArn": "$fis_role_arn", + "tags": { + "Name": "$fis_template_name" + } +} +EOF + echo "๐Ÿฅ‘ Creating experiment template" + template_id=$(aws fis create-experiment-template --cli-input-json file:///tmp/fis-experiment-template.json | jq -r .experimentTemplate.id) + echo "Template_ID: $template_id" + else + template_id=$(aws fis list-experiment-templates | jq -r --arg template_name $fis_template_name '.experimentTemplates | .[] | select(.tags | has("Name")) | select(.tags.Name | contains($template_name)) | .id') + echo "๐Ÿฅ‘ $fis_template_name already exists; continuing with test run" + fi +} + +function start_FIS_experiment { + create_experiment_template + echo "๐Ÿฅ‘ Starting Experiment" + experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') +} + +function create_tags { + echo "๐Ÿฅ‘ Creating instance tags" + instance_id_string=$(tr '\n' ' ' <<< ${instance_ids}) + eval 'aws ec2 create-tags --resources'" $instance_id_string "'--tags 'Key="aws-node-termination-handler/managed",Value='' + aws ec2 create-tags --resources "${termination_node_id}" --tags Key=Name,Value=interruptMe +} + +function is_new_instance { + is_new_instance="" + if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + is_new_instance=false + else + is_new_instance="" + fi +} + +function get_launch_activity { + launch_activity="" + while [[ -z $launch_activity ]]; do + sleep 5 + activities=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name) + activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode]' <<< $activities) + num_activities=$(jq -r 'length' <<< $activities_details) + for i in $(seq 0 3 $((--num_activities))); do + id=$(jq -r .[$i] <<< $activities_details) + description=$(jq -r .[$((++i))] <<< $activities_details) + status=$(jq -r .[$((i+=2))] <<< $activities_details) + activity_instance=${description##*:} + is_new_instance $activity_instance + if [[ $description =~ .*"Launching".* && -z $is_new_instance ]]; then + launch_activity=$id + echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" + break + fi + done + done +} + +function test_launch_lifecycle { + echo -n "๐Ÿฅ‘ Waiting for launch hook completion." 
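+    # The launch scaling activity stays in progress until either NTH completes the
+    # EC2_INSTANCE_LAUNCHING hook with CONTINUE (activity ends in "Success") or the
+    # 180s heartbeat configured in update_ASG expires and the hook's default result,
+    # ABANDON, is applied (activity ends in "Cancelled"). Comment added for clarity;
+    # not part of the original patch.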
+ while [[ true ]]; do + activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[] | .StatusCode') + if [[ $activity_status == "Success" ]]; then + echo "" + echo "โœ… Launch Lifecycle Successfully Completed โœ…" + # exit 0 + break + fi + + if [[ $activity_status == "Cancelled" ]]; then + echo "" + echo "โŒ Launch Lifecycle Failed โŒ" + # exit 1 + break + fi + echo -n "." + sleep 10 + done +} + +function clean_up { + echo "=====================================================================================================" + echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" + echo "=====================================================================================================" + helm uninstall nth-eks-cluster-test-acth -n kube-system + eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + aws sqs delete-queue --queue-url $queue_url + aws sns delete-topic --topic-arn $sns_arn + deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn + aws iam delete-role --role-name $fis_role_name + aws iam delete-service-linked-role --role-name $auto_scaling_role_name +} + +function main { + provision_sqs_queue + provision_sns_topic + subscribe_sqs_to_sns + provision_node_group + install_helm + create_tags + start_FIS_experiment + get_launch_activity + test_launch_lifecycle +} + +main +trap "clean_up" EXIT diff --git a/test/eks-cluster-test/node_group-spec.yaml b/test/eks-cluster-test/node_group-spec.yaml new file mode 100644 index 00000000..2fa39a78 --- /dev/null +++ b/test/eks-cluster-test/node_group-spec.yaml @@ -0,0 +1,15 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: nth-eks-cluster-test + region: us-west-2 +managedNodeGroups: + - name: nth-eks-cluster-test-spot-ng + instanceType: t3.medium + amiFamily: AmazonLinux2 + desiredCapacity: 2 + minSize: 2 + maxSize: 2 + spot: true +iam: + withOIDC: true \ No newline at end of file From 75ed706f8b99e5d5f290ae7360927e78b1b861d5 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 5 Oct 2023 16:42:15 -0500 Subject: [PATCH 08/27] E2E tests for ASG launch lifecycle hook completion is complete --- test/e2e/asg-launch-lifecycle-sqs-test | 341 ++++++++++++++++--------- 1 file changed, 216 insertions(+), 125 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index db7013f1..95d80a6f 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -7,6 +7,7 @@ CLUSTER_NAME="nth-eks-cluster-test" node_group_name="nth-eks-cluster-test-spot-ng" sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" +node_policy_name="nth-test-node-policy" auto_scaling_role_name="AWSServiceRoleForAutoScaling_nth-test" auto_scaling_policy_arn="arn:aws:iam::aws:policy/aws-service-role/AutoScalingServiceRolePolicy" fis_role_name="nth-test-fis-role" @@ -17,9 +18,10 @@ NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" account_id=$(aws sts get-caller-identity | jq -r '.Account') nth_label="Use-Case=NTH" +##### JSON FILES ##### -## Queue Policy -QUEUE_POLICY=$(cat < /tmp/queue-attributes.json +cat << EOF > /tmp/sqs-subscription-policy.json +{ + "Policy": 
"{\"Version\":\"2012-10-17\",\"Id\":\"MyQueuePolicy\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"Service\":[\"events.amazonaws.com\",\"sqs.amazonaws.com\"]},\"Action\":\"sqs:SendMessage\",\"Resource\":\"arn:aws:sqs:${REGION}:${account_id}:${sqs_queue_name}\"},{\"Sid\":\"topic-subscription-arn:aws:sns:${REGION}:${account_id}:${sns_topic_name}\",\"Effect\":\"Allow\",\"Principal\":{\"AWS\":\"*\"},\"Action\":\"SQS:SendMessage\",\"Resource\":\"arn:aws:sqs:${REGION}:${account_id}:${sqs_queue_name}\",\"Condition\":{\"ArnLike\":{\"aws:SourceArn\":\"arn:aws:sns:${REGION}:${account_id}:${sns_topic_name}\"}}}]}" +} +EOF + +cat << EOF > /tmp/queue-attributes.json { "MessageRetentionPeriod": "300", -"Policy": "$(echo $QUEUE_POLICY | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", +"Policy": "$(echo $sqs_queue_policy | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", "SqsManagedSseEnabled": "true" } EOF -queue_url=$(aws sqs create-queue --queue-name "${sqs_queue_name}" --attributes file:///tmp/queue-attributes.json | jq -r .QueueUrl) +### NODEGROUP ### +cat << EOF > /tmp/nth-nodegroup-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:CompleteLifecycleAction", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeTags", + "ec2:DescribeInstances", + "sqs:DeleteMessage", + "sqs:ReceiveMessage" + ], + "Resource": "*" + } + ] } +EOF +### FIS ### +cat << EOF > /tmp/fis-role-trust-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "fis.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] +} +EOF + +function create_FIS_Template_JSON { +cat << EOF > /tmp/fis-experiment-template.json +{ + "description": "Test Spot Instance interruptions", + "targets": { + "oneSpotInstance": { + "resourceType": "aws:ec2:spot-instance", + "resourceTags": { + "Name": "interruptMe" + }, + "filters": [ + { + "path": "State.Name", + "values": [ + "running" + ] + } + ], + "selectionMode": "COUNT(1)" + } + }, + "actions": { + "interruptSpotInstance": { + "actionId": "aws:ec2:send-spot-instance-interruptions", + "parameters": { + "durationBeforeInterruption": "PT2M" + }, + "targets": { + "SpotInstances": "oneSpotInstance" + } + } + }, + "stopConditions": [ + { + "source": "none" + } + ], + "roleArn": "$fis_role_arn", + "tags": { + "Name": "$fis_template_name" + } +} +EOF +} + + +##### SETUP ##### + +### SQS ### function provision_sqs_queue { queue_exists=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name) if [[ -z $queue_exists ]]; then echo "๐Ÿฅ‘ Provisioning SQS Queue" - create_queue + queue_url=$(aws sqs create-queue --queue-name "${sqs_queue_name}" --attributes file:///tmp/queue-attributes.json | jq -r .QueueUrl) else echo "๐Ÿฅ‘ $sqs_queue_name already exists; continuing with test run" queue_url=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name | jq -r '.QueueUrls | .[0]') fi - - echo "Queue URL: $queue_url" sqs_arn=$(aws sqs get-queue-attributes --queue-url=$queue_url --attribute-names=QueueArn | jq -r .Attributes.QueueArn) + aws sqs set-queue-attributes --queue-url $queue_url --attributes file:///tmp/sqs-subscription-policy.json } +### SNS ### function provision_sns_topic { topic_exists=$(aws sns list-topics | grep "$sns_topic_name" || :) if [[ -z $topic_exists ]]; then @@ -70,56 +162,28 @@ function provision_sns_topic { sns_arn=$(aws sns create-topic --name $sns_topic_name | jq -r .TopicArn) else echo "๐Ÿฅ‘ $sns_topic_name already exists; continuing with test run" - 
sns_arn=$(aws sns list-topics | jq -r '.Topics | .[] | .TopicArn' | grep "nth-sns-test") + sns_arn=$(aws sns list-topics | jq -r '.Topics | .[].TopicArn' | grep "$sns_topic_name") fi - echo "SNS ARN: $sns_arn" } function subscribe_sqs_to_sns { num_subscriptions=$(aws sns list-subscriptions-by-topic --topic-arn $sns_arn | jq '.Subscriptions | length') if [[ $num_subscriptions -eq 0 ]]; then echo "๐Ÿฅ‘ Subscribing $sns_topic_name to $sqs_queue_name" - aws sns subscribe --topic-arn $sns_arn --protocol sqs --notification-endpoint $sqs_arn + subscription_arn=$(aws sns subscribe --topic-arn $sns_arn --protocol sqs --notification-endpoint $sqs_arn | jq -r .SubscriptionArn) else echo "๐Ÿฅ‘ $sns_topic_name already subscribed to $sqs_queue_name; continuing with test run" + subscription_arn=$(aws sns list-subscriptions-by-topic --topic-arn $sns_arn | jq -r '.Subscriptions | .[0].SubscriptionArn') fi } -function install_helm { - anth_helm_args=( - upgrade - --install - --namespace kube-system - "$CLUSTER_NAME-acth" - "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" - --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" - --set image.pullPolicy="Always" - --set nodeSelector."${nth_label}" - --set tolerations[0].operator=Exists - --set awsAccessKeyID=foo - --set awsSecretAccessKey=bar - --set awsRegion="${REGION}" - --set checkTagBeforeDraining=false - --set enableSqsTerminationDraining=true - --set queueURL="${queue_url}" - --wait - ) - - set -x - helm "${anth_helm_args[@]}" - set +x - - sleep 15 -} - +### NODEGROUP ### function provision_node_group { + create_node_policy node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - if [[ -z $node_group_exists ]]; then - echo "๐Ÿฅ‘ Provisioning Spot Node Group" - else - echo "๐Ÿฅ‘ Re-initializing $node_group_name for testing purposes" - eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + if [[ -n $node_group_exists ]]; then + get_node_role_name + delete_node_group echo "" node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) @@ -131,26 +195,51 @@ function provision_node_group { done echo "" sleep 20 - # asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') fi + echo "๐Ÿฅ‘ Provisioning Spot Node Group" eksctl create nodegroup --config-file=$NODE_GROUP_CONFIG_FILE + + echo "๐Ÿฅ‘ Attaching Node policy to Node role" + get_node_role_name + aws iam attach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn + update_ASG + set_node_data + kubectl label nodes $nth_node_ip $nth_label +} + +function create_node_policy { + node_policy_exists=$(aws iam list-policies | grep "$node_policy_name" || :) + if [[ -z $node_policy_exists ]]; then + echo "๐Ÿฅ‘ Creating Node policy" + node_policy_arn=$(aws iam create-policy --policy-name $node_policy_name --policy-document file:///tmp/nth-nodegroup-policy.json | jq -r .Policy.Arn) + else + echo "๐Ÿฅ‘ $node_policy_name already exists; continuing with test run" + node_policy_arn=$(aws iam list-policies | jq -r --arg policy_name $node_policy_name '.Policies | .[] | select(.PolicyName | contains($policy_name)) | .Arn') + fi + + sleep 10 +} + +function get_node_role_name { + node_role_arn=$(aws eks describe-nodegroup --cluster-name $CLUSTER_NAME --nodegroup-name $node_group_name | jq -r .nodegroup.nodeRole) + split_node_role_arn=($(tr '/' ' ' <<< $node_role_arn)) + 
node_role_name=${split_node_role_arn[1]} +} +function set_node_data { instance_ids=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_name | jq -r '.AutoScalingGroups | .[0].Instances | .[].InstanceId') - instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[] | .Instances | .[].InstanceId, .[].PrivateDnsName]') + instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[].Instances | .[].InstanceId, .[].PrivateDnsName]') nth_node_id=$(jq -r '.[0]' <<< $instance_data) nth_node_ip=$(jq -r '.[1]' <<< $instance_data) termination_node_id=$(jq -r '.[2]' <<< $instance_data) termination_node_ip=$(jq -r '.[3]' <<< $instance_data) - - kubectl label nodes $nth_node_ip $nth_label } function update_ASG { asg_name=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$node_group_name --output=json | jq -r '.[0].AutoScalingGroupName') - echo "Auto Scaling Group: $asg_name" echo "๐Ÿฅ‘ Setting Capacity Rebalance" aws autoscaling update-auto-scaling-group --auto-scaling-group-name $asg_name --capacity-rebalance @@ -175,25 +264,40 @@ function create_auto_scaling_role { fi } +### HELM ### +function install_helm { + + anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set image.pullPolicy="Always" + --set nodeSelector."${nth_label}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=$(aws --profile default configure get aws_access_key_id) + --set awsSecretAccessKey=$(aws --profile default configure get aws_secret_access_key) + --set awsRegion="${REGION}" + --set checkTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set queueURL="${queue_url}" + --wait + ) + + set -x + helm "${anth_helm_args[@]}" + set +x + + sleep 15 +} + +### FIS ### function create_FIS_role { fis_role_exists=$(aws iam get-role --role-name $fis_role_name | grep "$fis_role_name" || :) if [[ -z $fis_role_exists ]]; then - cat << EOF > /tmp/fis-role-trust-policy.json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": [ - "fis.amazonaws.com" - ] - }, - "Action": "sts:AssumeRole" - } - ] -} -EOF echo "๐Ÿฅ‘ Creating FIS Role" fis_role_arn=$(aws iam create-role --role-name $fis_role_name --assume-role-policy-document file:///tmp/fis-role-trust-policy.json | jq -r '.Role.Arn') aws iam attach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn @@ -207,64 +311,15 @@ EOF function create_experiment_template { experiment_exists=$(aws fis list-experiment-templates | grep "$fis_template_name" || :) if [[ -z $experiment_exists ]]; then - create_FIS_role - cat << EOF > /tmp/fis-experiment-template.json -{ - "description": "Test Spot Instance interruptions", - "targets": { - "oneSpotInstance": { - "resourceType": "aws:ec2:spot-instance", - "resourceTags": { - "Name": "interruptMe" - }, - "filters": [ - { - "path": "State.Name", - "values": [ - "running" - ] - } - ], - "selectionMode": "COUNT(1)" - } - }, - "actions": { - "interruptSpotInstance": { - "actionId": "aws:ec2:send-spot-instance-interruptions", - "parameters": { - "durationBeforeInterruption": "PT2M" - }, - "targets": { - "SpotInstances": "oneSpotInstance" - } - } - }, - "stopConditions": [ - { - "source": "none" - } - ], - "roleArn": "$fis_role_arn", - "tags": { - 
"Name": "$fis_template_name" - } -} -EOF + create_FIS_Template_JSON echo "๐Ÿฅ‘ Creating experiment template" template_id=$(aws fis create-experiment-template --cli-input-json file:///tmp/fis-experiment-template.json | jq -r .experimentTemplate.id) - echo "Template_ID: $template_id" else template_id=$(aws fis list-experiment-templates | jq -r --arg template_name $fis_template_name '.experimentTemplates | .[] | select(.tags | has("Name")) | select(.tags.Name | contains($template_name)) | .id') echo "๐Ÿฅ‘ $fis_template_name already exists; continuing with test run" fi } -function start_FIS_experiment { - create_experiment_template - echo "๐Ÿฅ‘ Starting Experiment" - experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') -} - function create_tags { echo "๐Ÿฅ‘ Creating instance tags" instance_id_string=$(tr '\n' ' ' <<< ${instance_ids}) @@ -272,6 +327,16 @@ function create_tags { aws ec2 create-tags --resources "${termination_node_id}" --tags Key=Name,Value=interruptMe } +function start_FIS_experiment { + create_tags + create_FIS_role + create_experiment_template + echo "๐Ÿฅ‘ Starting Experiment" + experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') +} + + +##### TESTING ##### function is_new_instance { is_new_instance="" if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then @@ -304,20 +369,21 @@ function get_launch_activity { } function test_launch_lifecycle { + aws sqs receive-message --queue-url $queue_url echo -n "๐Ÿฅ‘ Waiting for launch hook completion." while [[ true ]]; do - activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[] | .StatusCode') - if [[ $activity_status == "Success" ]]; then + activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[].StatusCode') + if [[ $activity_status == "Successful" ]]; then echo "" echo "โœ… Launch Lifecycle Successfully Completed โœ…" - # exit 0 + exit_policy="exit 0" break fi if [[ $activity_status == "Cancelled" ]]; then echo "" echo "โŒ Launch Lifecycle Failed โŒ" - # exit 1 + exit_policy="exit 1" break fi echo -n "." 
@@ -325,31 +391,56 @@ function test_launch_lifecycle { done } + +##### CLEAN UP ##### function clean_up { echo "=====================================================================================================" echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" - helm uninstall nth-eks-cluster-test-acth -n kube-system - eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve + echo "๐Ÿฅ‘ Uninstalling NTH helm chart" + helm uninstall "$CLUSTER_NAME-acth" -n kube-system + delete_node_group + echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" + aws sns unsubscribe --subscription-arn $subscription_arn + echo "๐Ÿฅ‘ Deleting SQS queue" aws sqs delete-queue --queue-url $queue_url + echo "๐Ÿฅ‘ Deleting SNS topic" aws sns delete-topic --topic-arn $sns_arn + echo "๐Ÿฅ‘ Deleting FIS experiment template" deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + echo "๐Ÿฅ‘ Detaching FIS role policy" aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn + echo "๐Ÿฅ‘ Deleting FIS role" aws iam delete-role --role-name $fis_role_name + echo "๐Ÿฅ‘ Deleting autoscaling role" aws iam delete-service-linked-role --role-name $auto_scaling_role_name + echo "๐Ÿฅ‘ Deleting Node role policy" + aws iam delete-policy --policy-arn $node_policy_arn +} + +function delete_node_group { + echo "Node Role Name: $node_role_name" + node_policy_exists=$(aws iam list-attached-role-policies --role-name $node_role_name | grep "$node_policy_name" || :) + echo $node_policy_exists + if [[ -n $node_policy_exists ]]; then + echo "๐Ÿฅ‘ Detaching NTH Node Group policy" + aws iam detach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn + fi + echo "๐Ÿฅ‘ Deleting NTH Node Group" + eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve } function main { provision_sqs_queue - provision_sns_topic + provision_sns_topic subscribe_sqs_to_sns provision_node_group install_helm - create_tags start_FIS_experiment get_launch_activity test_launch_lifecycle + trap "clean_up" EXIT + eval $exit_policy } main -trap "clean_up" EXIT From aed12d161313d5d9e7e0693547759ce0f1d3841f Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 1 Nov 2023 12:46:43 -0500 Subject: [PATCH 09/27] Added bash script to run test files. 
Revised method names and placemetns in ASG launch lifecycle calls --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 21 ++++++++------------- pkg/monitor/sqsevent/sqs-monitor.go | 13 ++++++++++--- test/e2e/asg-launch-lifecycle-sqs-test | 8 +++----- test/eks-cluster-test/run-test | 1 + test/k8s-local-cluster-test/run-test | 14 ++++++++++++++ 5 files changed, 36 insertions(+), 21 deletions(-) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index e03ba9b9..dea1ed85 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -101,11 +101,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m } interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { - errs := m.deleteMessages([]*sqs.Message{message}) - if errs != nil { - return errs[0] - } - return nil + return m.deleteMessage(message) } interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { @@ -119,10 +115,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m return &interruptionEvent, nil } -func (m SQSMonitor) logAndDeleteLifecycle(lifecycleDetail *LifecycleDetail, message *sqs.Message) error { - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) +func (m SQSMonitor) deleteMessage(message *sqs.Message) error { errs := m.deleteMessages([]*sqs.Message{message}) if errs != nil { return errs[0] @@ -142,7 +135,7 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* } // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { +func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { @@ -158,12 +151,14 @@ func (m SQSMonitor) asgCompleteLaunchLifecycle(event *EventBridgeEvent, message } _, err = m.continueLifecycleAction(lifecycleDetail) - if err != nil { - return ignore{skip{fmt.Errorf("completing ASG launch lifecyle: %w", err)}} + return ignore{skip{fmt.Errorf("continuing ASG launch lifecyle: %w", err)}} } - err = m.logAndDeleteLifecycle(lifecycleDetail, message) + log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", + lifecycleDetail.LifecycleHookName, + lifecycleDetail.EC2InstanceID) + err = m.deleteMessage(message) return err } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 7e7691d6..9805c701 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -60,6 +60,7 @@ type InterruptionEventWrapper struct { Err error } +// Used to skip processing an error, but acknowledge an error occured during a termination event type skip struct { err error } @@ -72,6 +73,7 @@ func (s skip) Unwrap() error { return s.err } +// Used to completely ignore an error. 
Used when processing a non-terminating event type ignore struct { err error } @@ -148,6 +150,10 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(*message.Body), &lifecycleEventMessage) + if err != nil { + log.Err(err).Msg("processing JSON message of lifecycle event from ASG") + return eventBridgeEvent, err + } err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) switch { @@ -182,13 +188,14 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} interruptionEvent := &monitor.InterruptionEvent{} - lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) + var err error switch eventBridgeEvent.Source { case "aws.autoscaling": + lifecycleEvent := LifecycleDetail{} + err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - err = m.asgCompleteLaunchLifecycle(eventBridgeEvent, message) + err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) interruptionEvent = nil } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 95d80a6f..2452a299 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -338,12 +338,11 @@ function start_FIS_experiment { ##### TESTING ##### function is_new_instance { - is_new_instance="" + is_new_instance=true if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then is_new_instance=false - else - is_new_instance="" fi + echo $is_new_instance } function get_launch_activity { @@ -358,8 +357,7 @@ function get_launch_activity { description=$(jq -r .[$((++i))] <<< $activities_details) status=$(jq -r .[$((i+=2))] <<< $activities_details) activity_instance=${description##*:} - is_new_instance $activity_instance - if [[ $description =~ .*"Launching".* && -z $is_new_instance ]]; then + if [[ $description =~ .*"Launching".* && is_new_instance $activity_instance ]]; then launch_activity=$id echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" break diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test index 9d6a7f77..beebf0dd 100755 --- a/test/eks-cluster-test/run-test +++ b/test/eks-cluster-test/run-test @@ -194,6 +194,7 @@ function reset_cluster { if [[ -z ${assertion_scripts+x} ]]; then assertion_scripts=( + "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" "$SCRIPTPATH/../e2e/cordon-only-test" "$SCRIPTPATH/../e2e/imds-v2-test" "$SCRIPTPATH/../e2e/maintenance-event-cancellation-test" diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index b21b2c64..4951805c 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -24,6 +24,10 @@ WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local" ASSERTION_SCRIPTS=$(find "$SCRIPTPATH/../e2e" -type f | sort) +SCRIPT_BLACKLIST=( + "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" +) + 
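The skip and ignore wrappers documented in this patch are plain error types; what the monitor ultimately does with a queue message depends on which wrapper errors.As can extract from the returned error. The sketch below shows that dispatch in isolation: handleResult is a hypothetical helper standing in for the bookkeeping done in processInterruptionEvents, and the wrapper definitions simply mirror the ones in sqs-monitor.go.

// Sketch: distinguishing skip and ignore wrappers with errors.As.
package sqssketch

import (
	"errors"
	"fmt"
)

type skip struct{ err error }

func (s skip) Error() string { return s.err.Error() }
func (s skip) Unwrap() error { return s.err }

type ignore struct{ err error }

func (i ignore) Error() string { return i.err.Error() }
func (i ignore) Unwrap() error { return i.err }

// handleResult (hypothetical) decides what to do with a processed message:
// ignore means a non-terminating event that needs no action, skip means the
// message should be dropped, and anything else is a processing failure.
func handleResult(err error) string {
	var ig ignore
	var sk skip
	switch {
	case err == nil:
		return "processed"
	case errors.As(err, &ig): // check ignore first: errors may be wrapped as ignore{skip{...}}
		return "ignored (non-terminating event)"
	case errors.As(err, &sk):
		return "dropped"
	default:
		return fmt.Sprintf("failed: %v", err)
	}
}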
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } function relpath() { @@ -271,8 +275,18 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ ## Mark worker2 only for Critical Add-Ons like dns kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite +function is_blacklisted { + is_blacklisted=false + if [[ $SCRIPT_BLACKLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + is_blacklisted=true + fi + return $is_blacklisted +} + i=0 for assert_script in $ASSERTION_SCRIPTS; do + if[[ is_blacklisted $assert_script ]]; then continue; fi + reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") IMDS_PORT=$((i + 1338)) From 6838bcc25b20933101275a7d514ae9bdb9176488 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 21 Nov 2023 13:38:23 -0600 Subject: [PATCH 10/27] Refactored the unmarshalling of the SQS message --- pkg/monitor/sqsevent/sqs-monitor.go | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 9805c701..37a7e6ef 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -144,17 +144,26 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } -// processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent -func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { - eventBridgeEvent := EventBridgeEvent{} +func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(*message.Body), &lifecycleEventMessage) + err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) if err != nil { - log.Err(err).Msg("processing JSON message of lifecycle event from ASG") - return eventBridgeEvent, err + // log.Err(err).Msg("processing JSON message of lifecycle event from ASG") + return lifecycleEvent, err + } + if lifecycleEventMessage.Message != nil { + err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) + } else { + err = json.Unmarshal([]byte(fmt.Sprintf("%v", *messageBody)), &lifecycleEvent) } - err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) + return lifecycleEvent, err +} + +// processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent +func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + eventBridgeEvent := EventBridgeEvent{} + lifecycleEvent, err := messageToLifecycleEvent(message.Body) switch { case err != nil: From 3ee0880cee76f8b18eafee4f53a8515f2b2826e4 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 21 Nov 2023 15:18:01 -0600 Subject: [PATCH 11/27] Refactor the creation and usage of the K8s client --- cmd/node-termination-handler.go | 16 ++++++++++++++-- pkg/monitor/sqsevent/asg-lifecycle-event.go | 18 ++++-------------- pkg/monitor/sqsevent/sqs-monitor.go | 2 ++ pkg/node/node.go | 17 +++-------------- pkg/node/node_test.go | 10 ++++++++-- pkg/observability/k8s-events.go | 13 +------------ 6 files changed, 32 insertions(+), 44 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 9f7dcaf1..d4a491bc 100644 --- a/cmd/node-termination-handler.go +++ 
b/cmd/node-termination-handler.go @@ -45,6 +45,8 @@ import ( "github.com/rs/zerolog/log" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" ) const ( @@ -97,7 +99,16 @@ func main() { nthConfig.Print() log.Fatal().Err(err).Msg("Webhook validation failed,") } - node, err := node.New(nthConfig) + + clusterConfig, err := rest.InClusterConfig() + if err != nil { + log.Fatal().Err(err).Msgf("retreiving cluster config: %v", err) + } + clientset, err := kubernetes.NewForConfig(clusterConfig) + if err != nil { + log.Fatal().Err(err).Msgf("creating new clientset with config: %v", err) + } + node, err := node.New(nthConfig, clientset) if err != nil { nthConfig.Print() log.Fatal().Err(err).Msg("Unable to instantiate a node for various kubernetes node functions,") @@ -137,7 +148,7 @@ func main() { log.Fatal().Msgf("Unable to find the AWS region to process queue events.") } - recorder, err := observability.InitK8sEventRecorder(nthConfig.EmitKubernetesEvents, nthConfig.NodeName, nthConfig.EnableSQSTerminationDraining, nodeMetadata, nthConfig.KubernetesEventsExtraAnnotations) + recorder, err := observability.InitK8sEventRecorder(nthConfig.EmitKubernetesEvents, nthConfig.NodeName, nthConfig.EnableSQSTerminationDraining, nodeMetadata, nthConfig.KubernetesEventsExtraAnnotations, clientset) if err != nil { nthConfig.Print() log.Fatal().Err(err).Msg("Unable to create Kubernetes event recorder,") @@ -204,6 +215,7 @@ func main() { ASG: autoscaling.New(sess), EC2: ec2.New(sess), BeforeCompleteLifecycleAction: func() { <-time.After(completeLifecycleActionDelay) }, + K8sClientset: clientset, } monitoringFns[sqsEvents] = sqsMonitor } diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index dea1ed85..50493bf7 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -28,7 +28,6 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" ) /* Example SQS ASG Lifecycle Termination Event Message: @@ -146,7 +145,7 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message return ignore{skip{fmt.Errorf("message is an ASG test notification")}} } - if !isNodeReady(lifecycleDetail) { + if !isNodeReady(lifecycleDetail, m.K8sClientset) { return ignore{skip{fmt.Errorf("new ASG instance has not connected to cluster")}} } @@ -163,8 +162,8 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message } // If the Node, new EC2 instance, is ready in the K8s cluster -func isNodeReady(lifecycleDetail *LifecycleDetail) bool { - nodes, err := getNodes() +func isNodeReady(lifecycleDetail *LifecycleDetail, clientset *kubernetes.Clientset) bool { + nodes, err := getNodes(clientset) if err != nil { log.Err(fmt.Errorf("getting nodes from cluster: %w", err)) return false @@ -189,16 +188,7 @@ func isNodeReady(lifecycleDetail *LifecycleDetail) bool { } // Gets Nodes connected to K8s cluster -func getNodes() (*v1.NodeList, error) { - clusterConfig, err := rest.InClusterConfig() - if err != nil { - return nil, fmt.Errorf("retreiving cluster config: %w", err) - } - // creates the clientset - clientset, err := kubernetes.NewForConfig(clusterConfig) - if err != nil { - return nil, fmt.Errorf("creating new clientset with config: %w", err) - } +func getNodes(clientset *kubernetes.Clientset) (*v1.NodeList, error) { nodes, err := 
clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 37a7e6ef..8bdd2a0a 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -28,6 +28,7 @@ import ( "github.com/aws/aws-sdk-go/service/ec2/ec2iface" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/sqs/sqsiface" + "k8s.io/client-go/kubernetes" "github.com/rs/zerolog/log" @@ -52,6 +53,7 @@ type SQSMonitor struct { CheckIfManaged bool ManagedTag string BeforeCompleteLifecycleAction func() + K8sClientset *kubernetes.Clientset } // InterruptionEventWrapper is a convenience wrapper for associating an interruption event with its error, if any diff --git a/pkg/node/node.go b/pkg/node/node.go index ffd04bb5..2b62e768 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -31,7 +31,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" "k8s.io/kubectl/pkg/drain" ) @@ -84,8 +83,8 @@ type Node struct { } // New will construct a node struct to perform various node function through the kubernetes api server -func New(nthConfig config.Config) (*Node, error) { - drainHelper, err := getDrainHelper(nthConfig) +func New(nthConfig config.Config, clientset *kubernetes.Clientset) (*Node, error) { + drainHelper, err := getDrainHelper(nthConfig, clientset) if err != nil { return nil, err } @@ -634,7 +633,7 @@ func (n Node) fetchAllPods(nodeName string) (*corev1.PodList, error) { }) } -func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { +func getDrainHelper(nthConfig config.Config, clientset *kubernetes.Clientset) (*drain.Helper, error) { drainHelper := &drain.Helper{ Ctx: context.TODO(), Client: &kubernetes.Clientset{}, @@ -652,17 +651,7 @@ func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { return drainHelper, nil } - clusterConfig, err := rest.InClusterConfig() - if err != nil { - return nil, err - } - // creates the clientset - clientset, err := kubernetes.NewForConfig(clusterConfig) - if err != nil { - return nil, err - } drainHelper.Client = clientset - return drainHelper, nil } diff --git a/pkg/node/node_test.go b/pkg/node/node_test.go index 93496872..e9837e6c 100644 --- a/pkg/node/node_test.go +++ b/pkg/node/node_test.go @@ -63,8 +63,13 @@ func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node { return tNode } +func getNewNode(nthConfig config.Config, client *fake.Clientset) (*node.Node, error) { + drainHelper := getDrainHelper(client) + return node.NewWithValues(nthConfig, drainHelper, uptime.Uptime) +} + func TestDryRun(t *testing.T) { - tNode, err := node.New(config.Config{DryRun: true}) + tNode, err := getNewNode(config.Config{DryRun: true}, fake.NewSimpleClientset()) h.Ok(t, err) fakeRecorder := record.NewFakeRecorder(recorderBufferSize) @@ -103,7 +108,8 @@ func TestDryRun(t *testing.T) { } func TestNewFailure(t *testing.T) { - _, err := node.New(config.Config{}) + client := fake.NewSimpleClientset() + _, err := getNewNode(config.Config{}, client) h.Assert(t, true, "Failed to return error when creating new Node.", err != nil) } diff --git a/pkg/observability/k8s-events.go b/pkg/observability/k8s-events.go index a3da3778..6b7caf25 100644 --- a/pkg/observability/k8s-events.go +++ b/pkg/observability/k8s-events.go @@ -27,7 +27,6 @@ import ( "k8s.io/client-go/kubernetes" 
"k8s.io/client-go/kubernetes/scheme" typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" ) @@ -80,7 +79,7 @@ type K8sEventRecorder struct { } // InitK8sEventRecorder creates a Kubernetes event recorder -func InitK8sEventRecorder(enabled bool, nodeName string, sqsMode bool, nodeMetadata ec2metadata.NodeMetadata, extraAnnotationsStr string) (K8sEventRecorder, error) { +func InitK8sEventRecorder(enabled bool, nodeName string, sqsMode bool, nodeMetadata ec2metadata.NodeMetadata, extraAnnotationsStr string, clientSet *kubernetes.Clientset) (K8sEventRecorder, error) { if !enabled { return K8sEventRecorder{}, nil } @@ -107,16 +106,6 @@ func InitK8sEventRecorder(enabled bool, nodeName string, sqsMode bool, nodeMetad } } - config, err := rest.InClusterConfig() - if err != nil { - return K8sEventRecorder{}, err - } - - clientSet, err := kubernetes.NewForConfig(config) - if err != nil { - return K8sEventRecorder{}, err - } - broadcaster := record.NewBroadcaster() broadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: clientSet.CoreV1().Events("")}) From 470bc5fdef59a9846799f2278b35647a8f257d72 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 1 Dec 2023 10:04:57 -0600 Subject: [PATCH 12/27] Updated run-test with inclusive terminology --- test/k8s-local-cluster-test/run-test | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 4951805c..992e1fcc 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -23,8 +23,7 @@ AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local"} ASSERTION_SCRIPTS=$(find "$SCRIPTPATH/../e2e" -type f | sort) - -SCRIPT_BLACKLIST=( +SCRIPT_DENYLIST=( "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" ) @@ -275,17 +274,17 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ ## Mark worker2 only for Critical Add-Ons like dns kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite -function is_blacklisted { - is_blacklisted=false - if [[ $SCRIPT_BLACKLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_blacklisted=true +function is_denylisted { + is_denylisted=false + if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + is_denylisted=true fi - return $is_blacklisted + return $is_denylisted } i=0 for assert_script in $ASSERTION_SCRIPTS; do - if[[ is_blacklisted $assert_script ]]; then continue; fi + if[[ $(is_denylisted $assert_script) ]]; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From 96127141fccb91602b134ab3adcce6a61dcd40ae Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 1 Dec 2023 10:09:34 -0600 Subject: [PATCH 13/27] Fix boolean logic in bash scripts --- test/e2e/asg-launch-lifecycle-sqs-test | 10 ++++++---- test/k8s-local-cluster-test/run-test | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 2452a299..f92e0a3a 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -338,11 +338,10 @@ function start_FIS_experiment { ##### TESTING ##### function is_new_instance { - is_new_instance=true + is_new="true" if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - 
is_new_instance=false + is_new="false" fi - echo $is_new_instance } function get_launch_activity { @@ -357,7 +356,8 @@ function get_launch_activity { description=$(jq -r .[$((++i))] <<< $activities_details) status=$(jq -r .[$((i+=2))] <<< $activities_details) activity_instance=${description##*:} - if [[ $description =~ .*"Launching".* && is_new_instance $activity_instance ]]; then + is_new_instance $activity_instance + if [[ $description =~ .*"Launching".* && $is_new == "true" ]]; then launch_activity=$id echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" break @@ -395,6 +395,8 @@ function clean_up { echo "=====================================================================================================" echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" + pod_id=$(get_nth_worker_pod || :) + kubectl logs $pod_id --namespace kube-system || : echo "๐Ÿฅ‘ Uninstalling NTH helm chart" helm uninstall "$CLUSTER_NAME-acth" -n kube-system delete_node_group diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 992e1fcc..e9e5388b 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -275,16 +275,16 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite function is_denylisted { - is_denylisted=false + is_denied="false" if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_denylisted=true + is_denied="true" fi - return $is_denylisted } i=0 for assert_script in $ASSERTION_SCRIPTS; do - if[[ $(is_denylisted $assert_script) ]]; then continue; fi + is_denylisted $assert_script + if[[ $is_denied == "true" ]]; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From 7fbfe35545aaca5e3050791af52ff6dd15257788 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 1 Dec 2023 10:40:03 -0600 Subject: [PATCH 14/27] Refactored the processing of ASG Launch Lifecycle events as interruption events. 
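Before the hunks, the readiness check this refactor hinges on is easier to read in one piece. The sketch below is a simplified, self-contained version of the lookup added to cmd/node-termination-handler.go: the clientset is assumed to exist, the helper name isInstanceReady is illustrative, and the patch itself splits the same logic across isNodeReady, getNodesWithInstanceFromLabel, and getNodesWithInstanceFromProviderID.

// Sketch: find the node backing an EC2 instance (by eksctl label, then by
// ProviderID, format aws:///<az>/<instance-id>) and report whether it is Ready.
package nodereadiness

import (
	"context"
	"fmt"
	"strings"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func isInstanceReady(ctx context.Context, clientset kubernetes.Interface, instanceID string) (bool, error) {
	// Cheapest lookup first: the eksctl-managed instance-id label.
	byLabel, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{
		LabelSelector: fmt.Sprintf("alpha.eksctl.io/instance-id=%s", instanceID),
	})
	if err != nil {
		return false, err
	}
	candidates := byLabel.Items
	if len(candidates) == 0 {
		// Fall back to scanning every node's ProviderID for the instance ID.
		all, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return false, err
		}
		for _, n := range all.Items {
			if strings.Contains(n.Spec.ProviderID, instanceID) {
				candidates = append(candidates, n)
			}
		}
	}
	for _, n := range candidates {
		for _, cond := range n.Status.Conditions {
			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
				return true, nil
			}
		}
	}
	return false, nil
}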
--- cmd/node-termination-handler.go | 139 ++++++++++++++++++-- pkg/monitor/sqsevent/asg-lifecycle-event.go | 82 ++++-------- pkg/monitor/sqsevent/sqs-monitor.go | 5 +- pkg/monitor/types.go | 2 + 4 files changed, 154 insertions(+), 74 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index d4a491bc..1fdfd02f 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -43,7 +43,11 @@ import ( "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog" "github.com/rs/zerolog/log" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -215,7 +219,6 @@ func main() { ASG: autoscaling.New(sess), EC2: ec2.New(sess), BeforeCompleteLifecycleAction: func() { <-time.After(completeLifecycleActionDelay) }, - K8sClientset: clientset, } monitoringFns[sqsEvents] = sqsMonitor } @@ -269,7 +272,7 @@ func main() { event.InProgress = true wg.Add(1) recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description) - go drainOrCordonIfNecessary(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, &wg) + go processInterruptionEventFunctions(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) default: log.Warn().Msg("all workers busy, waiting") break EventLoop @@ -341,21 +344,39 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } -func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, wg *sync.WaitGroup) { +func processInterruptionEventFunctions(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { defer wg.Done() - nodeFound := true - nodeName := drainEvent.NodeName + processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) + drainOrCordonIfNecessary(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder) + <-interruptionEventStore.Workers +} - if nthConfig.UseProviderId { - newNodeName, err := node.GetNodeNameFromProviderID(drainEvent.ProviderID) +func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) { + if drainEvent.Kind != monitor.ASGLaunchLifecycleKind { + return + } - if err != nil { - log.Err(err).Msgf("Unable to get node name for node with ProviderID '%s' using original AWS event node name ", drainEvent.ProviderID) - } else { - nodeName = newNodeName - } + if !isNodeReady(drainEvent.InstanceID, clientset) { + log.Error().Msgf("new ASG instance, %s, has not connected to cluster", drainEvent.InstanceID) + 
interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + return } + nodeName := getNodeName(drainEvent, node, nthConfig) + + if drainEvent.PostDrainTask != nil { + runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) + } +} + +func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { + if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { + return + } + + nodeFound := true + nodeName := getNodeName(drainEvent, node, nthConfig) + nodeLabels, err := node.GetNodeLabels(nodeName) if err != nil { log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) @@ -395,7 +416,99 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto if (err == nil || (!nodeFound && nthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) } - <-interruptionEventStore.Workers +} + +func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config) string { + nodeName := drainEvent.NodeName + if nthConfig.UseProviderId { + newNodeName, err := node.GetNodeNameFromProviderID(drainEvent.ProviderID) + + if err != nil { + log.Err(err).Msgf("Unable to get node name for node with ProviderID '%s' using original AWS event node name ", drainEvent.ProviderID) + } else { + nodeName = newNodeName + } + } + return nodeName +} + +func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { + nodes, err := getNodesWithInstanceID(instanceID, clientset) + if err != nil { + log.Err(fmt.Errorf("getting nodes with instance ID: %w", err)) + return false + } + + if len(nodes) == 0 { + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", instanceID)) + return false + } + + for _, node := range nodes { + conditions := node.Status.Conditions + for _, condition := range conditions { + if condition.Type != "Ready" { + continue + } + if condition.Status != "True" { + log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) + return false + } + } + } + log.Info().Msgf("new ASG instance, %s, is found and ready in cluster", instanceID) + return true +} + +// Gets Nodes connected to K8s cluster +func getNodesWithInstanceID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { + nodes, err := getNodesWithInstanceFromLabel(instanceID, clientset) + if err != nil { + return nil, err + } + if len(nodes) != 0 { + return nodes, nil + } + + nodes, err = getNodesWithInstanceFromProviderID(instanceID, clientset) + if err != nil { + return nil, err + } + return nodes, nil +} + +func getNodesWithInstanceFromLabel(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { + instanceIDReq, err := labels.NewRequirement("alpha.eksctl.io/instance-id", selection.Equals, []string{instanceID}) + if err != nil { + return nil, fmt.Errorf("bad label requirement: %w", err) + } + selector := labels.NewSelector().Add(*instanceIDReq) + options := metav1.ListOptions{LabelSelector: selector.String()} + return getNodes(options, clientset) +} + +func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { + nodes, err := getNodes(metav1.ListOptions{}, clientset) + if err != nil { + return nil, err + } + + var filteredNodes []v1.Node + for 
_, node := range nodes { + if !strings.Contains(node.Spec.ProviderID, instanceID) { + continue + } + filteredNodes = append(filteredNodes, node) + } + return filteredNodes, nil +} + +func getNodes(options metav1.ListOptions, clientset *kubernetes.Clientset) ([]v1.Node, error) { + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), options) + if err != nil { + return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) + } + return nodes.Items, err } func runPreDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 50493bf7..d232cf07 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -14,10 +14,8 @@ package sqsevent import ( - "context" "encoding/json" "fmt" - "strings" "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-node-termination-handler/pkg/node" @@ -25,9 +23,6 @@ import ( "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog/log" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" ) /* Example SQS ASG Lifecycle Termination Event Message: @@ -134,72 +129,45 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* } // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) error { +func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) + return nil, fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { - return ignore{skip{fmt.Errorf("message is an ASG test notification")}} + return nil, ignore{skip{fmt.Errorf("message is an ASG test notification")}} } - if !isNodeReady(lifecycleDetail, m.K8sClientset) { - return ignore{skip{fmt.Errorf("new ASG instance has not connected to cluster")}} - } - - _, err = m.continueLifecycleAction(lifecycleDetail) + nodeInfo, err := m.getNodeInfo(lifecycleDetail.EC2InstanceID) if err != nil { - return ignore{skip{fmt.Errorf("continuing ASG launch lifecyle: %w", err)}} + return nil, err } - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) - err = m.deleteMessage(message) - return err -} - -// If the Node, new EC2 instance, is ready in the K8s cluster -func isNodeReady(lifecycleDetail *LifecycleDetail, clientset *kubernetes.Clientset) bool { - nodes, err := getNodes(clientset) - if err != nil { - log.Err(fmt.Errorf("getting nodes from cluster: %w", err)) - return false + interruptionEvent := monitor.InterruptionEvent{ + EventID: fmt.Sprintf("asg-lifecycle-term-%x", event.ID), + Kind: monitor.ASGLaunchLifecycleKind, + Monitor: SQSMonitorKind, + AutoScalingGroupName: lifecycleDetail.AutoScalingGroupName, + StartTime: event.getTime(), + NodeName: nodeInfo.Name, + IsManaged: nodeInfo.IsManaged, + InstanceID: 
lifecycleDetail.EC2InstanceID, + ProviderID: nodeInfo.ProviderID, + Description: fmt.Sprintf("ASG Lifecycle Launch event received. Instance will be interrupted at %s \n", event.getTime()), } - for _, node := range nodes.Items { - instanceID := getInstanceID(node) - if instanceID != lifecycleDetail.EC2InstanceID { - continue - } - - conditions := node.Status.Conditions - for _, condition := range conditions { - if condition.Type == "Ready" && condition.Status == "True" { - return true - } + interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { + _, err = m.continueLifecycleAction(lifecycleDetail) + if err != nil { + return fmt.Errorf("continuing ASG launch lifecyle: %w", err) } - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) - } - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", lifecycleDetail.EC2InstanceID)) - return false -} - -// Gets Nodes connected to K8s cluster -func getNodes(clientset *kubernetes.Clientset) (*v1.NodeList, error) { - nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) + log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", + lifecycleDetail.LifecycleHookName, + lifecycleDetail.EC2InstanceID) + return m.deleteMessage(message) } - return nodes, err -} -// Gets EC2 InstanceID from ProviderID, format: aws:///$az/$instanceid -func getInstanceID(node v1.Node) string { - providerID := node.Spec.ProviderID - providerIDSplit := strings.Split(providerID, "/") - instanceID := providerIDSplit[len(providerIDSplit)-1] - return instanceID + return &interruptionEvent, err } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 8bdd2a0a..e51ede20 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -28,7 +28,6 @@ import ( "github.com/aws/aws-sdk-go/service/ec2/ec2iface" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/sqs/sqsiface" - "k8s.io/client-go/kubernetes" "github.com/rs/zerolog/log" @@ -53,7 +52,6 @@ type SQSMonitor struct { CheckIfManaged bool ManagedTag string BeforeCompleteLifecycleAction func() - K8sClientset *kubernetes.Clientset } // InterruptionEventWrapper is a convenience wrapper for associating an interruption event with its error, if any @@ -206,8 +204,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) - interruptionEvent = nil + interruptionEvent, err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } diff --git a/pkg/monitor/types.go b/pkg/monitor/types.go index c3c587d2..93d56625 100644 --- a/pkg/monitor/types.go +++ b/pkg/monitor/types.go @@ -31,6 +31,8 @@ const ( StateChangeKind = "STATE_CHANGE" // ASGLifecycleKind is a const to define an ASG Lifecycle kind of interruption event ASGLifecycleKind = "ASG_LIFECYCLE" + // ASGLifecycleKind is a const to define an ASG Launch Lifecycle kind of interruption event + ASGLaunchLifecycleKind = 
"ASG_LAUNCH_LIFECYCLE" // SQSTerminateKind is a const to define an SQS termination kind of interruption event SQSTerminateKind = "SQS_TERMINATE" ) From fee8077469e275339fea3c0f8e1667b92b972a60 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 5 Dec 2023 15:31:17 -0600 Subject: [PATCH 15/27] Revise log messages and formatting --- cmd/node-termination-handler.go | 15 ++++++++++----- pkg/monitor/sqsevent/asg-lifecycle-event.go | 13 ++++++++----- pkg/monitor/sqsevent/sqs-monitor.go | 17 +++++++++-------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 1fdfd02f..9057d6db 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -106,7 +106,7 @@ func main() { clusterConfig, err := rest.InClusterConfig() if err != nil { - log.Fatal().Err(err).Msgf("retreiving cluster config: %v", err) + log.Fatal().Err(err).Msgf("retreiving cluster config") } clientset, err := kubernetes.NewForConfig(clusterConfig) if err != nil { @@ -344,6 +344,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } +// TODO rename to processInterruptionEvent RENAME func processInterruptionEventFunctions(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { defer wg.Done() processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) @@ -351,17 +352,19 @@ func processInterruptionEventFunctions(interruptionEventStore *interruptionevent <-interruptionEventStore.Workers } +// TODO move function and helpers to new package pkg/interruptioneventhandler/asg/launch func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) { if drainEvent.Kind != monitor.ASGLaunchLifecycleKind { return } if !isNodeReady(drainEvent.InstanceID, clientset) { - log.Error().Msgf("new ASG instance, %s, has not connected to cluster", drainEvent.InstanceID) + log.Error().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) return } + log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") nodeName := getNodeName(drainEvent, node, nthConfig) if drainEvent.PostDrainTask != nil { @@ -370,6 +373,7 @@ func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventsto } func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { + //TODO Use allow list instead of denylist LOGIC if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { return } @@ -418,6 +422,7 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto } } +// TODO Restructure indentation LOGIC func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config) 
string { nodeName := drainEvent.NodeName if nthConfig.UseProviderId { @@ -432,6 +437,7 @@ func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfi return nodeName } +// TODO make method return error func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { nodes, err := getNodesWithInstanceID(instanceID, clientset) if err != nil { @@ -440,23 +446,21 @@ func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { } if len(nodes) == 0 { - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, not found in cluster", instanceID)) return false } for _, node := range nodes { conditions := node.Status.Conditions for _, condition := range conditions { + //TODO combine if statements LOGIC if condition.Type != "Ready" { continue } if condition.Status != "True" { - log.Error().Msg(fmt.Sprintf("ec2 instance, %s, found, but not ready in cluster", instanceID)) return false } } } - log.Info().Msgf("new ASG instance, %s, is found and ready in cluster", instanceID) return true } @@ -503,6 +507,7 @@ func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes return filteredNodes, nil } +// TODO Remove method func getNodes(options metav1.ListOptions, clientset *kubernetes.Clientset) ([]v1.Node, error) { nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), options) if err != nil { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index d232cf07..2e53bd39 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -128,12 +128,17 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* }) } +// TODO Rename to createAsgInstanceLaunchEvent RENAME // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { + if message == nil || event == nil { + return nil, fmt.Errorf("event message is nil for ASG Instance Launch Event creation") + } + lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return nil, fmt.Errorf("unmarshaling ASG lifecycle event: %w", err) + return nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { @@ -161,11 +166,9 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error { _, err = m.continueLifecycleAction(lifecycleDetail) if err != nil { - return fmt.Errorf("continuing ASG launch lifecyle: %w", err) + return fmt.Errorf("continuing ASG launch lifecycle: %w", err) } - log.Info().Msgf("Completed ASG Lifecycle Hook (%s) for instance %s", - lifecycleDetail.LifecycleHookName, - lifecycleDetail.EC2InstanceID) + log.Info().Str("lifecycleHookName", lifecycleDetail.LifecycleHookName).Str("instanceID", lifecycleDetail.EC2InstanceID).Msg("Completed ASG Lifecycle Hook") return m.deleteMessage(message) } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index e51ede20..fef06113 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -73,6 +73,7 @@ func (s skip) Unwrap() error { return s.err } +// TODO REMOVE // Used to 
completely ignore an error. Used when processing a non-terminating event type ignore struct { err error @@ -144,14 +145,15 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } +// TODO Rename to parseLifecycleEvent, rename messageBody to message RENAME func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) if err != nil { - // log.Err(err).Msg("processing JSON message of lifecycle event from ASG") return lifecycleEvent, err } + //TODO add comment about why Sprintf is needed COMMENT if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { @@ -162,13 +164,14 @@ func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} + //TODO nil-check the pointer and pass in a string instead of a pointer LOGIC lifecycleEvent, err := messageToLifecycleEvent(message.Body) switch { case err != nil: - log.Err(err).Msg("only lifecycle events from ASG to SQS are supported outside EventBridge") - return eventBridgeEvent, err + return eventBridgeEvent, fmt.Errorf("parsing lifecycle event messsage from ASG: %w", err) case lifecycleEvent.Event == TEST_NOTIFICATION || lifecycleEvent.LifecycleTransition == TEST_NOTIFICATION: err := fmt.Errorf("message is a test notification") @@ -179,17 +182,13 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": - log.Err(err).Msg("only lifecycle termination events from ASG to SQS are supported outside EventBridge") - err = fmt.Errorf("unsupported message type (%s)", message.String()) - return eventBridgeEvent, err + return eventBridgeEvent, fmt.Errorf("unsupported message type (%s) while parsing lifecycle event messsage from ASG", message.String()) } eventBridgeEvent.Source = "aws.autoscaling" eventBridgeEvent.Time = lifecycleEvent.Time eventBridgeEvent.ID = lifecycleEvent.RequestID eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) - - log.Debug().Msg("processing lifecycle event from ASG") return eventBridgeEvent, err } @@ -200,9 +199,11 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, var err error switch eventBridgeEvent.Source { + //TODO add comment for other cases case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) + //TODO handle err != nil if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { interruptionEvent, err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { From 45869afd0d5ec500914ab73b8a1643a923fa5399 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 5 Dec 2023 16:01:33 -0600 Subject: [PATCH 16/27] Removed ignore errors and getNodes method. 
Changed method names --- cmd/node-termination-handler.go | 34 +++++++++------------ pkg/monitor/sqsevent/asg-lifecycle-event.go | 5 ++- pkg/monitor/sqsevent/sqs-monitor.go | 33 +++++--------------- 3 files changed, 23 insertions(+), 49 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 9057d6db..09c7e7d1 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -272,7 +272,7 @@ func main() { event.InProgress = true wg.Add(1) recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description) - go processInterruptionEventFunctions(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) + go processInterruptionEvent(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) default: log.Warn().Msg("all workers busy, waiting") break EventLoop @@ -344,8 +344,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } -// TODO rename to processInterruptionEvent RENAME -func processInterruptionEventFunctions(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { +func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { defer wg.Done() processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) drainOrCordonIfNecessary(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder) @@ -373,7 +372,7 @@ func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventsto } func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - //TODO Use allow list instead of denylist LOGIC + // TODO Use allow list instead of denylist LOGIC if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { return } @@ -452,7 +451,7 @@ func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { for _, node := range nodes { conditions := node.Status.Conditions for _, condition := range conditions { - //TODO combine if statements LOGIC + // TODO combine if statements LOGIC if condition.Type != "Ready" { continue } @@ -482,23 +481,27 @@ func getNodesWithInstanceID(instanceID string, clientset *kubernetes.Clientset) } func getNodesWithInstanceFromLabel(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - instanceIDReq, err := labels.NewRequirement("alpha.eksctl.io/instance-id", selection.Equals, []string{instanceID}) + instanceIDLabel := "alpha.eksctl.io/instance-id" + instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) if err != nil { return nil, fmt.Errorf("bad label requirement: %w", err) } selector := labels.NewSelector().Add(*instanceIDReq) - options := 
metav1.ListOptions{LabelSelector: selector.String()} - return getNodes(options, clientset) + nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) + } + return nodeList.Items, nil } func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - nodes, err := getNodes(metav1.ListOptions{}, clientset) + nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { - return nil, err + return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) } var filteredNodes []v1.Node - for _, node := range nodes { + for _, node := range nodeList.Items { if !strings.Contains(node.Spec.ProviderID, instanceID) { continue } @@ -507,15 +510,6 @@ func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes return filteredNodes, nil } -// TODO Remove method -func getNodes(options metav1.ListOptions, clientset *kubernetes.Clientset) ([]v1.Node, error) { - nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), options) - if err != nil { - return nil, fmt.Errorf("retreiving nodes from cluster: %w", err) - } - return nodes.Items, err -} - func runPreDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { err := drainEvent.PreDrainTask(*drainEvent, node) if err != nil { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 2e53bd39..0db51eab 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -128,9 +128,8 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* }) } -// TODO Rename to createAsgInstanceLaunchEvent RENAME // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster -func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { +func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { if message == nil || event == nil { return nil, fmt.Errorf("event message is nil for ASG Instance Launch Event creation") } @@ -142,7 +141,7 @@ func (m SQSMonitor) continueAsgLaunchLifecycle(event *EventBridgeEvent, message } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { - return nil, ignore{skip{fmt.Errorf("message is an ASG test notification")}} + return nil, skip{fmt.Errorf("message is an ASG test notification")} } nodeInfo, err := m.getNodeInfo(lifecycleDetail.EC2InstanceID) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index fef06113..d62fde76 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -73,20 +73,6 @@ func (s skip) Unwrap() error { return s.err } -// TODO REMOVE -// Used to completely ignore an error. 
Used when processing a non-terminating event -type ignore struct { - err error -} - -func (i ignore) Error() string { - return i.err.Error() -} - -func (i ignore) Unwrap() error { - return i.err -} - // Kind denotes the kind of monitor func (m SQSMonitor) Kind() string { return SQSMonitorKind @@ -145,15 +131,14 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } -// TODO Rename to parseLifecycleEvent, rename messageBody to message RENAME -func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { +func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) if err != nil { return lifecycleEvent, err } - //TODO add comment about why Sprintf is needed COMMENT + // TODO add comment about why Sprintf is needed COMMENT if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { @@ -166,8 +151,8 @@ func messageToLifecycleEvent(messageBody *string) (LifecycleDetail, error) { func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} - //TODO nil-check the pointer and pass in a string instead of a pointer LOGIC - lifecycleEvent, err := messageToLifecycleEvent(message.Body) + // TODO nil-check the pointer and pass in a string instead of a pointer LOGIC + lifecycleEvent, err := parseLifecycleEvent(message.Body) switch { case err != nil: @@ -199,13 +184,13 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, var err error switch eventBridgeEvent.Source { - //TODO add comment for other cases + // TODO add comment for other cases case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) - //TODO handle err != nil + // TODO handle err != nil if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { - interruptionEvent, err = m.continueAsgLaunchLifecycle(eventBridgeEvent, message) + interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) } @@ -236,13 +221,9 @@ func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []Interr dropMessageSuggestionCount := 0 failedInterruptionEventsCount := 0 var skipErr skip - var ignoreErr ignore for _, eventWrapper := range interruptionEventWrappers { switch { - case errors.As(eventWrapper.Err, &ignoreErr): - log.Warn().Err(ignoreErr).Msg("ASG launch cycle not continued") - case errors.As(eventWrapper.Err, &skipErr): log.Warn().Err(skipErr).Msg("dropping event") dropMessageSuggestionCount++ From 10f0f34d075268a811e177b483a2208153a3db77 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 5 Dec 2023 16:43:33 -0600 Subject: [PATCH 17/27] Refactor error handling, and add helpful comments --- cmd/node-termination-handler.go | 17 ++++++++--------- pkg/monitor/sqsevent/asg-lifecycle-event.go | 2 +- pkg/monitor/sqsevent/sqs-monitor.go | 17 +++++++++++++---- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go 
index 09c7e7d1..9619aad3 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -357,8 +357,9 @@ func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventsto return } - if !isNodeReady(drainEvent.InstanceID, clientset) { - log.Error().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") + isNodeReady, err := isNodeReady(drainEvent.InstanceID, clientset) + if err != nil || !isNodeReady { + log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) return } @@ -436,16 +437,14 @@ func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfi return nodeName } -// TODO make method return error -func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { +func isNodeReady(instanceID string, clientset *kubernetes.Clientset) (bool, error) { nodes, err := getNodesWithInstanceID(instanceID, clientset) if err != nil { - log.Err(fmt.Errorf("getting nodes with instance ID: %w", err)) - return false + return false, fmt.Errorf("getting nodes with instance ID: %w", err) } if len(nodes) == 0 { - return false + return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) } for _, node := range nodes { @@ -456,11 +455,11 @@ func isNodeReady(instanceID string, clientset *kubernetes.Clientset) bool { continue } if condition.Status != "True" { - return false + return false, fmt.Errorf("ec2 instance, %s, found, but not ready in cluster", instanceID) } } } - return true + return true, nil } // Gets Nodes connected to K8s cluster diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 0db51eab..3a9dd390 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -137,7 +137,7 @@ func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, messag lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { - return nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) + return nil, fmt.Errorf("unmarshaling message, %s, from ASG launch lifecycle event: %w", *message.MessageId, err) } if lifecycleDetail.Event == TEST_NOTIFICATION || lifecycleDetail.LifecycleTransition == TEST_NOTIFICATION { diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index d62fde76..d155ecf3 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -138,7 +138,7 @@ func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { if err != nil { return lifecycleEvent, err } - // TODO add comment about why Sprintf is needed COMMENT + // Converts escaped JSON object to string, to lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { @@ -151,7 +151,10 @@ func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} - // TODO nil-check the pointer and pass in a string instead of a pointer LOGIC + + if message == nil { + return eventBridgeEvent, fmt.Errorf("ASG event message is nil") + } 
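	// Illustrative example (field values are placeholders): the queue body handled by
	// parseLifecycleEvent below can arrive in two shapes -- the raw lifecycle JSON, or an
	// envelope whose "Message" field carries the same JSON as an escaped string, e.g. when
	// the notification is relayed through an SNS topic:
	//   {"Message": "{\"LifecycleTransition\":\"autoscaling:EC2_INSTANCE_LAUNCHING\",\"EC2InstanceId\":\"i-0123456789abcdef0\", ...}"}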
lifecycleEvent, err := parseLifecycleEvent(message.Body) switch { @@ -183,12 +186,18 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, interruptionEvent := &monitor.InterruptionEvent{} var err error + if message == nil || eventBridgeEvent == nil { + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("event message is nil")}) + } + switch eventBridgeEvent.Source { - // TODO add comment for other cases + // LifecycleTransitions other than LAUNCHING or TERMINATING will result in the interruptionEvent being uninitialized case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) - // TODO handle err != nil + if err != nil { + interruptionEvent, err = nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) + } if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { From 1ab824bf73f6a2683b0b9d40376a1967781e09e9 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 7 Dec 2023 10:46:41 -0600 Subject: [PATCH 18/27] Refactored interruption event handling into a seperate package with distinct handlers for different interruption event Kinds --- cmd/node-termination-handler.go | 240 +----------------- pkg/interruptionevent/asg/launch/handler.go | 144 +++++++++++ pkg/interruptionevent/draincordon/handler.go | 139 ++++++++++ .../internal/common/handler.go | 76 ++++++ pkg/monitor/sqsevent/sqs-monitor.go | 8 +- 5 files changed, 377 insertions(+), 230 deletions(-) create mode 100644 pkg/interruptionevent/asg/launch/handler.go create mode 100644 pkg/interruptionevent/draincordon/handler.go create mode 100644 pkg/interruptionevent/internal/common/handler.go diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 9619aad3..6fd0c745 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -25,6 +25,8 @@ import ( "github.com/aws/aws-node-termination-handler/pkg/config" "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/asg/launch" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/draincordon" "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" "github.com/aws/aws-node-termination-handler/pkg/logging" "github.com/aws/aws-node-termination-handler/pkg/monitor" @@ -43,11 +45,6 @@ import ( "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog" "github.com/rs/zerolog/log" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -62,6 +59,10 @@ const ( duplicateErrThreshold = 3 ) +type interruptionEventHandler interface { + HandleEvent(*monitor.InterruptionEvent) +} + func main() { // Zerolog uses json formatting by default, so change that to a human-readable format instead log.Logger = log.Output(logging.RoutingLevelWriter{ @@ -258,6 +259,9 @@ func main() { var wg sync.WaitGroup + asgLaunchHandler := launch.New(interruptionEventStore, *node, nthConfig, metrics, recorder, clientset) + drainCordonHander := draincordon.New(interruptionEventStore, 
*node, nthConfig, nodeMetadata, metrics, recorder) + for range time.NewTicker(1 * time.Second).C { select { case <-signalChan: @@ -272,7 +276,7 @@ func main() { event.InProgress = true wg.Add(1) recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description) - go processInterruptionEvent(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, clientset, &wg) + go processInterruptionEvent(interruptionEventStore, event, []interruptionEventHandler{asgLaunchHandler, drainCordonHander}, &wg) default: log.Warn().Msg("all workers busy, waiting") break EventLoop @@ -344,228 +348,12 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int } } -func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset, wg *sync.WaitGroup) { +func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, event *monitor.InterruptionEvent, eventHandlers []interruptionEventHandler, wg *sync.WaitGroup) { defer wg.Done() - processASGLaunchLifecycleEvent(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder, clientset) - drainOrCordonIfNecessary(interruptionEventStore, drainEvent, node, nthConfig, nodeMetadata, metrics, recorder) - <-interruptionEventStore.Workers -} - -// TODO move function and helpers to new package pkg/interruptioneventhandler/asg/launch -func processASGLaunchLifecycleEvent(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) { - if drainEvent.Kind != monitor.ASGLaunchLifecycleKind { - return + for _, eventHandler := range eventHandlers { + eventHandler.HandleEvent(event) } - - isNodeReady, err := isNodeReady(drainEvent.InstanceID, clientset) - if err != nil || !isNodeReady { - log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") - interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return - } - - log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") - nodeName := getNodeName(drainEvent, node, nthConfig) - - if drainEvent.PostDrainTask != nil { - runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) - } -} - -func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - // TODO Use allow list instead of denylist LOGIC - if drainEvent.Kind == monitor.ASGLaunchLifecycleKind { - return - } - - nodeFound := true - nodeName := getNodeName(drainEvent, node, nthConfig) - - nodeLabels, err := node.GetNodeLabels(nodeName) - if err != nil { - log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) - nodeFound = false - } - drainEvent.NodeLabels = nodeLabels - if drainEvent.PreDrainTask != nil { - runPreDrainTask(node, nodeName, drainEvent, metrics, recorder) - } - - podNameList, err := 
node.FetchPodNameList(nodeName) - if err != nil { - log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) - } - drainEvent.Pods = podNameList - err = node.LogPods(podNameList, nodeName) - if err != nil { - log.Err(err).Msg("There was a problem while trying to log all pod names on the node") - } - - if nthConfig.CordonOnly || (!nthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !nthConfig.EnableRebalanceDraining) { - err = cordonNode(node, nodeName, drainEvent, metrics, recorder) - } else { - err = cordonAndDrainNode(node, nodeName, drainEvent, metrics, recorder, nthConfig.EnableSQSTerminationDraining) - } - - if nthConfig.WebhookURL != "" { - webhook.Post(nodeMetadata, drainEvent, nthConfig) - } - - if err != nil { - interruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - } else { - interruptionEventStore.MarkAllAsProcessed(nodeName) - } - - if (err == nil || (!nodeFound && nthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { - runPostDrainTask(node, nodeName, drainEvent, metrics, recorder) - } -} - -// TODO Restructure indentation LOGIC -func getNodeName(drainEvent *monitor.InterruptionEvent, node node.Node, nthConfig config.Config) string { - nodeName := drainEvent.NodeName - if nthConfig.UseProviderId { - newNodeName, err := node.GetNodeNameFromProviderID(drainEvent.ProviderID) - - if err != nil { - log.Err(err).Msgf("Unable to get node name for node with ProviderID '%s' using original AWS event node name ", drainEvent.ProviderID) - } else { - nodeName = newNodeName - } - } - return nodeName -} - -func isNodeReady(instanceID string, clientset *kubernetes.Clientset) (bool, error) { - nodes, err := getNodesWithInstanceID(instanceID, clientset) - if err != nil { - return false, fmt.Errorf("getting nodes with instance ID: %w", err) - } - - if len(nodes) == 0 { - return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) - } - - for _, node := range nodes { - conditions := node.Status.Conditions - for _, condition := range conditions { - // TODO combine if statements LOGIC - if condition.Type != "Ready" { - continue - } - if condition.Status != "True" { - return false, fmt.Errorf("ec2 instance, %s, found, but not ready in cluster", instanceID) - } - } - } - return true, nil -} - -// Gets Nodes connected to K8s cluster -func getNodesWithInstanceID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - nodes, err := getNodesWithInstanceFromLabel(instanceID, clientset) - if err != nil { - return nil, err - } - if len(nodes) != 0 { - return nodes, nil - } - - nodes, err = getNodesWithInstanceFromProviderID(instanceID, clientset) - if err != nil { - return nil, err - } - return nodes, nil -} - -func getNodesWithInstanceFromLabel(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { - instanceIDLabel := "alpha.eksctl.io/instance-id" - instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) - if err != nil { - return nil, fmt.Errorf("bad label requirement: %w", err) - } - selector := labels.NewSelector().Add(*instanceIDReq) - nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) - if err != nil { - return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) - } - return nodeList.Items, nil -} - -func getNodesWithInstanceFromProviderID(instanceID string, clientset *kubernetes.Clientset) ([]v1.Node, error) { 
- nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) - } - - var filteredNodes []v1.Node - for _, node := range nodeList.Items { - if !strings.Contains(node.Spec.ProviderID, instanceID) { - continue - } - filteredNodes = append(filteredNodes, node) - } - return filteredNodes, nil -} - -func runPreDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - err := drainEvent.PreDrainTask(*drainEvent, node) - if err != nil { - log.Err(err).Msg("There was a problem executing the pre-drain task") - recorder.Emit(nodeName, observability.Warning, observability.PreDrainErrReason, observability.PreDrainErrMsgFmt, err.Error()) - } else { - recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg) - } - metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err) -} - -func cordonNode(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) error { - err := node.Cordon(nodeName, drainEvent.Description) - if err != nil { - if errors.IsNotFound(err) { - log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) - } else { - log.Err(err).Msg("There was a problem while trying to cordon the node") - recorder.Emit(nodeName, observability.Warning, observability.CordonErrReason, observability.CordonErrMsgFmt, err.Error()) - } - return err - } else { - log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned") - metrics.NodeActionsInc("cordon", nodeName, drainEvent.EventID, err) - recorder.Emit(nodeName, observability.Normal, observability.CordonReason, observability.CordonMsg) - } - return nil -} - -func cordonAndDrainNode(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder, sqsTerminationDraining bool) error { - err := node.CordonAndDrain(nodeName, drainEvent.Description, recorder.EventRecorder) - if err != nil { - if errors.IsNotFound(err) { - log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) - } else { - log.Err(err).Msg("There was a problem while trying to cordon and drain the node") - metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) - recorder.Emit(nodeName, observability.Warning, observability.CordonAndDrainErrReason, observability.CordonAndDrainErrMsgFmt, err.Error()) - } - return err - } else { - log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned and drained") - metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) - recorder.Emit(nodeName, observability.Normal, observability.CordonAndDrainReason, observability.CordonAndDrainMsg) - } - return nil -} - -func runPostDrainTask(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) { - err := drainEvent.PostDrainTask(*drainEvent, node) - if err != nil { - log.Err(err).Msg("There was a problem executing the post-drain task") - recorder.Emit(nodeName, observability.Warning, observability.PostDrainErrReason, observability.PostDrainErrMsgFmt, err.Error()) - } else { - recorder.Emit(nodeName, observability.Normal, observability.PostDrainReason, 
observability.PostDrainMsg) - } - metrics.NodeActionsInc("post-drain", nodeName, drainEvent.EventID, err) + <-interruptionEventStore.Workers } func getRegionFromQueueURL(queueURL string) string { diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go new file mode 100644 index 00000000..46d917a8 --- /dev/null +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -0,0 +1,144 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License + +package launch + +import ( + "context" + "fmt" + "strings" + + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/internal/common" + "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "github.com/aws/aws-node-termination-handler/pkg/observability" + "github.com/rs/zerolog/log" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" + "k8s.io/client-go/kubernetes" +) + +type Handler struct { + commonHandler *common.Handler + clientset *kubernetes.Clientset +} + +func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, nthConfig config.Config, metrics observability.Metrics, recorder observability.K8sEventRecorder, clientset *kubernetes.Clientset) *Handler { + commonHandler := &common.Handler{ + InterruptionEventStore: interruptionEventStore, + Node: node, + NthConfig: nthConfig, + Metrics: metrics, + Recorder: recorder, + } + + return &Handler{ + commonHandler: commonHandler, + clientset: clientset, + } +} + +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { + if !common.IsAllowedKind(drainEvent.Kind, monitor.ASGLaunchLifecycleKind) { + return + } + + isNodeReady, err := h.isNodeReady(drainEvent.InstanceID) + if err != nil || !isNodeReady { + log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") + h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + return + } + + log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") + nodeName, err := h.commonHandler.GetNodeName(drainEvent) + if err != nil { + log.Error().Err(err).Msg("unable to retrieve node name for ASG event processing") + } + + if drainEvent.PostDrainTask != nil { + h.commonHandler.RunPostDrainTask(nodeName, drainEvent) + } +} + +func (h *Handler) isNodeReady(instanceID string) (bool, error) { + nodes, err := h.getNodesWithInstanceID(instanceID) + if err != nil { + return false, fmt.Errorf("getting nodes with instance ID: %w", err) + } + + if len(nodes) == 0 { + return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) + } + + for _, node := range nodes { + conditions := node.Status.Conditions + for _, condition := range conditions { 
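+			// Kubelet readiness is surfaced as a NodeCondition of type "Ready"; any status other
+			// than "True" (i.e. "False" or "Unknown") means the node cannot schedule pods yet.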
+ if condition.Type == "Ready" && condition.Status != "True" { + return false, fmt.Errorf("EC2 instance, %s, found, but not ready in cluster", instanceID) + } + } + } + return true, nil +} + +// Gets Nodes connected to K8s cluster +func (h *Handler) getNodesWithInstanceID(instanceID string) ([]v1.Node, error) { + nodes, err := h.getNodesWithInstanceFromLabel(instanceID) + if err != nil { + return nil, err + } + if len(nodes) != 0 { + return nodes, nil + } + + nodes, err = h.getNodesWithInstanceFromProviderID(instanceID) + if err != nil { + return nil, err + } + return nodes, nil +} + +func (h *Handler) getNodesWithInstanceFromLabel(instanceID string) ([]v1.Node, error) { + instanceIDLabel := "alpha.eksctl.io/instance-id" + instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) + if err != nil { + return nil, fmt.Errorf("bad label requirement: %w", err) + } + selector := labels.NewSelector().Add(*instanceIDReq) + nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) + } + return nodeList.Items, nil +} + +func (h *Handler) getNodesWithInstanceFromProviderID(instanceID string) ([]v1.Node, error) { + nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) + } + + var filteredNodes []v1.Node + for _, node := range nodeList.Items { + if !strings.Contains(node.Spec.ProviderID, instanceID) { + continue + } + filteredNodes = append(filteredNodes, node) + } + return filteredNodes, nil +} diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go new file mode 100644 index 00000000..32ea8cae --- /dev/null +++ b/pkg/interruptionevent/draincordon/handler.go @@ -0,0 +1,139 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. 
See the License for the specific language governing +// permissions and limitations under the License + +package draincordon + +import ( + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/internal/common" + "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "github.com/aws/aws-node-termination-handler/pkg/observability" + "github.com/aws/aws-node-termination-handler/pkg/webhook" + "github.com/rs/zerolog/log" + "k8s.io/apimachinery/pkg/api/errors" +) + +var allowedKinds = []string{monitor.ASGLifecycleKind, monitor.RebalanceRecommendationKind, monitor.SQSTerminateKind, monitor.ScheduledEventKind, + monitor.SpotITNKind, monitor.StateChangeKind} + +type Handler struct { + commonHandler *common.Handler + nodeMetadata ec2metadata.NodeMetadata +} + +func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics, recorder observability.K8sEventRecorder) *Handler { + commonHandler := &common.Handler{ + InterruptionEventStore: interruptionEventStore, + Node: node, + NthConfig: nthConfig, + Metrics: metrics, + Recorder: recorder, + } + + return &Handler{ + commonHandler: commonHandler, + nodeMetadata: nodeMetadata, + } +} + +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { + if !common.IsAllowedKind(drainEvent.Kind, allowedKinds...) { + return + } + + nodeFound := true + nodeName, err := h.commonHandler.GetNodeName(drainEvent) + if err != nil { + log.Error().Err(err).Msg("unable to retrieve node name for draining or cordoning") + } + + nodeLabels, err := h.commonHandler.Node.GetNodeLabels(nodeName) + if err != nil { + log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) + nodeFound = false + } + drainEvent.NodeLabels = nodeLabels + if drainEvent.PreDrainTask != nil { + h.commonHandler.RunPreDrainTask(nodeName, drainEvent) + } + + podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName) + if err != nil { + log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) + } + drainEvent.Pods = podNameList + err = h.commonHandler.Node.LogPods(podNameList, nodeName) + if err != nil { + log.Err(err).Msg("There was a problem while trying to log all pod names on the node") + } + + if h.commonHandler.NthConfig.CordonOnly || (!h.commonHandler.NthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !h.commonHandler.NthConfig.EnableRebalanceDraining) { + err = h.cordonNode(nodeName, drainEvent) + } else { + err = h.cordonAndDrainNode(nodeName, drainEvent) + } + + if h.commonHandler.NthConfig.WebhookURL != "" { + webhook.Post(h.nodeMetadata, drainEvent, h.commonHandler.NthConfig) + } + + if err != nil { + h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + } else { + h.commonHandler.InterruptionEventStore.MarkAllAsProcessed(nodeName) + } + + if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { + h.commonHandler.RunPostDrainTask(nodeName, drainEvent) + } +} + +func (h *Handler) cordonNode(nodeName string, drainEvent *monitor.InterruptionEvent) error { + err := h.commonHandler.Node.Cordon(nodeName, drainEvent.Description) + if err != 
nil { + if errors.IsNotFound(err) { + log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) + } else { + log.Err(err).Msg("There was a problem while trying to cordon the node") + h.commonHandler.Recorder.Emit(nodeName, observability.Warning, observability.CordonErrReason, observability.CordonErrMsgFmt, err.Error()) + } + return err + } else { + log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned") + h.commonHandler.Metrics.NodeActionsInc("cordon", nodeName, drainEvent.EventID, err) + h.commonHandler.Recorder.Emit(nodeName, observability.Normal, observability.CordonReason, observability.CordonMsg) + } + return nil +} + +func (h *Handler) cordonAndDrainNode(nodeName string, drainEvent *monitor.InterruptionEvent) error { + err := h.commonHandler.Node.CordonAndDrain(nodeName, drainEvent.Description, h.commonHandler.Recorder.EventRecorder) + if err != nil { + if errors.IsNotFound(err) { + log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) + } else { + log.Err(err).Msg("There was a problem while trying to cordon and drain the node") + h.commonHandler.Metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) + h.commonHandler.Recorder.Emit(nodeName, observability.Warning, observability.CordonAndDrainErrReason, observability.CordonAndDrainErrMsgFmt, err.Error()) + } + return err + } else { + log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned and drained") + h.commonHandler.Metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err) + h.commonHandler.Recorder.Emit(nodeName, observability.Normal, observability.CordonAndDrainReason, observability.CordonAndDrainMsg) + } + return nil +} diff --git a/pkg/interruptionevent/internal/common/handler.go b/pkg/interruptionevent/internal/common/handler.go new file mode 100644 index 00000000..0c58366a --- /dev/null +++ b/pkg/interruptionevent/internal/common/handler.go @@ -0,0 +1,76 @@ +// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. 
See the License for the specific language governing +// permissions and limitations under the License + +package common + +import ( + "fmt" + + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "github.com/aws/aws-node-termination-handler/pkg/observability" + "github.com/rs/zerolog/log" +) + +type Handler struct { + InterruptionEventStore *interruptioneventstore.Store + Node node.Node + NthConfig config.Config + Metrics observability.Metrics + Recorder observability.K8sEventRecorder +} + +func (h *Handler) GetNodeName(drainEvent *monitor.InterruptionEvent) (string, error) { + if !h.NthConfig.UseProviderId { + return drainEvent.NodeName, nil + } + + nodeName, err := h.Node.GetNodeNameFromProviderID(drainEvent.ProviderID) + if err != nil { + return "", fmt.Errorf("parse node name from providerID=%q: %w", drainEvent.ProviderID, err) + } + return nodeName, nil +} + +func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { + err := drainEvent.PreDrainTask(*drainEvent, h.Node) + if err != nil { + log.Err(err).Msg("There was a problem executing the pre-drain task") + h.Recorder.Emit(nodeName, observability.Warning, observability.PreDrainErrReason, observability.PreDrainErrMsgFmt, err.Error()) + } else { + h.Recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg) + } + h.Metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err) +} + +func (h *Handler) RunPostDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { + err := drainEvent.PostDrainTask(*drainEvent, h.Node) + if err != nil { + log.Err(err).Msg("There was a problem executing the post-drain task") + h.Recorder.Emit(nodeName, observability.Warning, observability.PostDrainErrReason, observability.PostDrainErrMsgFmt, err.Error()) + } else { + h.Recorder.Emit(nodeName, observability.Normal, observability.PostDrainReason, observability.PostDrainMsg) + } + h.Metrics.NodeActionsInc("post-drain", nodeName, drainEvent.EventID, err) +} + +func IsAllowedKind(kind string, allowedKinds ...string) bool { + for _, allowedKind := range allowedKinds { + if kind == allowedKind { + return true + } + } + return false +} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index d155ecf3..2e4f2f52 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -131,10 +131,10 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, return &event, err } -func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { +func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEventMessage := LifecycleDetailMessage{} lifecycleEvent := LifecycleDetail{} - err := json.Unmarshal([]byte(*messageBody), &lifecycleEventMessage) + err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { return lifecycleEvent, err } @@ -142,7 +142,7 @@ func parseLifecycleEvent(messageBody *string) (LifecycleDetail, error) { if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) } else { - err = json.Unmarshal([]byte(fmt.Sprintf("%v", *messageBody)), &lifecycleEvent) + err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) } return lifecycleEvent, 
err } @@ -155,7 +155,7 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri if message == nil { return eventBridgeEvent, fmt.Errorf("ASG event message is nil") } - lifecycleEvent, err := parseLifecycleEvent(message.Body) + lifecycleEvent, err := parseLifecycleEvent(*message.Body) switch { case err != nil: From 9832d2b157c7d1a34cc1e49925b5a77b7ef6e354 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 8 Dec 2023 17:52:52 -0600 Subject: [PATCH 19/27] Revised formatting, logging, and error message issues --- pkg/interruptionevent/asg/launch/handler.go | 19 +++++++++++-------- pkg/interruptionevent/draincordon/handler.go | 10 ++++++++-- pkg/monitor/sqsevent/sqs-monitor.go | 6 ++++-- pkg/node/node_test.go | 6 +++--- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go index 46d917a8..a7a51188 100644 --- a/pkg/interruptionevent/asg/launch/handler.go +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -32,6 +32,8 @@ import ( "k8s.io/client-go/kubernetes" ) +const instanceIDLabel = "alpha.eksctl.io/instance-id" + type Handler struct { commonHandler *common.Handler clientset *kubernetes.Clientset @@ -64,7 +66,6 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { return } - log.Info().Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is found and ready in cluster") nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { log.Error().Err(err).Msg("unable to retrieve node name for ASG event processing") @@ -78,21 +79,24 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { func (h *Handler) isNodeReady(instanceID string) (bool, error) { nodes, err := h.getNodesWithInstanceID(instanceID) if err != nil { - return false, fmt.Errorf("getting nodes with instance ID: %w", err) + return false, fmt.Errorf("find node(s) with instanceId=%s: %w", instanceID, err) } if len(nodes) == 0 { - return false, fmt.Errorf("EC2 instance, %s, not found in cluster", instanceID) + log.Warn().Str("instanceID", instanceID).Msg("EC2 instance not found in cluster") + return false, nil } for _, node := range nodes { conditions := node.Status.Conditions for _, condition := range conditions { if condition.Type == "Ready" && condition.Status != "True" { - return false, fmt.Errorf("EC2 instance, %s, found, but not ready in cluster", instanceID) + log.Warn().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") + return false, nil } } } + log.Info().Str("instanceID", instanceID).Msg("EC2 instance is found and ready in cluster") return true, nil } @@ -114,15 +118,14 @@ func (h *Handler) getNodesWithInstanceID(instanceID string) ([]v1.Node, error) { } func (h *Handler) getNodesWithInstanceFromLabel(instanceID string) ([]v1.Node, error) { - instanceIDLabel := "alpha.eksctl.io/instance-id" instanceIDReq, err := labels.NewRequirement(instanceIDLabel, selection.Equals, []string{instanceID}) if err != nil { - return nil, fmt.Errorf("bad label requirement: %w", err) + return nil, fmt.Errorf("construct node search requirement %s=%s: %w", instanceIDLabel, instanceID, err) } selector := labels.NewSelector().Add(*instanceIDReq) nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) if err != nil { - return nil, fmt.Errorf("retreiving nodes with label, %s, from cluster: %w", instanceIDLabel, err) + return nil, fmt.Errorf("list nodes using selector %q: %w", 
selector.String(), err) } return nodeList.Items, nil } @@ -130,7 +133,7 @@ func (h *Handler) getNodesWithInstanceFromLabel(instanceID string) ([]v1.Node, e func (h *Handler) getNodesWithInstanceFromProviderID(instanceID string) ([]v1.Node, error) { nodeList, err := h.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { - return nil, fmt.Errorf("retreiving all nodes from cluster: %w", err) + return nil, fmt.Errorf("list all nodes: %w", err) } var filteredNodes []v1.Node diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go index 32ea8cae..b9e7596e 100644 --- a/pkg/interruptionevent/draincordon/handler.go +++ b/pkg/interruptionevent/draincordon/handler.go @@ -26,8 +26,14 @@ import ( "k8s.io/apimachinery/pkg/api/errors" ) -var allowedKinds = []string{monitor.ASGLifecycleKind, monitor.RebalanceRecommendationKind, monitor.SQSTerminateKind, monitor.ScheduledEventKind, - monitor.SpotITNKind, monitor.StateChangeKind} +var allowedKinds = []string{ + monitor.ASGLifecycleKind, + monitor.RebalanceRecommendationKind, + monitor.SQSTerminateKind, + monitor.ScheduledEventKind, + monitor.SpotITNKind, + monitor.StateChangeKind, +} type Handler struct { commonHandler *common.Handler diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 2e4f2f52..1cdcc985 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -136,25 +136,27 @@ func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { - return lifecycleEvent, err + return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract Message field: %w", err) } // Converts escaped JSON object to string, to lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) + err = fmt.Errorf("unmarshalling Message field from SQS message body: %w", err) } else { err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) + err = fmt.Errorf("unmarshalling SQS message body: %w", err) } return lifecycleEvent, err } // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { - log.Debug().Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} if message == nil { return eventBridgeEvent, fmt.Errorf("ASG event message is nil") } + log.Debug().Str("messageBody", *message.Body).Str("messageID", *message.MessageId).Msg("processing lifecycle event from ASG") lifecycleEvent, err := parseLifecycleEvent(*message.Body) switch { diff --git a/pkg/node/node_test.go b/pkg/node/node_test.go index e9837e6c..945b98af 100644 --- a/pkg/node/node_test.go +++ b/pkg/node/node_test.go @@ -63,13 +63,13 @@ func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node { return tNode } -func getNewNode(nthConfig config.Config, client *fake.Clientset) (*node.Node, error) { +func newNode(nthConfig config.Config, client *fake.Clientset) (*node.Node, error) { drainHelper := getDrainHelper(client) return node.NewWithValues(nthConfig, drainHelper, uptime.Uptime) } func TestDryRun(t *testing.T) { - tNode, err := getNewNode(config.Config{DryRun: true}, fake.NewSimpleClientset()) + tNode, err := newNode(config.Config{DryRun: true}, 
fake.NewSimpleClientset()) h.Ok(t, err) fakeRecorder := record.NewFakeRecorder(recorderBufferSize) @@ -109,7 +109,7 @@ func TestDryRun(t *testing.T) { func TestNewFailure(t *testing.T) { client := fake.NewSimpleClientset() - _, err := getNewNode(config.Config{}, client) + _, err := newNode(config.Config{}, client) h.Assert(t, true, "Failed to return error when creating new Node.", err != nil) } From 307b6a8a1daf17d090aa812ab95981b83486d2fd Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Mon, 18 Dec 2023 23:10:07 -0600 Subject: [PATCH 20/27] Refactors for log and error handling for eventhandlers. Refacors for ASG launch lifecycle bash script Refactored interruption event handling into a seperate package with distinct handlers for different interruption event Kinds. Updated ASG launch lifecycle hook acceptance test and eks-cluster run-test --- cmd/node-termination-handler.go | 13 +- pkg/interruptionevent/asg/launch/handler.go | 24 ++- pkg/interruptionevent/draincordon/handler.go | 23 ++- pkg/monitor/sqsevent/asg-lifecycle-event.go | 8 +- pkg/monitor/sqsevent/sqs-monitor.go | 24 ++- test/e2e/asg-launch-lifecycle-sqs-test | 174 ++++++++++++------- test/eks-cluster-test/node_group-spec.yaml | 15 -- test/k8s-local-cluster-test/run-test | 7 +- 8 files changed, 176 insertions(+), 112 deletions(-) delete mode 100644 test/eks-cluster-test/node_group-spec.yaml diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 6fd0c745..cf8ee1ad 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -60,7 +60,7 @@ const ( ) type interruptionEventHandler interface { - HandleEvent(*monitor.InterruptionEvent) + HandleEvent(*monitor.InterruptionEvent) error } func main() { @@ -350,8 +350,17 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Store, event *monitor.InterruptionEvent, eventHandlers []interruptionEventHandler, wg *sync.WaitGroup) { defer wg.Done() + + if event == nil { + log.Error().Msg("processing nil interruption event") + } + + var err error for _, eventHandler := range eventHandlers { - eventHandler.HandleEvent(event) + err = eventHandler.HandleEvent(event) + if err != nil { + log.Error().Err(err).Interface("event", event).Msg("handling event") + } } <-interruptionEventStore.Workers } diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go index a7a51188..9a0cfc82 100644 --- a/pkg/interruptionevent/asg/launch/handler.go +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -54,26 +54,34 @@ func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, n } } -func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { + if drainEvent == nil { + return fmt.Errorf("handling nil event") + } + if !common.IsAllowedKind(drainEvent.Kind, monitor.ASGLaunchLifecycleKind) { - return + return nil } isNodeReady, err := h.isNodeReady(drainEvent.InstanceID) - if err != nil || !isNodeReady { - log.Error().Err(err).Str("instanceID", drainEvent.InstanceID).Msg("EC2 instance is not found and ready in cluster") + if err != nil { + h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s: %w", drainEvent.InstanceID, err) + } + if !isNodeReady { 
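+		// The instance was found but its node is not Ready yet, so the launch lifecycle hook is
+		// not completed for this event; the event is cancelled rather than marked processed.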
h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return + return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s", drainEvent.InstanceID) } nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - log.Error().Err(err).Msg("unable to retrieve node name for ASG event processing") + return fmt.Errorf("unable to retrieve node name for ASG event processing: %w", err) } if drainEvent.PostDrainTask != nil { h.commonHandler.RunPostDrainTask(nodeName, drainEvent) } + return nil } func (h *Handler) isNodeReady(instanceID string) (bool, error) { @@ -83,7 +91,7 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { } if len(nodes) == 0 { - log.Warn().Str("instanceID", instanceID).Msg("EC2 instance not found in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance not found in cluster") return false, nil } @@ -91,7 +99,7 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { conditions := node.Status.Conditions for _, condition := range conditions { if condition.Type == "Ready" && condition.Status != "True" { - log.Warn().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") return false, nil } } diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go index b9e7596e..be89eb37 100644 --- a/pkg/interruptionevent/draincordon/handler.go +++ b/pkg/interruptionevent/draincordon/handler.go @@ -14,6 +14,8 @@ package draincordon import ( + "fmt" + "github.com/aws/aws-node-termination-handler/pkg/config" "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" "github.com/aws/aws-node-termination-handler/pkg/interruptionevent/internal/common" @@ -55,35 +57,39 @@ func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, n } } -func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { +func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { if !common.IsAllowedKind(drainEvent.Kind, allowedKinds...) 
{ - return + return nil } nodeFound := true nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - log.Error().Err(err).Msg("unable to retrieve node name for draining or cordoning") + return fmt.Errorf("unable to retrieve node name for draining or cordoning: %w", err) } nodeLabels, err := h.commonHandler.Node.GetNodeLabels(nodeName) if err != nil { - log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) + log.Warn().Err(err).Msgf("Unable to fetch node labels for nodeName=%s", nodeName) nodeFound = false + } else { + drainEvent.NodeLabels = nodeLabels } - drainEvent.NodeLabels = nodeLabels + if drainEvent.PreDrainTask != nil { h.commonHandler.RunPreDrainTask(nodeName, drainEvent) } podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName) if err != nil { - log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) + log.Warn().Err(err).Msgf("Unable to fetch running pods for nodeName=%s", nodeName) + } else { + drainEvent.Pods = podNameList } - drainEvent.Pods = podNameList + err = h.commonHandler.Node.LogPods(podNameList, nodeName) if err != nil { - log.Err(err).Msg("There was a problem while trying to log all pod names on the node") + log.Warn().Err(err).Msgf("There was a problem while trying to log all pod names on the node nodeName=%s", nodeName) } if h.commonHandler.NthConfig.CordonOnly || (!h.commonHandler.NthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !h.commonHandler.NthConfig.EnableRebalanceDraining) { @@ -105,6 +111,7 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) { if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil { h.commonHandler.RunPostDrainTask(nodeName, drainEvent) } + return nil } func (h *Handler) cordonNode(nodeName string, drainEvent *monitor.InterruptionEvent) error { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 3a9dd390..1c56ba6a 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -130,8 +130,12 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { - if message == nil || event == nil { - return nil, fmt.Errorf("event message is nil for ASG Instance Launch Event creation") + if event == nil { + return nil, fmt.Errorf("EventBridgeEvent is nil for ASG Instance Launch Event creation") + } + + if message == nil { + return nil, fmt.Errorf("SQS message is nil for ASG Instance Launch Event creation") } lifecycleDetail := &LifecycleDetail{} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 1cdcc985..4aebea33 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -136,27 +136,31 @@ func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { - return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract Message field: %w", err) + return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract 'Message' field: %w", err) } // Converts escaped JSON object to string, to 
lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) - err = fmt.Errorf("unmarshalling Message field from SQS message body: %w", err) + if err != nil { + err = fmt.Errorf("unmarshalling 'Message' field from SQS message body: %w", err) + } } else { err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) - err = fmt.Errorf("unmarshalling SQS message body: %w", err) + if err != nil { + err = fmt.Errorf("unmarshalling SQS message body: %w", err) + } } return lifecycleEvent, err } // processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + log.Debug().Interface("message", message).Msg("processing lifecycle event from ASG") eventBridgeEvent := EventBridgeEvent{} if message == nil { return eventBridgeEvent, fmt.Errorf("ASG event message is nil") } - log.Debug().Str("messageBody", *message.Body).Str("messageID", *message.MessageId).Msg("processing lifecycle event from ASG") lifecycleEvent, err := parseLifecycleEvent(*message.Body) switch { @@ -172,7 +176,7 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": - return eventBridgeEvent, fmt.Errorf("unsupported message type (%s) while parsing lifecycle event messsage from ASG", message.String()) + return eventBridgeEvent, fmt.Errorf("unsupported lifecycle transition while parsing lifecycle event messsage from ASG lifecycleTransition=%s", lifecycleEvent.LifecycleTransition) } eventBridgeEvent.Source = "aws.autoscaling" @@ -188,12 +192,16 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, interruptionEvent := &monitor.InterruptionEvent{} var err error - if message == nil || eventBridgeEvent == nil { - return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("event message is nil")}) + if eventBridgeEvent == nil { + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("EventBridgeEvent is nil for EventBridgeEvent processing")}) + } + if message == nil { + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("SQS message is nil for EventBridgeEvent processing")}) } switch eventBridgeEvent.Source { - // LifecycleTransitions other than LAUNCHING or TERMINATING will result in the interruptionEvent being uninitialized + /* LifecycleTransitions other than LAUNCHING and TERMINATING are invalid values. 
These values result in uninitialized interruptionEvents, whose + messages are later dropped */ case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index f92e0a3a..3988ab88 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -1,10 +1,7 @@ #!/bin/bash set -euo pipefail -REGION="us-west-2" -CLUSTER_NAME="nth-eks-cluster-test" - -node_group_name="nth-eks-cluster-test-spot-ng" +node_group_name="spot-ng" sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" node_policy_name="nth-test-node-policy" @@ -47,9 +44,9 @@ EOF cat << EOF > /tmp/queue-attributes.json { -"MessageRetentionPeriod": "300", -"Policy": "$(echo $sqs_queue_policy | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", -"SqsManagedSseEnabled": "true" + "MessageRetentionPeriod": "300", + "Policy": "$(echo $sqs_queue_policy | sed 's/\"/\\"/g' | tr -d -s '\n' " ")", + "SqsManagedSseEnabled": "true" } EOF @@ -140,6 +137,15 @@ EOF ##### SETUP ##### +function validate_aws_account { + if [[ -n "$account_id" ]]; then + echo "๐Ÿฅ‘ AWS Account ID: $account_id" + else + echo "โŒ Failed to retrieve AWS Account ID โŒ" + exit 1 + fi +} + ### SQS ### function provision_sqs_queue { queue_exists=$(aws sqs list-queues --queue-name-prefix $sqs_queue_name) @@ -178,27 +184,8 @@ function subscribe_sqs_to_sns { } ### NODEGROUP ### -function provision_node_group { +function update_node_group { create_node_policy - node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - if [[ -n $node_group_exists ]]; then - get_node_role_name - delete_node_group - echo "" - - node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - echo -n "Node group Deleting." - while [[ -n $node_group_exists ]]; do - echo -n "." - node_group_exists=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --name $node_group_name || :) - sleep 10 - done - echo "" - sleep 20 - fi - - echo "๐Ÿฅ‘ Provisioning Spot Node Group" - eksctl create nodegroup --config-file=$NODE_GROUP_CONFIG_FILE echo "๐Ÿฅ‘ Attaching Node policy to Node role" get_node_role_name @@ -332,7 +319,8 @@ function start_FIS_experiment { create_FIS_role create_experiment_template echo "๐Ÿฅ‘ Starting Experiment" - experiment_start_time=$(aws fis start-experiment --experiment-template-id $template_id | jq -r '.experiment.startTime') + experiment_start_time=$(date +%s) + experiment=$(aws fis start-experiment --experiment-template-id $template_id) } @@ -344,20 +332,51 @@ function is_new_instance { fi } +function convert_date_to_epoch_seconds { + IFS='T' read -r date_part time_part <<< "$1" + IFS='-' read -r year month day <<< "$date_part" + IFS=':' read -r hour minute second_fractional <<< "$time_part" + IFS='.' 
read -r second fraction <<< "$second_fractional" + IFS=':' read -r offset_hours offset_minutes <<< "${time_part:16:5}" + + if [[ $time_part =~ .*"-".* ]]; then + offset_hours=$((offset_hours * -1)) + offset_minutes=$((offset_minutes * -1)) + fi + + total_days=$(((year - 1970) * 365 + (year - 1970)/4)) + for ((i = 1; i < month; i++)); do + total_days=$((total_days + $(cal $i $year | awk 'NF {DAYS = $NF} END {print DAYS}'))) + done + total_days=$((total_days + day - 1)) + total_seconds=$((total_days * 86400 + (hour + offset_hours) * 3600 + (minute + offset_minutes) * 60 + second)) +} + function get_launch_activity { + max_duration=$((5 * 60)) + start_time=$(date +%s) + launch_activity="" - while [[ -z $launch_activity ]]; do + while [[ -z $launch_activity ]]; do + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [[ $elapsed_time -ge $max_duration ]]; then + echo "โŒ Failed to find a new launched instance. Timeout Reached โŒ" + exit 1 + fi + sleep 5 activities=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name) - activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode]' <<< $activities) + activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode, .StartTime]' <<< $activities) num_activities=$(jq -r 'length' <<< $activities_details) - for i in $(seq 0 3 $((--num_activities))); do + for i in $(seq 0 4 $((--num_activities))); do id=$(jq -r .[$i] <<< $activities_details) description=$(jq -r .[$((++i))] <<< $activities_details) status=$(jq -r .[$((i+=2))] <<< $activities_details) + start=$(jq -r .[$((i+=3))] <<< $activities_details) activity_instance=${description##*:} - is_new_instance $activity_instance - if [[ $description =~ .*"Launching".* && $is_new == "true" ]]; then + convert_date_to_epoch_seconds $start + if [[ $description =~ .*"Launching".* && $total_seconds -gt $experiment_start_time ]]; then launch_activity=$id echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" break @@ -369,20 +388,35 @@ function get_launch_activity { function test_launch_lifecycle { aws sqs receive-message --queue-url $queue_url echo -n "๐Ÿฅ‘ Waiting for launch hook completion." + + max_duration=$((4 * 60)) + start_time=$(date +%s) while [[ true ]]; do + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [[ $elapsed_time -ge $max_duration ]]; then + echo "" + echo "โŒ Launch Lifecycle not Completed. Timeout Reached โŒ" + exit 1 + fi + activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[].StatusCode') if [[ $activity_status == "Successful" ]]; then echo "" echo "โœ… Launch Lifecycle Successfully Completed โœ…" - exit_policy="exit 0" - break + exit 0 fi if [[ $activity_status == "Cancelled" ]]; then + echo "" + echo "โŒ Launch Lifecycle Cancelled โŒ" + exit 1 + fi + + if [[ $activity_status == "Failed" ]]; then echo "" echo "โŒ Launch Lifecycle Failed โŒ" - exit_policy="exit 1" - break + exit 1 fi echo -n "." 
sleep 10 @@ -395,27 +429,43 @@ function clean_up { echo "=====================================================================================================" echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" - pod_id=$(get_nth_worker_pod || :) - kubectl logs $pod_id --namespace kube-system || : + print_logs echo "๐Ÿฅ‘ Uninstalling NTH helm chart" helm uninstall "$CLUSTER_NAME-acth" -n kube-system delete_node_group - echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" - aws sns unsubscribe --subscription-arn $subscription_arn - echo "๐Ÿฅ‘ Deleting SQS queue" - aws sqs delete-queue --queue-url $queue_url - echo "๐Ÿฅ‘ Deleting SNS topic" - aws sns delete-topic --topic-arn $sns_arn - echo "๐Ÿฅ‘ Deleting FIS experiment template" - deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + if [[ -n $subscription_arn ]]; then + echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" + aws sns unsubscribe --subscription-arn $subscription_arn + fi + if [[ -n $queue_url ]]; then + echo "๐Ÿฅ‘ Deleting SQS queue" + aws sqs delete-queue --queue-url $queue_url + fi + if [[ -n $sns_arn ]]; then + echo "๐Ÿฅ‘ Deleting SNS topic" + aws sns delete-topic --topic-arn $sns_arn + fi + if [[ -n $template_id ]]; then + echo "๐Ÿฅ‘ Deleting FIS experiment template" + deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + fi echo "๐Ÿฅ‘ Detaching FIS role policy" aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn echo "๐Ÿฅ‘ Deleting FIS role" aws iam delete-role --role-name $fis_role_name echo "๐Ÿฅ‘ Deleting autoscaling role" aws iam delete-service-linked-role --role-name $auto_scaling_role_name - echo "๐Ÿฅ‘ Deleting Node role policy" - aws iam delete-policy --policy-arn $node_policy_arn + if [[ -n $node_policy_arn ]]; then + echo "๐Ÿฅ‘ Deleting Node role policy" + aws iam delete-policy --policy-arn $node_policy_arn + fi +} + +function print_logs { + pod_id=$(get_nth_worker_pod || :) + if [[ -n $pod_id ]]; then + kubectl logs $pod_id --namespace kube-system || : + fi } function delete_node_group { @@ -426,21 +476,15 @@ function delete_node_group { echo "๐Ÿฅ‘ Detaching NTH Node Group policy" aws iam detach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn fi - echo "๐Ÿฅ‘ Deleting NTH Node Group" - eksctl delete nodegroup -f $NODE_GROUP_CONFIG_FILE --approve -} - -function main { - provision_sqs_queue - provision_sns_topic - subscribe_sqs_to_sns - provision_node_group - install_helm - start_FIS_experiment - get_launch_activity - test_launch_lifecycle - trap "clean_up" EXIT - eval $exit_policy } -main +trap "clean_up" EXIT +validate_aws_account +provision_sqs_queue +provision_sns_topic +subscribe_sqs_to_sns +update_node_group +install_helm +start_FIS_experiment +get_launch_activity +test_launch_lifecycle diff --git a/test/eks-cluster-test/node_group-spec.yaml b/test/eks-cluster-test/node_group-spec.yaml deleted file mode 100644 index 2fa39a78..00000000 --- a/test/eks-cluster-test/node_group-spec.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: eksctl.io/v1alpha5 -kind: ClusterConfig -metadata: - name: nth-eks-cluster-test - region: us-west-2 -managedNodeGroups: - - name: nth-eks-cluster-test-spot-ng - instanceType: t3.medium - amiFamily: AmazonLinux2 - desiredCapacity: 2 - minSize: 2 - maxSize: 2 - spot: true -iam: - withOIDC: true \ No newline at end of file diff --git a/test/k8s-local-cluster-test/run-test 
b/test/k8s-local-cluster-test/run-test index e9e5388b..64839292 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -275,16 +275,15 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite function is_denylisted { - is_denied="false" if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_denied="true" + return 1 fi + return 0 } i=0 for assert_script in $ASSERTION_SCRIPTS; do - is_denylisted $assert_script - if[[ $is_denied == "true" ]]; then continue; fi + if [[ is_denylisted $assert_script ]]; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From b4ea84a12b3fb563e8d8b6fa72bf6e417a5b47fd Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 20 Dec 2023 14:13:58 -0600 Subject: [PATCH 21/27] Refactored error and logging messages and bash script tests for ASG launch lifecycle event --- cmd/node-termination-handler.go | 2 + pkg/interruptionevent/asg/launch/handler.go | 14 ++--- pkg/interruptionevent/draincordon/handler.go | 16 ++++-- pkg/monitor/sqsevent/asg-lifecycle-event.go | 4 +- pkg/monitor/sqsevent/sqs-monitor.go | 30 +++++----- test/e2e/asg-launch-lifecycle-sqs-test | 60 +++++++++++++------- test/k8s-local-cluster-test/run-test | 6 +- 7 files changed, 82 insertions(+), 50 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index cf8ee1ad..6145e0e5 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -353,6 +353,8 @@ func processInterruptionEvent(interruptionEventStore *interruptioneventstore.Sto if event == nil { log.Error().Msg("processing nil interruption event") + <-interruptionEventStore.Workers + return } var err error diff --git a/pkg/interruptionevent/asg/launch/handler.go b/pkg/interruptionevent/asg/launch/handler.go index 9a0cfc82..00df82c4 100644 --- a/pkg/interruptionevent/asg/launch/handler.go +++ b/pkg/interruptionevent/asg/launch/handler.go @@ -56,7 +56,7 @@ func New(interruptionEventStore *interruptioneventstore.Store, node node.Node, n func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { if drainEvent == nil { - return fmt.Errorf("handling nil event") + return fmt.Errorf("drainEvent is nil") } if !common.IsAllowedKind(drainEvent.Kind, monitor.ASGLaunchLifecycleKind) { @@ -66,16 +66,16 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { isNodeReady, err := h.isNodeReady(drainEvent.InstanceID) if err != nil { h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s: %w", drainEvent.InstanceID, err) + return fmt.Errorf("check if node (instanceID=%s) is present and ready: %w", drainEvent.InstanceID, err) } if !isNodeReady { h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) - return fmt.Errorf("EC2 instance is not found and ready in cluster instanceID=%s", drainEvent.InstanceID) + return nil } nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - return fmt.Errorf("unable to retrieve node name for ASG event processing: %w", err) + return fmt.Errorf("get node name for instanceID=%s: %w", drainEvent.InstanceID, err) } if drainEvent.PostDrainTask != nil { @@ -91,7 +91,7 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { } if len(nodes) == 0 { - log.Info().Str("instanceID", 
instanceID).Msg("EC2 instance not found in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance not found") return false, nil } @@ -99,12 +99,12 @@ func (h *Handler) isNodeReady(instanceID string) (bool, error) { conditions := node.Status.Conditions for _, condition := range conditions { if condition.Type == "Ready" && condition.Status != "True" { - log.Info().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance found, but not ready") return false, nil } } } - log.Info().Str("instanceID", instanceID).Msg("EC2 instance is found and ready in cluster") + log.Info().Str("instanceID", instanceID).Msg("EC2 instance is found and ready") return true, nil } diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go index be89eb37..0360a31c 100644 --- a/pkg/interruptionevent/draincordon/handler.go +++ b/pkg/interruptionevent/draincordon/handler.go @@ -65,12 +65,16 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { nodeFound := true nodeName, err := h.commonHandler.GetNodeName(drainEvent) if err != nil { - return fmt.Errorf("unable to retrieve node name for draining or cordoning: %w", err) + return fmt.Errorf("get node name for instanceID=%s: %w", drainEvent.InstanceID, err) } nodeLabels, err := h.commonHandler.Node.GetNodeLabels(nodeName) if err != nil { - log.Warn().Err(err).Msgf("Unable to fetch node labels for nodeName=%s", nodeName) + log.Warn(). + Err(err). + Interface("fallbackNodeLabels", drainEvent.NodeLabels). + Str("nodeName", nodeName). + Msg("Failed to get node labels. Proceeding with fallback labels") nodeFound = false } else { drainEvent.NodeLabels = nodeLabels @@ -82,14 +86,18 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName) if err != nil { - log.Warn().Err(err).Msgf("Unable to fetch running pods for nodeName=%s", nodeName) + log.Warn(). + Err(err). + Strs("fallbackPodNames", podNameList). + Str("nodeName", nodeName). + Msg("Failed to fetch pod names. 
Proceeding with fallback pod names") } else { drainEvent.Pods = podNameList } err = h.commonHandler.Node.LogPods(podNameList, nodeName) if err != nil { - log.Warn().Err(err).Msgf("There was a problem while trying to log all pod names on the node nodeName=%s", nodeName) + log.Warn().Err(err).Str("nodeName", nodeName).Msg("Failed to log pods") } if h.commonHandler.NthConfig.CordonOnly || (!h.commonHandler.NthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !h.commonHandler.NthConfig.EnableRebalanceDraining) { diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 1c56ba6a..c1262519 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -131,11 +131,11 @@ func (m SQSMonitor) continueLifecycleAction(lifecycleDetail *LifecycleDetail) (* // Completes the ASG launch lifecycle hook if the new EC2 instance launched by ASG is Ready in the cluster func (m SQSMonitor) createAsgInstanceLaunchEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { if event == nil { - return nil, fmt.Errorf("EventBridgeEvent is nil for ASG Instance Launch Event creation") + return nil, fmt.Errorf("event is nil") } if message == nil { - return nil, fmt.Errorf("SQS message is nil for ASG Instance Launch Event creation") + return nil, fmt.Errorf("message is nil") } lifecycleDetail := &LifecycleDetail{} diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 4aebea33..18febbb1 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -38,7 +38,9 @@ const ( // SQSMonitorKind is a const to define this monitor kind SQSMonitorKind = "SQS_MONITOR" // ASGTagName is the name of the instance tag whose value is the AutoScaling group name - ASGTagName = "aws:autoscaling:groupName" + ASGTagName = "aws:autoscaling:groupName" + ASGTerminatingLifecycleTransition = "autoscaling:EC2_INSTANCE_TERMINATING" + ASGLaunchingLifecycleTransition = "autoscaling:EC2_INSTANCE_LAUNCHING" ) // SQSMonitor is a struct definition that knows how to process events from Amazon EventBridge @@ -136,18 +138,18 @@ func parseLifecycleEvent(message string) (LifecycleDetail, error) { lifecycleEvent := LifecycleDetail{} err := json.Unmarshal([]byte(message), &lifecycleEventMessage) if err != nil { - return lifecycleEvent, fmt.Errorf("unmarshalling SQS message body to extract 'Message' field: %w", err) + return lifecycleEvent, fmt.Errorf("unmarshalling SQS message: %w", err) } // Converts escaped JSON object to string, to lifecycle event if lifecycleEventMessage.Message != nil { err = json.Unmarshal([]byte(fmt.Sprintf("%v", lifecycleEventMessage.Message)), &lifecycleEvent) if err != nil { - err = fmt.Errorf("unmarshalling 'Message' field from SQS message body: %w", err) + err = fmt.Errorf("unmarshalling message body from '.Message': %w", err) } } else { err = json.Unmarshal([]byte(fmt.Sprintf("%v", message)), &lifecycleEvent) if err != nil { - err = fmt.Errorf("unmarshalling SQS message body: %w", err) + err = fmt.Errorf("unmarshalling message body: %w", err) } } return lifecycleEvent, err @@ -174,9 +176,9 @@ func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBri } return eventBridgeEvent, skip{err} - case lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" && - lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_LAUNCHING": - return eventBridgeEvent, 
fmt.Errorf("unsupported lifecycle transition while parsing lifecycle event messsage from ASG lifecycleTransition=%s", lifecycleEvent.LifecycleTransition) + case lifecycleEvent.LifecycleTransition != ASGTerminatingLifecycleTransition && + lifecycleEvent.LifecycleTransition != ASGLaunchingLifecycleTransition: + return eventBridgeEvent, fmt.Errorf("lifecycle transition must be %s or %s. Got %s", ASGTerminatingLifecycleTransition, ASGLaunchingLifecycleTransition, lifecycleEvent.LifecycleTransition) } eventBridgeEvent.Source = "aws.autoscaling" @@ -193,27 +195,27 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, var err error if eventBridgeEvent == nil { - return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("EventBridgeEvent is nil for EventBridgeEvent processing")}) + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("eventBridgeEvent is nil")}) } if message == nil { - return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("SQS message is nil for EventBridgeEvent processing")}) + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, fmt.Errorf("message is nil")}) } switch eventBridgeEvent.Source { - /* LifecycleTransitions other than LAUNCHING and TERMINATING are invalid values. These values result in uninitialized interruptionEvents, whose - messages are later dropped */ case "aws.autoscaling": lifecycleEvent := LifecycleDetail{} err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if err != nil { interruptionEvent, err = nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) } - if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_LAUNCHING" { + if lifecycleEvent.LifecycleTransition == ASGLaunchingLifecycleTransition { interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) - } else if lifecycleEvent.LifecycleTransition == "autoscaling:EC2_INSTANCE_TERMINATING" { + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) + } else if lifecycleEvent.LifecycleTransition == ASGTerminatingLifecycleTransition { interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) } - return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) + return interruptionEventWrappers case "aws.ec2": if eventBridgeEvent.DetailType == "EC2 Instance State-change Notification" { diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 3988ab88..4472f015 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -14,6 +14,7 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" account_id=$(aws sts get-caller-identity | jq -r '.Account') nth_label="Use-Case=NTH" +heartbeat_timeout=$((3 * 60)) ##### JSON FILES ##### @@ -235,8 +236,22 @@ function update_ASG { create_auto_scaling_role echo "๐Ÿฅ‘ Creating Lifecycle Hooks" - aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Launch-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_LAUNCHING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="ABANDON" 
- aws autoscaling put-lifecycle-hook --lifecycle-hook-name "Terminate-LC-Hook" --auto-scaling-group-name $asg_name --lifecycle-transition="autoscaling:EC2_INSTANCE_TERMINATING" --heartbeat-timeout=180 --notification-target-arn=$sns_arn --role-arn=$auto_scaling_role_arn --default-result="CONTINUE" + aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name "Launch-LC-Hook" \ + --auto-scaling-group-name $asg_name \ + --lifecycle-transition="autoscaling:EC2_INSTANCE_LAUNCHING" \ + --heartbeat-timeout=$heartbeat_timeout \ + --notification-target-arn=$sns_arn \ + --role-arn=$auto_scaling_role_arn \ + --default-result="ABANDON" + aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name "Terminate-LC-Hook" \ + --auto-scaling-group-name $asg_name \ + --lifecycle-transition="autoscaling:EC2_INSTANCE_TERMINATING" \ + --heartbeat-timeout=$heartbeat_timeout \ + --notification-target-arn=$sns_arn \ + --role-arn=$auto_scaling_role_arn \ + --default-result="CONTINUE" } function create_auto_scaling_role { @@ -339,6 +354,11 @@ function convert_date_to_epoch_seconds { IFS='.' read -r second fraction <<< "$second_fractional" IFS=':' read -r offset_hours offset_minutes <<< "${time_part:16:5}" + # Convert time strings to base-10 integers + year=$((10#$year + 0)); month=$((10#$month + 0)); day=$((10#$day + 0)) + hour=$((10#$hour + 0)); minute=$((10#$minute + 0)); second=$((10#$second + 0)) + offset_hours=$((10#$offset_hours + 0)); offset_minutes=$((10#$offset_minutes + 0)) + if [[ $time_part =~ .*"-".* ]]; then offset_hours=$((offset_hours * -1)) offset_minutes=$((offset_minutes * -1)) @@ -389,12 +409,11 @@ function test_launch_lifecycle { aws sqs receive-message --queue-url $queue_url echo -n "๐Ÿฅ‘ Waiting for launch hook completion." - max_duration=$((4 * 60)) start_time=$(date +%s) while [[ true ]]; do current_time=$(date +%s) elapsed_time=$((current_time - start_time)) - if [[ $elapsed_time -ge $max_duration ]]; then + if [[ $elapsed_time -ge $heartbeat_timeout ]]; then echo "" echo "โŒ Launch Lifecycle not Completed. Timeout Reached โŒ" exit 1 @@ -405,17 +424,9 @@ function test_launch_lifecycle { echo "" echo "โœ… Launch Lifecycle Successfully Completed โœ…" exit 0 - fi - - if [[ $activity_status == "Cancelled" ]]; then + elif [[ $activity_status == "Cancelled" || $activity_status == "Failed" ]]; then echo "" - echo "โŒ Launch Lifecycle Cancelled โŒ" - exit 1 - fi - - if [[ $activity_status == "Failed" ]]; then - echo "" - echo "โŒ Launch Lifecycle Failed โŒ" + echo "โŒ Launch Lifecycle $activity_status โŒ" exit 1 fi echo -n "." @@ -430,9 +441,8 @@ function clean_up { echo "๐Ÿงน Cleaning up SQS, SNS, NodeGroup, IAM, FIS ๐Ÿงน" echo "=====================================================================================================" print_logs - echo "๐Ÿฅ‘ Uninstalling NTH helm chart" - helm uninstall "$CLUSTER_NAME-acth" -n kube-system - delete_node_group + uninstall_helm + delete_node_group_policy if [[ -n $subscription_arn ]]; then echo "๐Ÿฅ‘ Unsubscribing SNS from SQS" aws sns unsubscribe --subscription-arn $subscription_arn @@ -465,13 +475,23 @@ function print_logs { pod_id=$(get_nth_worker_pod || :) if [[ -n $pod_id ]]; then kubectl logs $pod_id --namespace kube-system || : + else + echo "โŒ Failed to get pod ID. 
Unable to print logs โŒ" + fi +} + +function uninstall_helm { + helm_exists=$(helm ls -A | grep "$CLUSTER_NAME-acth") + if [[ -n $helm_exists ]]; then + echo "๐Ÿฅ‘ Uninstalling NTH helm chart" + helm uninstall "$CLUSTER_NAME-acth" -n kube-system fi } -function delete_node_group { - echo "Node Role Name: $node_role_name" +function delete_node_group_policy { + if [[ -z $node_role_name || -z $node_policy_name ]]; then return; fi + node_policy_exists=$(aws iam list-attached-role-policies --role-name $node_role_name | grep "$node_policy_name" || :) - echo $node_policy_exists if [[ -n $node_policy_exists ]]; then echo "๐Ÿฅ‘ Detaching NTH Node Group policy" aws iam detach-role-policy --role-name $node_role_name --policy-arn $node_policy_arn diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 64839292..34385364 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -276,14 +276,14 @@ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule function is_denylisted { if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - return 1 + return 0 fi - return 0 + return 1 } i=0 for assert_script in $ASSERTION_SCRIPTS; do - if [[ is_denylisted $assert_script ]]; then continue; fi + if is_denylisted $assert_script; then continue; fi reset_cluster START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ") From f52c7b57474f6389c9fa84a3bfeb43381e6f978c Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 21 Dec 2023 13:42:43 -0600 Subject: [PATCH 22/27] Update ReadME for ASG launch lifecycle hook changes --- README.md | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3f23b4bd..d609bb72 100644 --- a/README.md +++ b/README.md @@ -218,8 +218,9 @@ You'll need the following AWS infrastructure components: 1. Amazon Simple Queue Service (SQS) Queue 2. AutoScaling Group Termination Lifecycle Hook -3. Amazon EventBridge Rule -4. IAM Role for the aws-node-termination-handler Queue Processing Pods +3. AutoScaling Group Launch Lifecycle Hook (optional) +4. Amazon EventBridge Rule +5. IAM Role for the aws-node-termination-handler Queue Processing Pods #### 1. Create an SQS Queue: @@ -294,7 +295,37 @@ aws autoscaling put-lifecycle-hook \ --role-arn ``` -#### 3. Tag the Instances: +#### 3. Create an ASG Launch Lifecycle Hook (optional): + +If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, then a new instance will be launched upon another's termination. The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. 
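+
+Note: the EventBridge rules created later in this README cover termination-related events only. If you route this launch hook through EventBridge, you will also need a rule that matches the launch lifecycle action and targets the SQS queue from Step 1. The following is only a sketch: the rule name is illustrative, and the queue ARN is the same placeholder used elsewhere in this README. The detail type matches the `EC2 Instance-launch Lifecycle Action` events that ASG emits for launch lifecycle hooks.
+
+```
+aws events put-rule \
+  --name MyK8sASGLaunchRule \
+  --event-pattern "{\"source\":[\"aws.autoscaling\"],\"detail-type\":[\"EC2 Instance-launch Lifecycle Action\"]}"
+
+aws events put-targets --rule MyK8sASGLaunchRule \
+  --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue"
+```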
+ +Here is the AWS CLI command to create a launch lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 +``` + +If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 \ + --notification-target-arn \ + --role-arn +``` + +The hook will be completed by NTH upon the instance's verified connection as a node. If not, the ABANDON default result will cause the instance to be terminated, and a new one to replace it repeating the same verification process. + +#### 4. Tag the Instances: By default the aws-node-termination-handler will only manage terminations for instances tagged with `key=aws-node-termination-handler/managed`. The value of the key does not matter. @@ -320,7 +351,7 @@ You can also control what resources NTH manages by adding the resource ARNs to y Take a look at the docs on how to [create rules that only manage certain ASGs](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html), and read about all the [supported ASG events](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-event-reference.html). -#### 4. Create Amazon EventBridge Rules +#### 5. Create Amazon EventBridge Rules You may skip this step if sending events from ASG to SQS directly. @@ -367,7 +398,7 @@ aws events put-targets --rule MyK8sScheduledChangeRule \ --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue" ``` -#### 5. Create an IAM Role for the Pods +#### 6. Create an IAM Role for the Pods There are many different ways to allow the aws-node-termination-handler pods to assume a role: From ebf8e43b16fe4092cbc0ac7dd74a100709b80266 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 26 Dec 2023 10:57:49 -0600 Subject: [PATCH 23/27] Fixed changes for README update --- README.md | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index d609bb72..8121be62 100644 --- a/README.md +++ b/README.md @@ -218,9 +218,11 @@ You'll need the following AWS infrastructure components: 1. Amazon Simple Queue Service (SQS) Queue 2. AutoScaling Group Termination Lifecycle Hook -3. AutoScaling Group Launch Lifecycle Hook (optional) -4. Amazon EventBridge Rule -5. IAM Role for the aws-node-termination-handler Queue Processing Pods +3. Amazon EventBridge Rule +4. IAM Role for the aws-node-termination-handler Queue Processing Pods + +Optional AWS infrastructure components: +1. AutoScaling Group Launch Lifecycle Hook #### 1. Create an SQS Queue: @@ -271,7 +273,9 @@ There are some caveats when using [server side encryption with SQS](https://docs #### 2. 
Create an ASG Termination Lifecycle Hook: -Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: +##### 2.1. Send Notification via EventBridge + +This will configure ASG to send termination notifications to EventBridge. ``` aws autoscaling put-lifecycle-hook \ @@ -282,7 +286,9 @@ aws autoscaling put-lifecycle-hook \ --heartbeat-timeout=300 ``` -If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: +##### 2.2. Send notifications directly to SQS + +This will configure ASG to send termination notifications directly to an SQS queue monitored by NTH. ``` aws autoscaling put-lifecycle-hook \ @@ -291,39 +297,43 @@ aws autoscaling put-lifecycle-hook \ --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ --default-result=CONTINUE \ --heartbeat-timeout=300 \ - --notification-target-arn \ + --notification-target-arn \ --role-arn ``` -#### 3. Create an ASG Launch Lifecycle Hook (optional): +#### 3. Handle ASG Instance Launch Lifecycle Notifications (optional): -If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, then a new instance will be launched upon another's termination. The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. +If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, a new instance will be launched before another's termination. The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. -Here is the AWS CLI command to create a launch lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: +##### 3.1. Send Notification via EventBridge + +This will configure ASG to send launch notifications to EventBridge. ``` aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-term-launch-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ --auto-scaling-group-name=my-k8s-asg \ --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ --default-result="ABANDON" \ --heartbeat-timeout=300 ``` -If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: +##### 3.2. Send notifications directly to SQS + +This will configure ASG to send launch notifications directly to an SQS queue monitored by NTH. ``` aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-term-launch-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ --auto-scaling-group-name=my-k8s-asg \ --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ --default-result="ABANDON" \ --heartbeat-timeout=300 \ - --notification-target-arn \ + --notification-target-arn \ --role-arn ``` -The hook will be completed by NTH upon the instance's verified connection as a node. If not, the ABANDON default result will cause the instance to be terminated, and a new one to replace it repeating the same verification process. 
+When NTH receives a launch notification, it will periodically check for a node backed by the EC2 instance to join the cluster and for the node to have a status of 'ready.' Once a node becomes ready, NTH will complete the lifecycle hook, prompting the ASG to proceed with terminating the previous instance. If the lifecycle hook is not completed before the timeout, the ASG will take the default action. If the default action is 'ABANDON,' the new instance will be terminated, and the notification process will be repeated with another new instance. #### 4. Tag the Instances: From d16182d2befd4b4453e24a9dc5755374beab3fff Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 2 Jan 2024 11:56:21 -0600 Subject: [PATCH 24/27] Use ASG launch event in SQS testing --- pkg/monitor/sqsevent/sqs-monitor.go | 1 + pkg/monitor/sqsevent/sqs-monitor_test.go | 32 ++++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 18febbb1..7dea9308 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -207,6 +207,7 @@ func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, err = json.Unmarshal([]byte(eventBridgeEvent.Detail), &lifecycleEvent) if err != nil { interruptionEvent, err = nil, fmt.Errorf("unmarshaling message, %s, from ASG lifecycle event: %w", *message.MessageId, err) + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) } if lifecycleEvent.LifecycleTransition == ASGLaunchingLifecycleTransition { interruptionEvent, err = m.createAsgInstanceLaunchEvent(eventBridgeEvent, message) diff --git a/pkg/monitor/sqsevent/sqs-monitor_test.go b/pkg/monitor/sqsevent/sqs-monitor_test.go index 61199ed2..8e827377 100644 --- a/pkg/monitor/sqsevent/sqs-monitor_test.go +++ b/pkg/monitor/sqsevent/sqs-monitor_test.go @@ -67,6 +67,26 @@ var asgLifecycleEvent = sqsevent.EventBridgeEvent{ }`), } +var asgLaunchLifecycleEvent = sqsevent.EventBridgeEvent{ + Version: "0", + ID: "83c632dd-0145-1ab0-ae93-a756ebf429b5", + DetailType: "EC2 Instance-launch Lifecycle Action", + Source: "aws.autoscaling", + Account: "123456789012", + Time: "2020-07-01T22:30:58Z", + Region: "us-east-1", + Resources: []string{ + "arn:aws:autoscaling:us-east-1:123456789012:autoScalingGroup:c4c64181-52c1-dd3f-20bb-f4a0965a09db:autoScalingGroupName/nth-test1", + }, + Detail: []byte(`{ + "LifecycleActionToken": "524632c5-3333-d52d-3992-d9633ec24ed7", + "AutoScalingGroupName": "nth-test1", + "LifecycleHookName": "node-termination-handler-launch", + "EC2InstanceId": "i-0a68bf5ef13e21b52", + "LifecycleTransition": "autoscaling:EC2_INSTANCE_LAUNCHING" + }`), +} + var asgLifecycleEventFromSQS = sqsevent.LifecycleDetail{ LifecycleHookName: "test-nth-asg-to-sqs", RequestID: "3775fac9-93c3-7ead-8713-159816566000", @@ -352,7 +372,7 @@ func TestMonitor_DrainTasks(t *testing.T) { } func TestMonitor_DrainTasks_Delay(t *testing.T) { - msg, err := getSQSMessageFromEvent(asgLifecycleEvent) + msg, err := getSQSMessageFromEvent(asgLaunchLifecycleEvent) h.Ok(t, err) sqsMock := h.MockedSQS{ @@ -384,13 +404,12 @@ func TestMonitor_DrainTasks_Delay(t *testing.T) { err = sqsMonitor.Monitor() h.Ok(t, err) - t.Run(asgLifecycleEvent.DetailType, func(st *testing.T) { + t.Run(asgLaunchLifecycleEvent.DetailType, func(st *testing.T) { result := <-drainChan - h.Equals(st, monitor.ASGLifecycleKind, result.Kind) + h.Equals(st, monitor.ASGLaunchLifecycleKind, 
result.Kind) h.Equals(st, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(st, result.NodeName, dnsNodeName) h.Assert(st, result.PostDrainTask != nil, "PostDrainTask should have been set") - h.Assert(st, result.PreDrainTask != nil, "PreDrainTask should have been set") err := result.PostDrainTask(result, node.Node{}) h.Ok(st, err) h.Assert(st, hookCalled, "BeforeCompleteLifecycleAction hook not called") @@ -457,7 +476,7 @@ func TestMonitor_DrainTasks_Errors(t *testing.T) { } func TestMonitor_DrainTasksASGFailure(t *testing.T) { - msg, err := getSQSMessageFromEvent(asgLifecycleEvent) + msg, err := getSQSMessageFromEvent(asgLaunchLifecycleEvent) h.Ok(t, err) messages := []*sqs.Message{ &msg, @@ -492,11 +511,10 @@ func TestMonitor_DrainTasksASGFailure(t *testing.T) { select { case result := <-drainChan: - h.Equals(t, monitor.ASGLifecycleKind, result.Kind) + h.Equals(t, monitor.ASGLaunchLifecycleKind, result.Kind) h.Equals(t, sqsevent.SQSMonitorKind, result.Monitor) h.Equals(t, result.NodeName, dnsNodeName) h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") - h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") err = result.PostDrainTask(result, node.Node{}) h.Nok(t, err) default: From 8a9428172e437214d98f002884bbee1811b58f48 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 3 Jan 2024 16:09:34 -0600 Subject: [PATCH 25/27] Revise formatting for updated README --- README.md | 83 ++++++++++++-------------- test/e2e/asg-launch-lifecycle-sqs-test | 2 +- 2 files changed, 39 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 8121be62..0a906375 100644 --- a/README.md +++ b/README.md @@ -218,8 +218,9 @@ You'll need the following AWS infrastructure components: 1. Amazon Simple Queue Service (SQS) Queue 2. AutoScaling Group Termination Lifecycle Hook -3. Amazon EventBridge Rule -4. IAM Role for the aws-node-termination-handler Queue Processing Pods +3. Instance Tagging +4. Amazon EventBridge Rule +5. IAM Role for the aws-node-termination-handler Queue Processing Pods Optional AWS infrastructure components: 1. AutoScaling Group Launch Lifecycle Hook @@ -273,9 +274,7 @@ There are some caveats when using [server side encryption with SQS](https://docs #### 2. Create an ASG Termination Lifecycle Hook: -##### 2.1. Send Notification via EventBridge - -This will configure ASG to send termination notifications to EventBridge. +Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: ``` aws autoscaling put-lifecycle-hook \ @@ -286,9 +285,7 @@ aws autoscaling put-lifecycle-hook \ --heartbeat-timeout=300 ``` -##### 2.2. Send notifications directly to SQS - -This will configure ASG to send termination notifications directly to an SQS queue monitored by NTH. +If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: ``` aws autoscaling put-lifecycle-hook \ @@ -301,41 +298,7 @@ aws autoscaling put-lifecycle-hook \ --role-arn ``` -#### 3. Handle ASG Instance Launch Lifecycle Notifications (optional): - -If [Capacity Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-capacity-rebalancing.html) is configured for your ASG, a new instance will be launched before another's termination. 
The use of an ASG launch lifecycle hook, as configured below, can verify the new instance has successfully connected as a Kubernetes node. - -##### 3.1. Send Notification via EventBridge - -This will configure ASG to send launch notifications to EventBridge. - -``` -aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-launch-hook \ - --auto-scaling-group-name=my-k8s-asg \ - --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ - --default-result="ABANDON" \ - --heartbeat-timeout=300 -``` - -##### 3.2. Send notifications directly to SQS - -This will configure ASG to send launch notifications directly to an SQS queue monitored by NTH. - -``` -aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-launch-hook \ - --auto-scaling-group-name=my-k8s-asg \ - --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ - --default-result="ABANDON" \ - --heartbeat-timeout=300 \ - --notification-target-arn \ - --role-arn -``` - -When NTH receives a launch notification, it will periodically check for a node backed by the EC2 instance to join the cluster and for the node to have a status of 'ready.' Once a node becomes ready, NTH will complete the lifecycle hook, prompting the ASG to proceed with terminating the previous instance. If the lifecycle hook is not completed before the timeout, the ASG will take the default action. If the default action is 'ABANDON,' the new instance will be terminated, and the notification process will be repeated with another new instance. - -#### 4. Tag the Instances: +#### 3. Tag the Instances: By default the aws-node-termination-handler will only manage terminations for instances tagged with `key=aws-node-termination-handler/managed`. The value of the key does not matter. @@ -361,7 +324,7 @@ You can also control what resources NTH manages by adding the resource ARNs to y Take a look at the docs on how to [create rules that only manage certain ASGs](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html), and read about all the [supported ASG events](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-event-reference.html). -#### 5. Create Amazon EventBridge Rules +#### 4. Create Amazon EventBridge Rules You may skip this step if sending events from ASG to SQS directly. @@ -408,7 +371,7 @@ aws events put-targets --rule MyK8sScheduledChangeRule \ --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue" ``` -#### 6. Create an IAM Role for the Pods +#### 5. Create an IAM Role for the Pods There are many different ways to allow the aws-node-termination-handler pods to assume a role: @@ -439,6 +402,36 @@ IAM Policy for aws-node-termination-handler Deployment: } ``` +#### 1. Handle ASG Instance Launch Lifecycle Notifications (optional): + +NTH can monitor for new instances launched by an ASG and notify the ASG when the instance is available in the EKS cluster. + +NTH will need to receive notifications of new instance launches within the ASG. 
We can add a lifecycle hook to the ASG that will send instance launch notifications via EventBridge: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 +``` + +Alternatively, ASG can send the instance launch notification directly to an SQS Queue: + +``` +aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-launch-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_LAUNCHING \ + --default-result="ABANDON" \ + --heartbeat-timeout=300 \ + --notification-target-arn \ + --role-arn +``` + +When NTH receives a launch notification, it will periodically check for a node backed by the EC2 instance to join the cluster and for the node to have a status of 'ready.' Once a node becomes ready, NTH will complete the lifecycle hook, prompting the ASG to proceed with terminating the previous instance. If the lifecycle hook is not completed before the timeout, the ASG will take the default action. If the default action is 'ABANDON', the new instance will be terminated, and the notification process will be repeated with another new instance. + ### Installation #### Pod Security Admission diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 4472f015..2df04cc0 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -415,7 +415,7 @@ function test_launch_lifecycle { elapsed_time=$((current_time - start_time)) if [[ $elapsed_time -ge $heartbeat_timeout ]]; then echo "" - echo "โŒ Launch Lifecycle not Completed. Timeout Reached โŒ" + echo "โŒ Timeout Reached โŒ" exit 1 fi From cd91851a6478e5f4085f10f0f60f59fd56353115 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Fri, 5 Jan 2024 10:24:38 -0600 Subject: [PATCH 26/27] Revised ASG Launch Lifecycle Assertion test to adhere to shellcheck. 
Updated test timeouts to match Assertionscript standards --- test/e2e/asg-launch-lifecycle-sqs-test | 121 ++++++++++++------------- test/k8s-local-cluster-test/run-test | 2 +- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 2df04cc0..05f80e0d 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -6,15 +6,16 @@ sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" node_policy_name="nth-test-node-policy" auto_scaling_role_name="AWSServiceRoleForAutoScaling_nth-test" -auto_scaling_policy_arn="arn:aws:iam::aws:policy/aws-service-role/AutoScalingServiceRolePolicy" fis_role_name="nth-test-fis-role" fis_template_name="nth-fis-test" fis_policy_arn="arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -NODE_GROUP_CONFIG_FILE="$SCRIPTPATH/../eks-cluster-test/node_group-spec.yaml" account_id=$(aws sts get-caller-identity | jq -r '.Account') nth_label="Use-Case=NTH" heartbeat_timeout=$((3 * 60)) +LAUNCH_CHECK_CYCLES=15 +LAUNCH_ACTIVITY_CHECK_SLEEP=15 +LAUNCH_STATUS_CHECK_SLEEP=$((heartbeat_timeout / LAUNCH_CHECK_CYCLES)) ##### JSON FILES ##### @@ -212,18 +213,16 @@ function create_node_policy { function get_node_role_name { node_role_arn=$(aws eks describe-nodegroup --cluster-name $CLUSTER_NAME --nodegroup-name $node_group_name | jq -r .nodegroup.nodeRole) - split_node_role_arn=($(tr '/' ' ' <<< $node_role_arn)) - node_role_name=${split_node_role_arn[1]} + IFS="/" read -r -a node_role_arn_array <<< "$node_role_arn" + node_role_name=${node_role_arn_array[1]} } function set_node_data { instance_ids=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_name | jq -r '.AutoScalingGroups | .[0].Instances | .[].InstanceId') instance_data=$(aws ec2 describe-instances --instance-ids $instance_ids | jq -r '[.Reservations | .[].Instances | .[].InstanceId, .[].PrivateDnsName]') - nth_node_id=$(jq -r '.[0]' <<< $instance_data) nth_node_ip=$(jq -r '.[1]' <<< $instance_data) termination_node_id=$(jq -r '.[2]' <<< $instance_data) - termination_node_ip=$(jq -r '.[3]' <<< $instance_data) } function update_ASG { @@ -255,19 +254,20 @@ function update_ASG { } function create_auto_scaling_role { - auto_scaling_role_exists=$(aws iam get-role --role-name=$auto_scaling_role_name | grep "$auto_scaling_role_name" || :) + auto_scaling_role_exists=$(aws iam get-role --role-name=$auto_scaling_role_name 2> /dev/null | grep "$auto_scaling_role_name" || :) if [[ -z $auto_scaling_role_exists ]]; then echo "๐Ÿฅ‘ Creating Auto Scaling Role" auto_scaling_role_arn=$(aws iam create-service-linked-role --aws-service-name autoscaling.amazonaws.com --custom-suffix "nth-test" | jq -r '.Role.Arn') sleep 10 else echo "๐Ÿฅ‘ $auto_scaling_role_name already exists; continuing with test run" - auto_scaling_role_arn=$(aws iam get-role --role-name=$auto_scaling_role_name | jq -r '.Role.Arn') + auto_scaling_role_arn=$(aws iam get-role --role-name=$auto_scaling_role_name 2> /dev/null | jq -r '.Role.Arn') fi } ### HELM ### function install_helm { + get_aws_credentials anth_helm_args=( upgrade @@ -280,8 +280,8 @@ function install_helm { --set image.pullPolicy="Always" --set nodeSelector."${nth_label}" --set tolerations[0].operator=Exists - --set awsAccessKeyID=$(aws --profile default configure get aws_access_key_id) - --set awsSecretAccessKey=$(aws --profile default configure get aws_secret_access_key) + --set 
awsAccessKeyID="$aws_access_key_id" + --set awsSecretAccessKey="$aws_secret_access_key" --set awsRegion="${REGION}" --set checkTagBeforeDraining=false --set enableSqsTerminationDraining=true @@ -296,9 +296,24 @@ function install_helm { sleep 15 } +function get_aws_credentials { + echo "๐Ÿฅ‘ Retrieving AWS Credentials" + aws_access_key_id=$(aws --profile default configure get aws_access_key_id 2> /dev/null) + if [[ -z $aws_access_key_id ]]; then + echo "โŒ Failed to retrieve AWS Access Key โŒ" + exit 1 + fi + + aws_secret_access_key=$(aws --profile default configure get aws_secret_access_key 2> /dev/null) + if [[ -z $aws_access_key_id ]]; then + echo "โŒ Failed to retrieve AWS Secret Access Key โŒ" + exit 1 + fi +} + ### FIS ### function create_FIS_role { - fis_role_exists=$(aws iam get-role --role-name $fis_role_name | grep "$fis_role_name" || :) + fis_role_exists=$(aws iam get-role --role-name $fis_role_name 2> /dev/null | grep "$fis_role_name" || :) if [[ -z $fis_role_exists ]]; then echo "๐Ÿฅ‘ Creating FIS Role" fis_role_arn=$(aws iam create-role --role-name $fis_role_name --assume-role-policy-document file:///tmp/fis-role-trust-policy.json | jq -r '.Role.Arn') @@ -306,7 +321,7 @@ function create_FIS_role { sleep 10 else echo "๐Ÿฅ‘ $fis_role_name already exists; continuing with test run" - fis_role_arn=$(aws iam get-role --role-name=$fis_role_name | jq -r '.Role.Arn') + fis_role_arn=$(aws iam get-role --role-name=$fis_role_name 2> /dev/null | jq -r '.Role.Arn') fi } @@ -335,28 +350,21 @@ function start_FIS_experiment { create_experiment_template echo "๐Ÿฅ‘ Starting Experiment" experiment_start_time=$(date +%s) - experiment=$(aws fis start-experiment --experiment-template-id $template_id) + aws fis start-experiment --experiment-template-id $template_id > /dev/null } ##### TESTING ##### -function is_new_instance { - is_new="true" - if [[ $instance_ids =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then - is_new="false" - fi -} - function convert_date_to_epoch_seconds { IFS='T' read -r date_part time_part <<< "$1" IFS='-' read -r year month day <<< "$date_part" IFS=':' read -r hour minute second_fractional <<< "$time_part" - IFS='.' read -r second fraction <<< "$second_fractional" + IFS='.' 
read -r -a seconds_array <<< "$second_fractional" IFS=':' read -r offset_hours offset_minutes <<< "${time_part:16:5}" # Convert time strings to base-10 integers year=$((10#$year + 0)); month=$((10#$month + 0)); day=$((10#$day + 0)) - hour=$((10#$hour + 0)); minute=$((10#$minute + 0)); second=$((10#$second + 0)) + hour=$((10#$hour + 0)); minute=$((10#$minute + 0)); second=$((10#${seconds_array[0]} + 0)) offset_hours=$((10#$offset_hours + 0)); offset_minutes=$((10#$offset_minutes + 0)) if [[ $time_part =~ .*"-".* ]]; then @@ -365,60 +373,47 @@ function convert_date_to_epoch_seconds { fi total_days=$(((year - 1970) * 365 + (year - 1970)/4)) - for ((i = 1; i < month; i++)); do - total_days=$((total_days + $(cal $i $year | awk 'NF {DAYS = $NF} END {print DAYS}'))) + for ((k = 1; k < month; k++)); do + total_days=$((total_days + $(cal $k $year | awk 'NF {DAYS = $NF} END {print DAYS}'))) done total_days=$((total_days + day - 1)) total_seconds=$((total_days * 86400 + (hour + offset_hours) * 3600 + (minute + offset_minutes) * 60 + second)) } function get_launch_activity { - max_duration=$((5 * 60)) - start_time=$(date +%s) - + echo "๐Ÿฅ‘ Finding launch activity " launch_activity="" - while [[ -z $launch_activity ]]; do - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - if [[ $elapsed_time -ge $max_duration ]]; then - echo "โŒ Failed to find a new launched instance. Timeout Reached โŒ" - exit 1 - fi - - sleep 5 + for i in $(seq 1 $LAUNCH_CHECK_CYCLES); do activities=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name) - activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StatusCode, .StartTime]' <<< $activities) + activities_details=$(jq -r '[.Activities | .[] | .ActivityId, .Description, .StartTime]' <<< $activities) num_activities=$(jq -r 'length' <<< $activities_details) - for i in $(seq 0 4 $((--num_activities))); do - id=$(jq -r .[$i] <<< $activities_details) - description=$(jq -r .[$((++i))] <<< $activities_details) - status=$(jq -r .[$((i+=2))] <<< $activities_details) - start=$(jq -r .[$((i+=3))] <<< $activities_details) + for j in $(seq 0 3 $((--num_activities))); do + id=$(jq -r .[$j] <<< $activities_details) + description=$(jq -r .[$((++j))] <<< $activities_details) + start=$(jq -r .[$((j+=2))] <<< $activities_details) activity_instance=${description##*:} convert_date_to_epoch_seconds $start if [[ $description =~ .*"Launching".* && $total_seconds -gt $experiment_start_time ]]; then launch_activity=$id - echo "๐Ÿฅ‘ Launch Activity found for instance $activity_instance" - break + break 2 fi done + + echo "Setup Loop $i/$LAUNCH_CHECK_CYCLES, sleeping for $LAUNCH_ACTIVITY_CHECK_SLEEP seconds" + sleep $LAUNCH_ACTIVITY_CHECK_SLEEP done + + if [[ -n $launch_activity ]]; then + echo "โœ… Launch Activity found for instance $activity_instance" + else + echo "โŒ Failed to find a new launched instance โŒ" + exit 1 + fi } function test_launch_lifecycle { - aws sqs receive-message --queue-url $queue_url - echo -n "๐Ÿฅ‘ Waiting for launch hook completion." 
- - start_time=$(date +%s) - while [[ true ]]; do - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - if [[ $elapsed_time -ge $heartbeat_timeout ]]; then - echo "" - echo "โŒ Timeout Reached โŒ" - exit 1 - fi - + echo "๐Ÿฅ‘ Verifying launch hook completion " + for i in $(seq 1 $LAUNCH_CHECK_CYCLES); do activity_status=$(aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_name --activity-ids $launch_activity | jq -r '.Activities | .[].StatusCode') if [[ $activity_status == "Successful" ]]; then echo "" @@ -429,9 +424,13 @@ function test_launch_lifecycle { echo "โŒ Launch Lifecycle $activity_status โŒ" exit 1 fi - echo -n "." - sleep 10 + + echo "Assertion Loop $i/$LAUNCH_CHECK_CYCLES, sleeping for $LAUNCH_STATUS_CHECK_SLEEP seconds" + sleep $LAUNCH_STATUS_CHECK_SLEEP done + + echo "โŒ Failed to verify launch hook completion โŒ" + exit 1 } @@ -457,14 +456,14 @@ function clean_up { fi if [[ -n $template_id ]]; then echo "๐Ÿฅ‘ Deleting FIS experiment template" - deletedTemplate=$(aws fis delete-experiment-template --id $template_id --no-paginate) + aws fis delete-experiment-template --id $template_id --no-paginate > /dev/null fi echo "๐Ÿฅ‘ Detaching FIS role policy" aws iam detach-role-policy --role-name $fis_role_name --policy-arn $fis_policy_arn echo "๐Ÿฅ‘ Deleting FIS role" aws iam delete-role --role-name $fis_role_name echo "๐Ÿฅ‘ Deleting autoscaling role" - aws iam delete-service-linked-role --role-name $auto_scaling_role_name + aws iam delete-service-linked-role --role-name $auto_scaling_role_name > /dev/null if [[ -n $node_policy_arn ]]; then echo "๐Ÿฅ‘ Deleting Node role policy" aws iam delete-policy --policy-arn $node_policy_arn diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index 34385364..be1e243c 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -275,7 +275,7 @@ kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\ kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite function is_denylisted { - if [[ $SCRIPT_DENYLIST =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then + if [[ ${SCRIPT_DENYLIST[*]} =~ (^|[[:space:]])$1($|[[:space:]]) ]]; then return 0 fi return 1 From a8197a4efad2d837b3eb1b93bd403170011b719c Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 18 Jan 2024 11:58:19 -0600 Subject: [PATCH 27/27] Update E22 EKS cluster test with ASG test script --- test/e2e/asg-launch-lifecycle-sqs-test | 2 +- test/eks-cluster-test/run-test | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/e2e/asg-launch-lifecycle-sqs-test b/test/e2e/asg-launch-lifecycle-sqs-test index 05f80e0d..42f21c98 100755 --- a/test/e2e/asg-launch-lifecycle-sqs-test +++ b/test/e2e/asg-launch-lifecycle-sqs-test @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -node_group_name="spot-ng" +node_group_name="linux-ng" sqs_queue_name="nth-sqs-test" sns_topic_name="nth-sns-test" node_policy_name="nth-test-node-policy" diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test index beebf0dd..949f68d1 100755 --- a/test/eks-cluster-test/run-test +++ b/test/eks-cluster-test/run-test @@ -194,7 +194,6 @@ function reset_cluster { if [[ -z ${assertion_scripts+x} ]]; then assertion_scripts=( - "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" "$SCRIPTPATH/../e2e/cordon-only-test" "$SCRIPTPATH/../e2e/imds-v2-test" "$SCRIPTPATH/../e2e/maintenance-event-cancellation-test" @@ -206,6 +205,8 @@ if 
[[ -z ${assertion_scripts+x} ]]; then #"$SCRIPTPATH/../e2e/webhook-http-proxy-test" #"$SCRIPTPATH/../e2e/webhook-secret-test" "$SCRIPTPATH/../e2e/webhook-test" + # This test terminates nodes in the cluster and needs to be run last + "$SCRIPTPATH/../e2e/asg-launch-lifecycle-sqs-test" ) fi
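A side note on the e2e script's timestamp handling: the `convert_date_to_epoch_seconds` helper above parses the ISO-8601 `StartTime` of a scaling activity by hand. Where GNU coreutils `date` is available (an assumption; the BSD `date` shipped with macOS lacks `-d`), the same conversion is a single call. The sketch below uses a sample timestamp taken from the EventBridge fixtures in this change set and is an alternative shown for illustration only, not a proposed edit to the test.

```
#!/bin/bash
# Sketch: convert an ISO-8601 timestamp (e.g. the StartTime returned by
# `aws autoscaling describe-scaling-activities`) to epoch seconds with GNU date.
# GNU date also accepts fractional seconds and numeric offsets such as -06:00.
start_time="2020-07-01T22:30:58Z"   # sample timestamp from the fixtures above
total_seconds=$(date -d "$start_time" +%s)
echo "$total_seconds"               # prints 1593642658
```

Hand-rolling the parse keeps the test script free of a GNU-specific dependency, which is presumably why it is done manually here.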