
Commit 936dcd2

Redesign node monitoring to account for Node deletion
1. Split node monitoring into two reconcilers: one that monitors Nodes and one that monitors and updates the designated slack ClusterQueue.
2. Remove entries from the in-memory caches when a Node is deleted.
3. Watch the slack ClusterQueue so that changes to its nominalQuotas are detected and its lendingLimits adjusted accordingly.

Fixes #252.
1 parent 4b282d0 commit 936dcd2
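
The glue between the two reconcilers is a shared event channel: the Node reconciler sends a generic "*/*" event, and the ClusterQueue reconciler treats it as a request to recompute lending limits. The SlackClusterQueueMonitor and the manager wiring live in files of this commit that are not rendered on this page, so the snippet below is only a minimal, hypothetical sketch of how such wiring could look; it assumes the controller-runtime v0.18+ `source.Channel` API and reuses the field names visible in the diffs below.

```go
// Hypothetical wiring sketch -- not code from this commit. Assumes
// controller-runtime v0.18+ and the NodeHealthMonitor / SlackClusterQueueMonitor
// types referenced in the diffs below.
package appwrapper

import (
	v1 "k8s.io/api/core/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/source"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"

	"github.com/project-codeflare/appwrapper/pkg/config"
)

func setupMonitors(mgr ctrl.Manager, cfg *config.AppWrapperConfig) error {
	// A buffer of one lets triggerDispatch coalesce bursts of Node changes:
	// its non-blocking send is simply dropped when a dispatch event is already pending.
	events := make(chan event.GenericEvent, 1)

	// Reconciler 1: watches Nodes and maintains the in-memory caches.
	if err := ctrl.NewControllerManagedBy(mgr).
		For(&v1.Node{}).
		Complete(&NodeHealthMonitor{Client: mgr.GetClient(), Config: cfg, Events: events}); err != nil {
		return err
	}

	// Reconciler 2: watches the slack ClusterQueue directly and also receives
	// the generic "*/*" events sent by NodeHealthMonitor.triggerDispatch.
	return ctrl.NewControllerManagedBy(mgr).
		For(&kueue.ClusterQueue{}).
		WatchesRawSource(source.Channel(events, &handler.EnqueueRequestForObject{})).
		Complete(&SlackClusterQueueMonitor{Client: mgr.GetClient(), Config: cfg, Events: events})
}
```

The buffered capacity of 1 (also used in the test below) is what makes the non-blocking send in `triggerDispatch` behave as a coalescing trigger rather than a queue.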

5 files changed: +247 -95 lines changed


internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 2 additions & 2 deletions
@@ -559,7 +559,7 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b
 			if pod.DeletionTimestamp.IsZero() {
 				summary.running += 1
 				if checkNoExecuteNodes {
-					noExecuteNodesMutex.RLock() // BEGIN CRITICAL SECTION
+					nodeInfoMutex.RLock() // BEGIN CRITICAL SECTION
 					if len(noExecuteNodes) > 0 {
 						if resources, ok := noExecuteNodes[pod.Spec.NodeName]; ok {
 							for badResource := range resources {
@@ -584,7 +584,7 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b
 							}
 						}
 					}
-					noExecuteNodesMutex.RUnlock() // END CRITICAL SECTION
+					nodeInfoMutex.RUnlock() // END CRITICAL SECTION
 				}
 			}
 		case v1.PodSucceeded:

internal/controller/appwrapper/node_health_monitor.go

Lines changed: 80 additions & 93 deletions
@@ -18,20 +18,20 @@ package appwrapper

 import (
 	"context"
+	"maps"
 	"sync"

 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
-	"k8s.io/apimachinery/pkg/types"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/utils/ptr"

 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/event"
 	"sigs.k8s.io/controller-runtime/pkg/handler"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"

 	"github.com/project-codeflare/appwrapper/pkg/config"
 )
@@ -44,51 +44,77 @@ import (
 type NodeHealthMonitor struct {
 	client.Client
 	Config *config.AppWrapperConfig
+	Events chan event.GenericEvent // event channel for NodeHealthMonitor to trigger SlackClusterQueueMonitor
 }

 var (
-	// noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExeucte taint
-	noExecuteNodes = make(map[string]sets.Set[string])
-	noExecuteNodesMutex sync.RWMutex
+	// nodeInfoMutex synchronizes writes by NodeHealthMonitor with reads from AppWrapperReconciler and SlackClusterQueueMonitor
+	nodeInfoMutex sync.RWMutex

-	// noScheduleNodes is a mapping from Node names to resource quantities that are unschedulable.
+	// noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExecute taint
+	noExecuteNodes = make(map[string]sets.Set[string])
+
+	// noScheduleNodes is a mapping from Node names to ResourceLists of unschedulable resources.
 	// A resource may be unscheduable either because:
 	// (a) the Node is cordoned (node.Spec.Unschedulable is true) or
-	// (b) Autopilot has labeled the with either a NoExecute or NoSchedule taint.
-	noScheduleNodes = make(map[string]map[string]*resource.Quantity)
+	// (b) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
+	noScheduleNodes = make(map[string]v1.ResourceList)
 )

 // permission to watch nodes
 //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
-//+kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues,verbs=get;list;watch;update;patch

 func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	node := &v1.Node{}
 	if err := r.Get(ctx, req.NamespacedName, node); err != nil {
-		return ctrl.Result{}, nil
+		if errors.IsNotFound(err) {
+			r.updateForNodeDeletion(ctx, req.Name)
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
 	}

-	r.updateNoExecuteNodes(ctx, node)
-
-	// If there is a slack ClusterQueue, update its lending limits
-
-	if r.Config.SlackQueueName == "" {
-		return ctrl.Result{}, nil
+	if node.DeletionTimestamp.IsZero() {
+		r.updateNoExecuteNodes(ctx, node)
+		r.updateNoScheduleNodes(ctx, node)
+	} else {
+		r.updateForNodeDeletion(ctx, req.Name)
 	}

-	cq := &kueue.ClusterQueue{}
-	if err := r.Get(ctx, types.NamespacedName{Name: r.Config.SlackQueueName}, cq); err != nil {
-		if errors.IsNotFound(err) {
-			return ctrl.Result{}, nil // give up if slack quota is not defined
+	return ctrl.Result{}, nil
+}
+
+// Trigger dispatch by means of "*/*" request
+func (r *NodeHealthMonitor) triggerDispatch() {
+	if r.Config.SlackQueueName != "" {
+		select {
+		case r.Events <- event.GenericEvent{Object: &metav1.PartialObjectMetadata{ObjectMeta: metav1.ObjectMeta{Namespace: "*", Name: "*"}}}:
+		default:
+			// do not block if event is already in channel
 		}
-		return ctrl.Result{}, err
 	}
+}

-	r.updateNoScheduleNodes(ctx, cq, node)
-
-	return r.updateLendingLimits(ctx, cq)
+func (r *NodeHealthMonitor) updateForNodeDeletion(ctx context.Context, name string) {
+	if _, ok := noExecuteNodes[name]; ok {
+		nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+		delete(noExecuteNodes, name)
+		nodeInfoMutex.Unlock() // END CRITICAL SECTION
+		r.triggerDispatch()
+		log.FromContext(ctx).Info("Updated node NoExecute information for Node deletion",
+			"Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
+	}
+	if _, ok := noScheduleNodes[name]; ok {
+		nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+		delete(noScheduleNodes, name)
+		nodeInfoMutex.Unlock() // END CRITICAL SECTION
+		r.triggerDispatch()
+		log.FromContext(ctx).Info("Updated node NoSchedule information for Node deletion",
+			"Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
+	}
 }

+// update noExecuteNodes entry for this node
 func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.Node) {
 	noExecuteResources := make(sets.Set[string])
 	for key, value := range node.GetLabels() {
@@ -102,7 +128,7 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
 	}

 	noExecuteNodesChanged := false
-	noExecuteNodesMutex.Lock() // BEGIN CRITICAL SECTION
+	nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
 	if priorEntry, ok := noExecuteNodes[node.GetName()]; ok {
 		if len(noExecuteResources) == 0 {
 			delete(noExecuteNodes, node.GetName())
@@ -115,95 +141,56 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
 		noExecuteNodes[node.GetName()] = noExecuteResources
 		noExecuteNodesChanged = true
 	}
-	noExecuteNodesMutex.Unlock() // END CRITICAL SECTION
+	nodeInfoMutex.Unlock() // END CRITICAL SECTION

-	// Safe to log outside the mutex because because this method is the only writer of noExecuteNodes
-	// and the controller runtime is configured to not allow concurrent execution of this controller.
 	if noExecuteNodesChanged {
+		r.triggerDispatch()
 		log.FromContext(ctx).Info("Updated node NoExecute information", "Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
 	}
 }

-func (r *NodeHealthMonitor) updateNoScheduleNodes(_ context.Context, cq *kueue.ClusterQueue, node *v1.Node) {
-	// update unschedulable resource quantities for this node
-	noScheduleQuantities := make(map[string]*resource.Quantity)
+// update noScheduleNodes entry for this node
+func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.Node) {
+	var noScheduleResources v1.ResourceList
 	if node.Spec.Unschedulable {
-		// add all non-pod resources covered by cq if the node is cordoned
-		for _, resourceName := range cq.Spec.ResourceGroups[0].Flavors[0].Resources {
-			if string(resourceName.Name) != "pods" {
-				noScheduleQuantities[string(resourceName.Name)] = node.Status.Capacity.Name(resourceName.Name, resource.DecimalSI)
-			}
-		}
+		noScheduleResources = node.Status.Capacity.DeepCopy()
+		delete(noScheduleResources, v1.ResourcePods)
 	} else {
+		noScheduleResources = make(v1.ResourceList)
 		for key, value := range node.GetLabels() {
 			for resourceName, taints := range r.Config.Autopilot.ResourceTaints {
 				for _, taint := range taints {
 					if key == taint.Key && value == taint.Value {
-						noScheduleQuantities[resourceName] = node.Status.Capacity.Name(v1.ResourceName(resourceName), resource.DecimalSI)
+						quantity := node.Status.Capacity.Name(v1.ResourceName(resourceName), resource.DecimalSI)
+						if !quantity.IsZero() {
+							noScheduleResources[v1.ResourceName(resourceName)] = *quantity
+						}
 					}
 				}
 			}
 		}
 	}

-	if len(noScheduleQuantities) > 0 {
-		noScheduleNodes[node.GetName()] = noScheduleQuantities
-	} else {
-		delete(noScheduleNodes, node.GetName())
-	}
-}
-
-func (r *NodeHealthMonitor) updateLendingLimits(ctx context.Context, cq *kueue.ClusterQueue) (ctrl.Result, error) {
-
-	// compute unschedulable resource totals
-	unschedulableQuantities := map[string]*resource.Quantity{}
-	for _, quantities := range noScheduleNodes {
-		for resourceName, quantity := range quantities {
-			if !quantity.IsZero() {
-				if unschedulableQuantities[resourceName] == nil {
-					unschedulableQuantities[resourceName] = ptr.To(*quantity)
-				} else {
-					unschedulableQuantities[resourceName].Add(*quantity)
-				}
-			}
+	noScheduleNodesChanged := false
+	nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+	if priorEntry, ok := noScheduleNodes[node.GetName()]; ok {
+		if len(noScheduleResources) == 0 {
+			delete(noScheduleNodes, node.GetName())
+			noScheduleNodesChanged = true
+		} else if !maps.Equal(priorEntry, noScheduleResources) {
+			noScheduleNodes[node.GetName()] = noScheduleResources
+			noScheduleNodesChanged = true
 		}
+	} else if len(noScheduleResources) > 0 {
+		noScheduleNodes[node.GetName()] = noScheduleResources
+		noScheduleNodesChanged = true
 	}
+	nodeInfoMutex.Unlock() // END CRITICAL SECTION

-	// enforce lending limits on 1st flavor of 1st resource group
-	resources := cq.Spec.ResourceGroups[0].Flavors[0].Resources
-	limitsChanged := false
-	for i, quota := range resources {
-		var lendingLimit *resource.Quantity
-		if unschedulableQuantity := unschedulableQuantities[quota.Name.String()]; unschedulableQuantity != nil {
-			if quota.NominalQuota.Cmp(*unschedulableQuantity) > 0 {
-				lendingLimit = ptr.To(quota.NominalQuota)
-				lendingLimit.Sub(*unschedulableQuantity)
-			} else {
-				lendingLimit = resource.NewQuantity(0, resource.DecimalSI)
-			}
-		}
-		if quota.LendingLimit == nil && lendingLimit != nil ||
-			quota.LendingLimit != nil && lendingLimit == nil ||
-			quota.LendingLimit != nil && lendingLimit != nil && quota.LendingLimit.Cmp(*lendingLimit) != 0 {
-			limitsChanged = true
-			resources[i].LendingLimit = lendingLimit
-		}
-	}
-
-	// update lending limits
-	if limitsChanged {
-		err := r.Update(ctx, cq)
-		if err == nil {
-			log.FromContext(ctx).Info("Updated lending limits", "Resources", resources)
-			return ctrl.Result{}, nil
-		} else if errors.IsConflict(err) {
-			return ctrl.Result{Requeue: true}, nil
-		} else {
-			return ctrl.Result{}, err
-		}
+	if noScheduleNodesChanged {
+		r.triggerDispatch()
+		log.FromContext(ctx).Info("Updated node NoSchedule information", "Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
 	}
-
-	return ctrl.Result{}, nil
 }

 // SetupWithManager sets up the controller with the Manager.
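
The `updateLendingLimits` logic removed above moves into the new SlackClusterQueueMonitor, whose source is not rendered on this page. Purely as an illustration of the same arithmetic recast over the new `map[string]v1.ResourceList` cache, here is a hedged sketch (not the commit's actual code); the helper names `unschedulableTotals` and `lendingLimit` are hypothetical.

```go
// Illustrative sketch only -- the real computation lives in the new
// SlackClusterQueueMonitor, which is not shown in this diff. Assumes the
// appwrapper package context: the noScheduleNodes and nodeInfoMutex globals
// declared above, v1 = k8s.io/api/core/v1, resource = k8s.io/apimachinery/pkg/api/resource.

// unschedulableTotals sums the per-node unschedulable quantities across all Nodes.
func unschedulableTotals() v1.ResourceList {
	totals := v1.ResourceList{}
	nodeInfoMutex.RLock() // BEGIN CRITICAL SECTION
	for _, resources := range noScheduleNodes {
		for name, quantity := range resources {
			if current, ok := totals[name]; ok {
				current.Add(quantity)
				totals[name] = current
			} else {
				totals[name] = quantity.DeepCopy()
			}
		}
	}
	nodeInfoMutex.RUnlock() // END CRITICAL SECTION
	return totals
}

// lendingLimit mirrors the arithmetic of the removed code: nominal quota minus
// the unschedulable total, clamped at zero.
func lendingLimit(nominal, unschedulable resource.Quantity) *resource.Quantity {
	if nominal.Cmp(unschedulable) > 0 {
		limit := nominal.DeepCopy()
		limit.Sub(unschedulable)
		return &limit
	}
	return resource.NewQuantity(0, resource.DecimalSI)
}
```

Storing `v1.ResourceList` values lets the totals be accumulated with `Quantity.Add` on value copies instead of the pointer bookkeeping the removed code did with `ptr.To`.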

internal/controller/appwrapper/node_health_monitor_test.go

Lines changed: 30 additions & 0 deletions
@@ -24,14 +24,17 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/event"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 )

 var _ = Describe("NodeMonitor Controller", func() {
 	var slackQueueName = "fake-queue"
 	var node1Name = types.NamespacedName{Name: "fake-node-1"}
 	var node2Name = types.NamespacedName{Name: "fake-node-2"}
+	var dispatch = types.NamespacedName{Name: "*"}
 	var nodeMonitor *NodeHealthMonitor
+	var cqMonitor *SlackClusterQueueMonitor
 	nodeGPUs := v1.ResourceList{v1.ResourceName("nvidia.com/gpu"): resource.MustParse("4")}

 	BeforeEach(func() {
@@ -49,9 +52,16 @@
 		// Create reconciller
 		awConfig := config.NewAppWrapperConfig()
 		awConfig.SlackQueueName = slackQueueName
+		conduit := make(chan event.GenericEvent, 1)
 		nodeMonitor = &NodeHealthMonitor{
 			Client: k8sClient,
 			Config: awConfig,
+			Events: conduit,
+		}
+		cqMonitor = &SlackClusterQueueMonitor{
+			Client: k8sClient,
+			Config: awConfig,
+			Events: conduit,
 		}
 	})

@@ -124,6 +134,8 @@
 		Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
 		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
 		Expect(err).NotTo(HaveOccurred())
+		_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
+		Expect(err).NotTo(HaveOccurred())

 		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
 		Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))
@@ -134,6 +146,8 @@
 		Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
 		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
 		Expect(err).NotTo(HaveOccurred())
+		_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
+		Expect(err).NotTo(HaveOccurred())

 		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
 		Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
@@ -144,6 +158,8 @@
 		Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
 		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
 		Expect(err).NotTo(HaveOccurred())
+		_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
+		Expect(err).NotTo(HaveOccurred())

 		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
 		Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
@@ -154,6 +170,8 @@
 		Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
 		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
 		Expect(err).NotTo(HaveOccurred())
+		_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
+		Expect(err).NotTo(HaveOccurred())

 		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
 		Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())
@@ -164,10 +182,22 @@
 		Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
 		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
 		Expect(err).NotTo(HaveOccurred())
+		_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
+		Expect(err).NotTo(HaveOccurred())

 		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
 		Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))

+		// Increase the slack cluster queue's quota by 2 and expect LendingLimit to increase by 2 to become 4
+		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
+		queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].NominalQuota = resource.MustParse("8")
+		Expect(k8sClient.Update(ctx, queue)).Should(Succeed())
+		_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: slackQueueName}})
+		Expect(err).NotTo(HaveOccurred())
+
+		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
+		Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(4)))
+
 		Expect(k8sClient.Delete(ctx, queue)).To(Succeed())
 	})
 })
