@@ -18,20 +18,20 @@ package appwrapper
 
 import (
 	"context"
+	"maps"
 	"sync"
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
-	"k8s.io/apimachinery/pkg/types"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/utils/ptr"
 
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/event"
 	"sigs.k8s.io/controller-runtime/pkg/handler"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	"github.com/project-codeflare/appwrapper/pkg/config"
 )
@@ -44,51 +44,82 @@ import (
 type NodeHealthMonitor struct {
 	client.Client
 	Config *config.AppWrapperConfig
+	Events chan event.GenericEvent // event channel for NodeHealthMonitor to trigger SlackClusterQueueMonitor
 }
 
 var (
-	// noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExeucte taint
-	noExecuteNodes = make(map[string]sets.Set[string])
-	noExecuteNodesMutex sync.RWMutex
+	// nodeInfoMutex synchronizes writes by NodeHealthMonitor with reads from AppWrapperReconciler and SlackClusterQueueMonitor
+	nodeInfoMutex sync.RWMutex
 
-	// noScheduleNodes is a mapping from Node names to resource quantities that are unschedulable.
+	// noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExecute taint
+	noExecuteNodes = make(map[string]sets.Set[string])
+
+	// noScheduleNodes is a mapping from Node names to ResourceLists of unschedulable resources.
 	// A resource may be unschedulable either because:
 	// (a) the Node is cordoned (node.Spec.Unschedulable is true) or
-	// (b) Autopilot has labeled the with either a NoExecute or NoSchedule taint.
-	noScheduleNodes = make(map[string]map[string]*resource.Quantity)
+	// (b) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
+	noScheduleNodes = make(map[string]v1.ResourceList)
+)
+
+const (
+	dispatchEventName = "*trigger*"
 )
 
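The three package-level variables form one shared state: nodeInfoMutex is the single lock, NodeHealthMonitor is the single writer, and AppWrapperReconciler and SlackClusterQueueMonitor read under the read lock. A minimal sketch of the reader side, assuming a hypothetical helper (nodeHasNoExecuteResource is illustrative, not part of this PR):

// Hypothetical reader-side helper: consult the shared map under RLock.
// NodeHealthMonitor is the sole writer, so readers never block each other.
func nodeHasNoExecuteResource(nodeName string, resourceName string) bool {
	nodeInfoMutex.RLock()         // BEGIN READ-ONLY CRITICAL SECTION
	defer nodeInfoMutex.RUnlock() // END READ-ONLY CRITICAL SECTION
	resources, ok := noExecuteNodes[nodeName]
	return ok && resources.Has(resourceName)
}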
 // permission to watch nodes
 //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
-//+kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues,verbs=get;list;watch;update;patch
 
 func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	node := &v1.Node{}
 	if err := r.Get(ctx, req.NamespacedName, node); err != nil {
-		return ctrl.Result{}, nil
+		if errors.IsNotFound(err) {
+			r.updateForNodeDeletion(ctx, req.Name)
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
 	}
 
-	r.updateNoExecuteNodes(ctx, node)
-
-	// If there is a slack ClusterQueue, update its lending limits
-
-	if r.Config.SlackQueueName == "" {
-		return ctrl.Result{}, nil
+	if node.DeletionTimestamp.IsZero() {
+		r.updateNoExecuteNodes(ctx, node)
+		r.updateNoScheduleNodes(ctx, node)
+	} else {
+		r.updateForNodeDeletion(ctx, req.Name)
 	}
 
-	cq := &kueue.ClusterQueue{}
-	if err := r.Get(ctx, types.NamespacedName{Name: r.Config.SlackQueueName}, cq); err != nil {
-		if errors.IsNotFound(err) {
-			return ctrl.Result{}, nil // give up if slack quota is not defined
+	return ctrl.Result{}, nil
+}
+
+func (r *NodeHealthMonitor) triggerSlackCQMonitor() {
+	if r.Config.SlackQueueName != "" {
+		select {
+		// Trigger dispatch by sending a request named dispatchEventName ("*trigger*")
+		case r.Events <- event.GenericEvent{Object: &metav1.PartialObjectMetadata{ObjectMeta: metav1.ObjectMeta{Name: dispatchEventName}}}:
+		default:
+			// do not block if event is already in channel
 		}
-		return ctrl.Result{}, err
 	}
+}
 
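The select with an empty default makes the trigger non-blocking: if SlackClusterQueueMonitor has not yet drained the previous event, the new one is simply dropped, which is safe because the monitor recomputes from the shared maps rather than from event payloads. This coalescing only works if Events is a buffered channel (capacity 1 suffices); presumably it is wired to the monitor as a controller-runtime channel source during manager setup, though that wiring is outside this hunk. A self-contained sketch of the coalescing behavior:

package main

import "fmt"

func main() {
	// A capacity-1 channel plus select/default collapses any burst of
	// triggers into at most one pending wakeup for the consumer.
	events := make(chan struct{}, 1)
	trigger := func() {
		select {
		case events <- struct{}{}: // no wakeup pending: enqueue one
		default: // wakeup already pending: drop this trigger
		}
	}
	for i := 0; i < 100; i++ {
		trigger()
	}
	fmt.Println(len(events)) // prints 1: 100 triggers coalesced into one
}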
-	r.updateNoScheduleNodes(ctx, cq, node)
-
-	return r.updateLendingLimits(ctx, cq)
+// update for the deletion of nodeName
+func (r *NodeHealthMonitor) updateForNodeDeletion(ctx context.Context, nodeName string) {
+	if _, ok := noExecuteNodes[nodeName]; ok {
+		nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+		delete(noExecuteNodes, nodeName)
+		nodeInfoMutex.Unlock() // END CRITICAL SECTION
+		r.triggerSlackCQMonitor()
+		log.FromContext(ctx).Info("Updated NoExecute information due to Node deletion",
+			"Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
+	}
+	if _, ok := noScheduleNodes[nodeName]; ok {
+		nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+		delete(noScheduleNodes, nodeName)
+		nodeInfoMutex.Unlock() // END CRITICAL SECTION
+		r.triggerSlackCQMonitor()
+		log.FromContext(ctx).Info("Updated NoSchedule information due to Node deletion",
+			"Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
+	}
 }
 
+// update noExecuteNodes entry for node
 func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.Node) {
 	noExecuteResources := make(sets.Set[string])
 	for key, value := range node.GetLabels() {
@@ -102,7 +133,7 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
 	}
 
 	noExecuteNodesChanged := false
-	noExecuteNodesMutex.Lock() // BEGIN CRITICAL SECTION
+	nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
 	if priorEntry, ok := noExecuteNodes[node.GetName()]; ok {
 		if len(noExecuteResources) == 0 {
 			delete(noExecuteNodes, node.GetName())
@@ -115,95 +146,56 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
 		noExecuteNodes[node.GetName()] = noExecuteResources
 		noExecuteNodesChanged = true
 	}
-	noExecuteNodesMutex.Unlock() // END CRITICAL SECTION
 
-	// Safe to log outside the mutex because this method is the only writer of noExecuteNodes
-	// and the controller runtime is configured to not allow concurrent execution of this controller.
+	nodeInfoMutex.Unlock() // END CRITICAL SECTION
+
 	if noExecuteNodesChanged {
-		log.FromContext(ctx).Info("Updated node NoExecute information", "Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
+		r.triggerSlackCQMonitor()
+		log.FromContext(ctx).Info("Updated NoExecute information", "Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
 	}
 }
 
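updateNoExecuteNodes above and updateNoScheduleNodes below follow the same discipline: build the candidate entry outside the lock, reconcile it against the prior entry inside the lock, and signal only on a real change so an unchanged Node produces no event traffic. A condensed, hypothetical generic sketch of that pattern (updateEntry is illustrative, not part of this PR):

// Hypothetical distillation of the compare-swap-signal pattern shared by
// both updaters; empty and equal abstract over the two map value types.
func updateEntry[V any](m map[string]V, name string, next V,
	empty func(V) bool, equal func(V, V) bool, signal func()) {
	changed := false
	nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
	if prior, ok := m[name]; ok {
		if empty(next) {
			delete(m, name) // entry cleared
			changed = true
		} else if !equal(prior, next) {
			m[name] = next // entry updated
			changed = true
		}
	} else if !empty(next) {
		m[name] = next // entry created
		changed = true
	}
	nodeInfoMutex.Unlock() // END CRITICAL SECTION
	if changed {
		signal() // e.g. r.triggerSlackCQMonitor()
	}
}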
-func (r *NodeHealthMonitor) updateNoScheduleNodes(_ context.Context, cq *kueue.ClusterQueue, node *v1.Node) {
-	// update unschedulable resource quantities for this node
-	noScheduleQuantities := make(map[string]*resource.Quantity)
+// update noScheduleNodes entry for node
+func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.Node) {
+	var noScheduleResources v1.ResourceList
 	if node.Spec.Unschedulable {
-		// add all non-pod resources covered by cq if the node is cordoned
-		for _, resourceName := range cq.Spec.ResourceGroups[0].Flavors[0].Resources {
-			if string(resourceName.Name) != "pods" {
-				noScheduleQuantities[string(resourceName.Name)] = node.Status.Capacity.Name(resourceName.Name, resource.DecimalSI)
-			}
-		}
+		noScheduleResources = node.Status.Capacity.DeepCopy()
+		delete(noScheduleResources, v1.ResourcePods)
 	} else {
+		noScheduleResources = make(v1.ResourceList)
 		for key, value := range node.GetLabels() {
 			for resourceName, taints := range r.Config.Autopilot.ResourceTaints {
 				for _, taint := range taints {
 					if key == taint.Key && value == taint.Value {
-						noScheduleQuantities[resourceName] = node.Status.Capacity.Name(v1.ResourceName(resourceName), resource.DecimalSI)
+						quantity := node.Status.Capacity.Name(v1.ResourceName(resourceName), resource.DecimalSI)
+						if !quantity.IsZero() {
+							noScheduleResources[v1.ResourceName(resourceName)] = *quantity
+						}
 					}
 				}
 			}
 		}
 	}
 
-	if len(noScheduleQuantities) > 0 {
-		noScheduleNodes[node.GetName()] = noScheduleQuantities
-	} else {
-		delete(noScheduleNodes, node.GetName())
-	}
-}
-
-func (r *NodeHealthMonitor) updateLendingLimits(ctx context.Context, cq *kueue.ClusterQueue) (ctrl.Result, error) {
-
-	// compute unschedulable resource totals
-	unschedulableQuantities := map[string]*resource.Quantity{}
-	for _, quantities := range noScheduleNodes {
-		for resourceName, quantity := range quantities {
-			if !quantity.IsZero() {
-				if unschedulableQuantities[resourceName] == nil {
-					unschedulableQuantities[resourceName] = ptr.To(*quantity)
-				} else {
-					unschedulableQuantities[resourceName].Add(*quantity)
-				}
-			}
-		}
-	}
-
-	// enforce lending limits on 1st flavor of 1st resource group
-	resources := cq.Spec.ResourceGroups[0].Flavors[0].Resources
-	limitsChanged := false
-	for i, quota := range resources {
-		var lendingLimit *resource.Quantity
-		if unschedulableQuantity := unschedulableQuantities[quota.Name.String()]; unschedulableQuantity != nil {
-			if quota.NominalQuota.Cmp(*unschedulableQuantity) > 0 {
-				lendingLimit = ptr.To(quota.NominalQuota)
-				lendingLimit.Sub(*unschedulableQuantity)
-			} else {
-				lendingLimit = resource.NewQuantity(0, resource.DecimalSI)
-			}
-		}
-		if quota.LendingLimit == nil && lendingLimit != nil ||
-			quota.LendingLimit != nil && lendingLimit == nil ||
-			quota.LendingLimit != nil && lendingLimit != nil && quota.LendingLimit.Cmp(*lendingLimit) != 0 {
-			limitsChanged = true
-			resources[i].LendingLimit = lendingLimit
+	noScheduleNodesChanged := false
+	nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+	if priorEntry, ok := noScheduleNodes[node.GetName()]; ok {
+		if len(noScheduleResources) == 0 {
+			delete(noScheduleNodes, node.GetName())
+			noScheduleNodesChanged = true
+		} else if !maps.Equal(priorEntry, noScheduleResources) {
+			noScheduleNodes[node.GetName()] = noScheduleResources
+			noScheduleNodesChanged = true
 		}
+	} else if len(noScheduleResources) > 0 {
+		noScheduleNodes[node.GetName()] = noScheduleResources
+		noScheduleNodesChanged = true
 	}
+	nodeInfoMutex.Unlock() // END CRITICAL SECTION
 
-	// update lending limits
-	if limitsChanged {
-		err := r.Update(ctx, cq)
-		if err == nil {
-			log.FromContext(ctx).Info("Updated lending limits", "Resources", resources)
-			return ctrl.Result{}, nil
-		} else if errors.IsConflict(err) {
-			return ctrl.Result{Requeue: true}, nil
-		} else {
-			return ctrl.Result{}, err
-		}
+	if noScheduleNodesChanged {
+		r.triggerSlackCQMonitor()
+		log.FromContext(ctx).Info("Updated NoSchedule information", "Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
 	}
-
-	return ctrl.Result{}, nil
 }
 
// SetupWithManager sets up the controller with the Manager.
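One subtlety in the new NoSchedule logic: maps.Equal compares the two v1.ResourceList values with ==, i.e. by resource.Quantity's internal representation, not by Quantity.Cmp. That is adequate here because prior and next entries are both derived from node.Status.Capacity, so equal values share a representation; at worst a formatting difference costs one spurious (harmless) trigger. A self-contained illustration (the GPU resource name is just an example):

package main

import (
	"fmt"
	"maps"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	a := v1.ResourceList{"nvidia.com/gpu": resource.MustParse("8")}
	b := v1.ResourceList{"nvidia.com/gpu": resource.MustParse("8")}
	c := v1.ResourceList{"nvidia.com/gpu": resource.MustParse("8000m")}

	fmt.Println(maps.Equal(a, b)) // true: identical representations
	fmt.Println(maps.Equal(a, c)) // false: "8" vs "8000m" differ in form

	qa, qc := a["nvidia.com/gpu"], c["nvidia.com/gpu"]
	fmt.Println(qa.Cmp(qc) == 0) // true: semantically the same quantity
}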