@@ -18,20 +18,20 @@ package appwrapper
 
 import (
 	"context"
+	"maps"
 	"sync"
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
-	"k8s.io/apimachinery/pkg/types"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/utils/ptr"
 
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/event"
 	"sigs.k8s.io/controller-runtime/pkg/handler"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	"github.com/project-codeflare/appwrapper/pkg/config"
 )
@@ -44,51 +44,77 @@ import (
 type NodeHealthMonitor struct {
 	client.Client
 	Config *config.AppWrapperConfig
+	Events chan event.GenericEvent // event channel for NodeHealthMonitor to trigger SlackClusterQueueMonitor
 }
 
 var (
-	// noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExeucte taint
-	noExecuteNodes      = make(map[string]sets.Set[string])
-	noExecuteNodesMutex sync.RWMutex
+	// nodeInfoMutex synchronizes writes by NodeHealthMonitor with reads from AppWrapperReconciler and SlackClusterQueueMonitor
+	nodeInfoMutex sync.RWMutex
 
-	// noScheduleNodes is a mapping from Node names to resource quantities that are unschedulable.
+	// noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExecute taint
+	noExecuteNodes = make(map[string]sets.Set[string])
+
+	// noScheduleNodes is a mapping from Node names to ResourceLists of unschedulable resources.
 	// A resource may be unscheduable either because:
 	// (a) the Node is cordoned (node.Spec.Unschedulable is true) or
-	// (b) Autopilot has labeled the with either a NoExecute or NoSchedule taint.
-	noScheduleNodes = make(map[string]map[string]*resource.Quantity)
+	// (b) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
+	noScheduleNodes = make(map[string]v1.ResourceList)
 )
 
 // permission to watch nodes
 //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
-//+kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues,verbs=get;list;watch;update;patch
 
 func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	node := &v1.Node{}
 	if err := r.Get(ctx, req.NamespacedName, node); err != nil {
-		return ctrl.Result{}, nil
+		if errors.IsNotFound(err) {
+			r.updateForNodeDeletion(ctx, req.Name)
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
 	}
 
-	r.updateNoExecuteNodes(ctx, node)
-
-	// If there is a slack ClusterQueue, update its lending limits
-
-	if r.Config.SlackQueueName == "" {
-		return ctrl.Result{}, nil
+	if node.DeletionTimestamp.IsZero() {
+		r.updateNoExecuteNodes(ctx, node)
+		r.updateNoScheduleNodes(ctx, node)
+	} else {
+		r.updateForNodeDeletion(ctx, req.Name)
 	}
 
-	cq := &kueue.ClusterQueue{}
-	if err := r.Get(ctx, types.NamespacedName{Name: r.Config.SlackQueueName}, cq); err != nil {
-		if errors.IsNotFound(err) {
-			return ctrl.Result{}, nil // give up if slack quota is not defined
+	return ctrl.Result{}, nil
+}
+
+// Trigger dispatch by means of "*/*" request
+func (r *NodeHealthMonitor) triggerDispatch() {
+	if r.Config.SlackQueueName != "" {
+		select {
+		case r.Events <- event.GenericEvent{Object: &metav1.PartialObjectMetadata{ObjectMeta: metav1.ObjectMeta{Namespace: "*", Name: "*"}}}:
+		default:
+			// do not block if event is already in channel
 		}
-		return ctrl.Result{}, err
 	}
+}
 
-	r.updateNoScheduleNodes(ctx, cq, node)
-
-	return r.updateLendingLimits(ctx, cq)
+func (r *NodeHealthMonitor) updateForNodeDeletion(ctx context.Context, name string) {
+	if _, ok := noExecuteNodes[name]; ok {
+		nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+		delete(noExecuteNodes, name)
+		nodeInfoMutex.Unlock() // END CRITICAL SECTION
+		r.triggerDispatch()
+		log.FromContext(ctx).Info("Updated node NoExecute information for Node deletion",
+			"Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
+	}
+	if _, ok := noScheduleNodes[name]; ok {
+		nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+		delete(noScheduleNodes, name)
+		nodeInfoMutex.Unlock() // END CRITICAL SECTION
+		r.triggerDispatch()
+		log.FromContext(ctx).Info("Updated node NoSchedule information for Node deletion",
+			"Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
+	}
 }
 
+// update noExecuteNodes entry for this node
 func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.Node) {
 	noExecuteResources := make(sets.Set[string])
 	for key, value := range node.GetLabels() {
@@ -102,7 +128,7 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
 	}
 
 	noExecuteNodesChanged := false
-	noExecuteNodesMutex.Lock() // BEGIN CRITICAL SECTION
+	nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
 	if priorEntry, ok := noExecuteNodes[node.GetName()]; ok {
 		if len(noExecuteResources) == 0 {
 			delete(noExecuteNodes, node.GetName())
@@ -115,95 +141,56 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
 		noExecuteNodes[node.GetName()] = noExecuteResources
 		noExecuteNodesChanged = true
 	}
-	noExecuteNodesMutex.Unlock() // END CRITICAL SECTION
+	nodeInfoMutex.Unlock() // END CRITICAL SECTION
 
-	// Safe to log outside the mutex because because this method is the only writer of noExecuteNodes
-	// and the controller runtime is configured to not allow concurrent execution of this controller.
 	if noExecuteNodesChanged {
+		r.triggerDispatch()
 		log.FromContext(ctx).Info("Updated node NoExecute information", "Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
 	}
 }
 
-func (r *NodeHealthMonitor) updateNoScheduleNodes(_ context.Context, cq *kueue.ClusterQueue, node *v1.Node) {
-	// update unschedulable resource quantities for this node
-	noScheduleQuantities := make(map[string]*resource.Quantity)
+// update noScheduleNodes entry for this node
+func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.Node) {
+	var noScheduleResources v1.ResourceList
 	if node.Spec.Unschedulable {
-		// add all non-pod resources covered by cq if the node is cordoned
-		for _, resourceName := range cq.Spec.ResourceGroups[0].Flavors[0].Resources {
-			if string(resourceName.Name) != "pods" {
-				noScheduleQuantities[string(resourceName.Name)] = node.Status.Capacity.Name(resourceName.Name, resource.DecimalSI)
-			}
-		}
+		noScheduleResources = node.Status.Capacity.DeepCopy()
+		delete(noScheduleResources, v1.ResourcePods)
 	} else {
+		noScheduleResources = make(v1.ResourceList)
 		for key, value := range node.GetLabels() {
 			for resourceName, taints := range r.Config.Autopilot.ResourceTaints {
 				for _, taint := range taints {
 					if key == taint.Key && value == taint.Value {
-						noScheduleQuantities[resourceName] = node.Status.Capacity.Name(v1.ResourceName(resourceName), resource.DecimalSI)
+						quantity := node.Status.Capacity.Name(v1.ResourceName(resourceName), resource.DecimalSI)
+						if !quantity.IsZero() {
+							noScheduleResources[v1.ResourceName(resourceName)] = *quantity
+						}
 					}
 				}
 			}
 		}
 	}
 
-	if len(noScheduleQuantities) > 0 {
-		noScheduleNodes[node.GetName()] = noScheduleQuantities
-	} else {
-		delete(noScheduleNodes, node.GetName())
-	}
-}
-
-func (r *NodeHealthMonitor) updateLendingLimits(ctx context.Context, cq *kueue.ClusterQueue) (ctrl.Result, error) {
-
-	// compute unschedulable resource totals
-	unschedulableQuantities := map[string]*resource.Quantity{}
-	for _, quantities := range noScheduleNodes {
-		for resourceName, quantity := range quantities {
-			if !quantity.IsZero() {
-				if unschedulableQuantities[resourceName] == nil {
-					unschedulableQuantities[resourceName] = ptr.To(*quantity)
-				} else {
-					unschedulableQuantities[resourceName].Add(*quantity)
-				}
-			}
+	noScheduleNodesChanged := false
+	nodeInfoMutex.Lock() // BEGIN CRITICAL SECTION
+	if priorEntry, ok := noScheduleNodes[node.GetName()]; ok {
+		if len(noScheduleResources) == 0 {
+			delete(noScheduleNodes, node.GetName())
+			noScheduleNodesChanged = true
+		} else if !maps.Equal(priorEntry, noScheduleResources) {
+			noScheduleNodes[node.GetName()] = noScheduleResources
+			noScheduleNodesChanged = true
 		}
+	} else if len(noScheduleResources) > 0 {
+		noScheduleNodes[node.GetName()] = noScheduleResources
+		noScheduleNodesChanged = true
 	}
+	nodeInfoMutex.Unlock() // END CRITICAL SECTION
 
-	// enforce lending limits on 1st flavor of 1st resource group
-	resources := cq.Spec.ResourceGroups[0].Flavors[0].Resources
-	limitsChanged := false
-	for i, quota := range resources {
-		var lendingLimit *resource.Quantity
-		if unschedulableQuantity := unschedulableQuantities[quota.Name.String()]; unschedulableQuantity != nil {
-			if quota.NominalQuota.Cmp(*unschedulableQuantity) > 0 {
-				lendingLimit = ptr.To(quota.NominalQuota)
-				lendingLimit.Sub(*unschedulableQuantity)
-			} else {
-				lendingLimit = resource.NewQuantity(0, resource.DecimalSI)
-			}
-		}
-		if quota.LendingLimit == nil && lendingLimit != nil ||
-			quota.LendingLimit != nil && lendingLimit == nil ||
-			quota.LendingLimit != nil && lendingLimit != nil && quota.LendingLimit.Cmp(*lendingLimit) != 0 {
-			limitsChanged = true
-			resources[i].LendingLimit = lendingLimit
-		}
-	}
-
-	// update lending limits
-	if limitsChanged {
-		err := r.Update(ctx, cq)
-		if err == nil {
-			log.FromContext(ctx).Info("Updated lending limits", "Resources", resources)
-			return ctrl.Result{}, nil
-		} else if errors.IsConflict(err) {
-			return ctrl.Result{Requeue: true}, nil
-		} else {
-			return ctrl.Result{}, err
-		}
+	if noScheduleNodesChanged {
+		r.triggerDispatch()
+		log.FromContext(ctx).Info("Updated node NoSchedule information", "Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
 	}
-
-	return ctrl.Result{}, nil
 }
 
 // SetupWithManager sets up the controller with the Manager.
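
Not part of this diff: a minimal sketch of how the new Events channel might be consumed on the receiving side. It assumes controller-runtime v0.17-style APIs (source.Channel as a struct passed to WatchesRawSource); the SlackClusterQueueMonitor struct, its stubbed Reconcile, and the For() target shown here are illustrative, not the project's actual implementation.

package appwrapper

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/source"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

// SlackClusterQueueMonitor (illustrative) consumes the "*/*" GenericEvents
// that NodeHealthMonitor.triggerDispatch sends on the shared channel.
type SlackClusterQueueMonitor struct {
	Events chan event.GenericEvent // same channel handed to NodeHealthMonitor
}

// Reconcile would recompute the slack ClusterQueue's lending limits from
// noScheduleNodes; stubbed here for the sketch.
func (r *SlackClusterQueueMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	return ctrl.Result{}, nil
}

// SetupWithManager watches ClusterQueues and, in addition, enqueues a
// reconcile whenever a GenericEvent arrives on the Events channel.
func (r *SlackClusterQueueMonitor) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		Named("SlackClusterQueueMonitor").
		For(&kueue.ClusterQueue{}).
		WatchesRawSource(&source.Channel{Source: r.Events}, &handler.EnqueueRequestForObject{}).
		Complete(r)
}

Presumably the channel is created with a small buffer (e.g. capacity 1) so that triggerDispatch's non-blocking send coalesces bursts of node-health changes into a single pending "*/*" dispatch rather than blocking the NodeHealthMonitor reconciler.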