@@ -38,7 +38,7 @@ import (
38
38
39
39
// NodeHealthMonitor watches Nodes and maintains mappings of Nodes that have either
40
40
// been marked as Unschedulable or that have been labeled to indicate that
41
- // they have resources that Autopilot has tainted as NoSchedule or NoExeucte .
41
+ // they have resources that Autopilot has tainted as NoSchedule or NoExecute.
42
42
// This information is used to automate the maintenance of the lendingLimit of
43
43
// a designated slack ClusterQueue and to migrate running workloads away from NoExecute resources.
44
44
type NodeHealthMonitor struct {
@@ -48,21 +48,18 @@ type NodeHealthMonitor struct {
48
48
}
49
49
50
50
var (
51
- // nodeInfoMutex synchronizes writes by NodeHealthMonitor with reads from AppWrapperReconciler and SlackClusterQueueMonitor
52
- nodeInfoMutex sync.RWMutex
53
-
54
51
// noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExecute taint
55
52
noExecuteNodes = make (map [string ]sets.Set [string ])
53
+ // noExecuteNodesMutex synchronizes access to noExecuteNodes
54
+ noExecuteNodesMutex sync.RWMutex
56
55
57
56
// noScheduleNodes is a mapping from Node names to ResourceLists of unschedulable resources.
58
57
// A resource may be unschedulable either because:
59
58
// (a) the Node is cordoned (node.Spec.Unschedulable is true) or
60
59
// (b) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
61
60
noScheduleNodes = make (map [string ]v1.ResourceList )
62
- )
63
-
64
- const (
65
- dispatchEventName = "*trigger*"
61
+ // noScheduleNodesMutex synchronizes access to noScheduleNodes
62
+ noScheduleNodesMutex sync.RWMutex
66
63
)
67
64
68
65
// permission to watch nodes
@@ -91,7 +88,7 @@ func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ct
91
88
func (r * NodeHealthMonitor ) triggerSlackCQMonitor () {
92
89
if r .Config .SlackQueueName != "" {
93
90
select {
94
- case r .Events <- event.GenericEvent {Object : & metav1.PartialObjectMetadata {ObjectMeta : metav1.ObjectMeta {Name : dispatchEventName }}}:
91
+ case r .Events <- event.GenericEvent {Object : & metav1.PartialObjectMetadata {ObjectMeta : metav1.ObjectMeta {Name : r . Config . SlackQueueName }}}:
95
92
default :
96
93
// do not block if event is already in channel
97
94
}
@@ -101,20 +98,20 @@ func (r *NodeHealthMonitor) triggerSlackCQMonitor() {
101
98
// update noExecuteNodes and noScheduleNodes for the deletion of nodeName
102
99
func (r * NodeHealthMonitor ) updateForNodeDeletion (ctx context.Context , nodeName string ) {
103
100
if _ , ok := noExecuteNodes [nodeName ]; ok {
104
- nodeInfoMutex .Lock () // BEGIN CRITICAL SECTION
101
+ noExecuteNodesMutex .Lock () // BEGIN CRITICAL SECTION
105
102
delete (noExecuteNodes , nodeName )
106
- nodeInfoMutex .Unlock () // END CRITICAL SECTION
107
- r .triggerSlackCQMonitor ()
103
+ noExecuteNodesMutex .Unlock () // END CRITICAL SECTION
108
104
log .FromContext (ctx ).Info ("Updated NoExecute information due to Node deletion" ,
109
105
"Number NoExecute Nodes" , len (noExecuteNodes ), "NoExecute Resource Details" , noExecuteNodes )
106
+ r .triggerSlackCQMonitor ()
110
107
}
111
108
if _ , ok := noScheduleNodes [nodeName ]; ok {
112
- nodeInfoMutex .Lock () // BEGIN CRITICAL SECTION
109
+ noScheduleNodesMutex .Lock () // BEGIN CRITICAL SECTION
113
110
delete (noScheduleNodes , nodeName )
114
- nodeInfoMutex .Unlock () // END CRITICAL SECTION
115
- r .triggerSlackCQMonitor ()
111
+ noScheduleNodesMutex .Unlock () // END CRITICAL SECTION
116
112
log .FromContext (ctx ).Info ("Updated NoSchedule information due to Node deletion" ,
117
113
"Number NoSchedule Nodes" , len (noScheduleNodes ), "NoSchedule Resource Details" , noScheduleNodes )
114
+ r .triggerSlackCQMonitor ()
118
115
}
119
116
}
120
117
@@ -132,7 +129,7 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
132
129
}
133
130
134
131
noExecuteNodesChanged := false
135
- nodeInfoMutex .Lock () // BEGIN CRITICAL SECTION
132
+ noExecuteNodesMutex .Lock () // BEGIN CRITICAL SECTION
136
133
if priorEntry , ok := noExecuteNodes [node .GetName ()]; ok {
137
134
if len (noExecuteResources ) == 0 {
138
135
delete (noExecuteNodes , node .GetName ())
@@ -145,11 +142,11 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
145
142
noExecuteNodes [node .GetName ()] = noExecuteResources
146
143
noExecuteNodesChanged = true
147
144
}
148
- nodeInfoMutex .Unlock () // END CRITICAL SECTION
145
+ noExecuteNodesMutex .Unlock () // END CRITICAL SECTION
149
146
150
147
if noExecuteNodesChanged {
151
- r .triggerSlackCQMonitor ()
152
148
log .FromContext (ctx ).Info ("Updated NoExecute information" , "Number NoExecute Nodes" , len (noExecuteNodes ), "NoExecute Resource Details" , noExecuteNodes )
149
+ r .triggerSlackCQMonitor ()
153
150
}
154
151
}
155
152
@@ -176,7 +173,7 @@ func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.
176
173
}
177
174
178
175
noScheduleNodesChanged := false
179
- nodeInfoMutex .Lock () // BEGIN CRITICAL SECTION
176
+ noScheduleNodesMutex .Lock () // BEGIN CRITICAL SECTION
180
177
if priorEntry , ok := noScheduleNodes [node .GetName ()]; ok {
181
178
if len (noScheduleResources ) == 0 {
182
179
delete (noScheduleNodes , node .GetName ())
@@ -189,11 +186,11 @@ func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.
189
186
noScheduleNodes [node .GetName ()] = noScheduleResources
190
187
noScheduleNodesChanged = true
191
188
}
192
- nodeInfoMutex .Unlock () // END CRITICAL SECTION
189
+ noScheduleNodesMutex .Unlock () // END CRITICAL SECTION
193
190
194
191
if noScheduleNodesChanged {
195
- r .triggerSlackCQMonitor ()
196
192
log .FromContext (ctx ).Info ("Updated NoSchedule information" , "Number NoSchedule Nodes" , len (noScheduleNodes ), "NoSchedule Resource Details" , noScheduleNodes )
193
+ r .triggerSlackCQMonitor ()
197
194
}
198
195
}
199
196
0 commit comments