@@ -19,6 +19,7 @@ package custompluginmonitor
19
19
import (
20
20
"encoding/json"
21
21
"io/ioutil"
22
+ "k8s.io/node-problem-detector/pkg/util"
22
23
"time"
23
24
24
25
"github.com/golang/glog"
@@ -28,7 +29,6 @@ import (
28
29
"k8s.io/node-problem-detector/pkg/problemdaemon"
29
30
"k8s.io/node-problem-detector/pkg/problemmetrics"
30
31
"k8s.io/node-problem-detector/pkg/types"
31
- "k8s.io/node-problem-detector/pkg/util"
32
32
"k8s.io/node-problem-detector/pkg/util/tomb"
33
33
)
34
34
@@ -126,17 +126,35 @@ func (c *customPluginMonitor) monitorLoop() {
126
126
127
127
resultChan := c .plugin .GetResultChan ()
128
128
129
+ // runRules done for the interval
130
+ intervalEndChan := c .plugin .GetIntervalEndChan ()
131
+ var intervalResults []cpmtypes.Result
132
+
129
133
for {
130
134
select {
131
135
case result , ok := <- resultChan :
132
136
if ! ok {
133
137
glog .Errorf ("Result channel closed: %s" , c .configPath )
134
138
return
135
139
}
140
+
136
141
glog .V (3 ).Infof ("Receive new plugin result for %s: %+v" , c .configPath , result )
137
- status := c .generateStatus (result )
142
+
143
+ // gather results for single rule interval loop
144
+ intervalResults = append (intervalResults , result )
145
+ case _ , ok := <- intervalEndChan :
146
+ if ! ok {
147
+ glog .Errorf ("Interval End Channel closed: %s" , c .configPath )
148
+ return
149
+ }
150
+
151
+ glog .V (3 ).Infof ("All plugins ran for one interval for %s" , c .configPath )
152
+ status := c .generateStatus (intervalResults )
138
153
glog .V (3 ).Infof ("New status generated: %+v" , status )
139
154
c .statusChan <- status
155
+
156
+ glog .V (3 ).Info ("Resetting interval" )
157
+ intervalResults = []cpmtypes.Result {}
140
158
case <- c .tomb .Stopping ():
141
159
c .plugin .Stop ()
142
160
glog .Infof ("Custom plugin monitor stopped: %s" , c .configPath )
@@ -146,107 +164,117 @@ func (c *customPluginMonitor) monitorLoop() {
146
164
}
147
165
}
148
166
149
- // generateStatus generates status from the plugin check result.
150
- func (c * customPluginMonitor ) generateStatus (result cpmtypes.Result ) * types.Status {
167
+ func (c * customPluginMonitor ) generateStatus (results []cpmtypes.Result ) * types.Status {
151
168
timestamp := time .Now ()
152
169
var activeProblemEvents []types.Event
153
170
var inactiveProblemEvents []types.Event
154
- if result .Rule .Type == types .Temp {
155
- // For temporary error only generate event when exit status is above warning
156
- if result .ExitStatus >= cpmtypes .NonOK {
157
- activeProblemEvents = append (activeProblemEvents , types.Event {
158
- Severity : types .Warn ,
159
- Timestamp : timestamp ,
160
- Reason : result .Rule .Reason ,
161
- Message : result .Message ,
162
- })
163
- }
164
- } else {
165
- // For permanent error that changes the condition
166
- for i := range c .conditions {
167
- condition := & c .conditions [i ]
168
- if condition .Type == result .Rule .Condition {
169
- // The condition reason specified in the rule and the result message
170
- // represent the problem happened. We need to know the default condition
171
- // from the config, so that we can set the new condition reason/message
172
- // back when such problem goes away.
173
- var defaultConditionReason string
174
- var defaultConditionMessage string
175
- for j := range c .config .DefaultConditions {
176
- defaultCondition := & c .config .DefaultConditions [j ]
177
- if defaultCondition .Type == result .Rule .Condition {
178
- defaultConditionReason = defaultCondition .Reason
179
- defaultConditionMessage = defaultCondition .Message
180
- break
181
- }
182
- }
183
171
184
- needToUpdateCondition := true
185
- var newReason string
186
- var newMessage string
187
- status := toConditionStatus (result .ExitStatus )
188
- if condition .Status == types .True && status != types .True {
189
- // Scenario 1: Condition status changes from True to False/Unknown
190
- newReason = defaultConditionReason
191
- if status == types .False {
192
- newMessage = defaultConditionMessage
193
- } else {
194
- // When status unknown, the result's message is important for debug
195
- newMessage = result .Message
172
+ var unProcessedResults []cpmtypes.Result
173
+
174
+ for _ , result := range results {
175
+ status := toConditionStatus (result .ExitStatus )
176
+ if result .Rule .Type == types .Temp {
177
+ // Temp rules do not touch conditions in this branch; they only add a Warn event, and only when the exit status is at or above cpmtypes.NonOK.
178
+ if result .ExitStatus >= cpmtypes .NonOK {
179
+ activeProblemEvents = append (activeProblemEvents , types.Event {
180
+ Severity : types .Warn ,
181
+ Timestamp : timestamp ,
182
+ Reason : result .Rule .Reason ,
183
+ Message : result .Message ,
184
+ })
185
+ }
186
+ } else {
187
+ // A result that maps to condition status True is not applied in this pass; it is queued in unProcessedResults
188
+ // and handled by the later loop over unProcessedResults. Only False/Unknown results continue below.
189
+ if status == types .True {
190
+ unProcessedResults = append (unProcessedResults , result )
191
+ continue
192
+ }
193
+
194
+ for i := range c .conditions {
195
+ condition := & c .conditions [i ]
196
+
197
+ // Only a result whose rule reason equals the condition's current reason may reset the condition to False/Unknown.
198
+ // With several rules (reasons) feeding one condition, a False/Unknown result from a rule that did not set the
199
+ // current reason is ignored, so it cannot clear a problem reported by another rule. Resetting in this first pass,
200
+ // before any queued True result is applied, lets a different reason set the condition True within the same call.
201
+ if condition .Type == result .Rule .Condition && condition .Reason == result .Rule .Reason {
202
+
203
+ // The rule's reason and the result's message describe the problem while it is reported.
204
+ // Look up the default reason/message configured for this condition type in
205
+ // c.config.DefaultConditions so the condition can be set back to them below
206
+ // (both stay empty strings if no default condition matches the type).
207
+ var defaultConditionReason string
208
+ var defaultConditionMessage string
209
+ for j := range c .config .DefaultConditions {
210
+ defaultCondition := & c .config .DefaultConditions [j ]
211
+ if defaultCondition .Type == result .Rule .Condition {
212
+ defaultConditionReason = defaultCondition .Reason
213
+ defaultConditionMessage = defaultCondition .Message
214
+ break
215
+ }
196
216
}
197
- } else if condition .Status != types .True && status == types .True {
198
- // Scenario 2: Condition status changes from False/Unknown to True
199
- newReason = result .Rule .Reason
200
- newMessage = result .Message
201
- } else if condition .Status != status {
202
- // Scenario 3: Condition status changes from False to Unknown or vice versa
217
+
218
+ var newReason string
219
+ var newMessage string
203
220
newReason = defaultConditionReason
204
221
if status == types .False {
205
222
newMessage = defaultConditionMessage
206
223
} else {
207
224
// When status unknown, the result's message is important for debug
208
225
newMessage = result .Message
209
226
}
210
- } else if condition .Status == types .True && status == types .True &&
211
- (condition .Reason != result .Rule .Reason ||
212
- (* c .config .PluginGlobalConfig .EnableMessageChangeBasedConditionUpdate && condition .Message != result .Message )) {
213
- // Scenario 4: Condition status does not change and it stays true.
214
- // condition reason changes or
215
- // condition message changes when message based condition update is enabled.
216
- newReason = result .Rule .Reason
217
- newMessage = result .Message
218
- } else {
219
- // Scenario 5: Condition status does not change and it stays False/Unknown.
220
- // This should just be the default reason or message (as a consequence
221
- // of scenario 1 and scenario 3 above).
222
- needToUpdateCondition = false
223
- }
224
227
225
- if needToUpdateCondition {
226
228
condition .Transition = timestamp
227
229
condition .Status = status
228
230
condition .Reason = newReason
229
231
condition .Message = newMessage
230
232
231
- updateEvent := util .GenerateConditionChangeEvent (
232
- condition .Type ,
233
- status ,
234
- newReason ,
235
- newMessage ,
236
- timestamp ,
237
- )
233
+ break
234
+ }
235
+ }
236
+ }
237
+ }
238
238
239
- if status == types .True {
240
- activeProblemEvents = append (activeProblemEvents , updateEvent )
241
- } else {
242
- inactiveProblemEvents = append (inactiveProblemEvents , updateEvent )
243
- }
239
+ for _ , result := range unProcessedResults {
240
+ status := toConditionStatus (result .ExitStatus )
241
+ // Second pass: apply the queued results, all of which map to condition status True.
242
+ // Results are applied in slice order, so once one of them sets a condition True, a later result with a different reason no longer passes the test below.
243
+ for i := range c .conditions {
244
+ condition := & c .conditions [i ]
245
+ if condition .Type == result .Rule .Condition {
246
+ if condition .Status != types .True ||
247
+ (condition .Reason == result .Rule .Reason && * c .config .PluginGlobalConfig .EnableMessageChangeBasedConditionUpdate ) {
248
+ // Update when the condition is not currently True, or when the same reason is reported again and message-change-based updates are enabled.
249
+ // NOTE(review): the removed code also required condition.Message != result.Message for the second case; without it Transition is refreshed on every such result - confirm this is intended.
250
+ condition .Transition = timestamp
251
+ condition .Status = status
252
+ condition .Reason = result .Rule .Reason
253
+ condition .Message = result .Message
244
254
}
245
255
246
256
break
247
257
}
248
258
}
249
259
}
260
+
261
+ for i := range c .conditions {
262
+ // Build one event per condition from its current state: status True goes to activeProblemEvents, anything else to inactiveProblemEvents. NOTE(review): this runs for every condition on every call, whether or not the condition changed in this interval - confirm that re-emitting the event each time is intended.
263
+ condition := & c .conditions [i ]
264
+ updateEvent := util .GenerateConditionChangeEvent (
265
+ condition .Type ,
266
+ condition .Status ,
267
+ condition .Reason ,
268
+ condition .Message ,
269
+ timestamp ,
270
+ )
271
+ if condition .Status != types .True {
272
+ inactiveProblemEvents = append (inactiveProblemEvents , updateEvent )
273
+ } else {
274
+ activeProblemEvents = append (activeProblemEvents , updateEvent )
275
+ }
276
+ }
277
+
250
278
if * c .config .EnableMetricsReporting {
251
279
// Increment problem counter only for active problems which just got detected.
252
280
for _ , event := range activeProblemEvents {
@@ -266,6 +294,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
266
294
}
267
295
}
268
296
}
297
+
269
298
status := & types.Status {
270
299
Source : c .config .Source ,
271
300
// TODO(random-liu): Aggregate events and conditions and then do periodically report.
@@ -276,6 +305,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
276
305
if len (activeProblemEvents ) != 0 || len (inactiveProblemEvents ) != 0 {
277
306
glog .V (0 ).Infof ("New status generated: %+v" , status )
278
307
}
308
+
279
309
return status
280
310
}
281
311
0 commit comments