Skip to content

Commit fd8cf72

Browse files
jason1028krjasonjung
authored and
jasonjung
committed
support multiple reasons per condition for CustomPluginMonitor
1 parent 56122ce commit fd8cf72

File tree

3 files changed

+850
-84
lines changed

3 files changed

+850
-84
lines changed

pkg/custompluginmonitor/custom_plugin_monitor.go

Lines changed: 108 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package custompluginmonitor
1919
import (
2020
"encoding/json"
2121
"io/ioutil"
22+
"k8s.io/node-problem-detector/pkg/util"
2223
"time"
2324

2425
"github.com/golang/glog"
@@ -28,7 +29,6 @@ import (
2829
"k8s.io/node-problem-detector/pkg/problemdaemon"
2930
"k8s.io/node-problem-detector/pkg/problemmetrics"
3031
"k8s.io/node-problem-detector/pkg/types"
31-
"k8s.io/node-problem-detector/pkg/util"
3232
"k8s.io/node-problem-detector/pkg/util/tomb"
3333
)
3434

@@ -126,17 +126,35 @@ func (c *customPluginMonitor) monitorLoop() {
126126

127127
resultChan := c.plugin.GetResultChan()
128128

129+
// intervalEndChan signals that all rules have been run for the current interval
130+
intervalEndChan := c.plugin.GetIntervalEndChan()
131+
var intervalResults []cpmtypes.Result
132+
129133
for {
130134
select {
131135
case result, ok := <-resultChan:
132136
if !ok {
133137
glog.Errorf("Result channel closed: %s", c.configPath)
134138
return
135139
}
140+
136141
glog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
137-
status := c.generateStatus(result)
142+
143+
// gather the results produced during a single rule-run interval
144+
intervalResults = append(intervalResults, result)
145+
case _, ok := <-intervalEndChan:
146+
if !ok {
147+
glog.Errorf("Interval End Channel closed: %s", c.configPath)
148+
return
149+
}
150+
151+
glog.V(3).Infof("All plugins ran for one interval for %s", c.configPath)
152+
status := c.generateStatus(intervalResults)
138153
glog.V(3).Infof("New status generated: %+v", status)
139154
c.statusChan <- status
155+
156+
glog.V(3).Info("Resetting interval")
157+
intervalResults = []cpmtypes.Result{}
140158
case <-c.tomb.Stopping():
141159
c.plugin.Stop()
142160
glog.Infof("Custom plugin monitor stopped: %s", c.configPath)
@@ -146,107 +164,117 @@ func (c *customPluginMonitor) monitorLoop() {
146164
}
147165
}
148166

149-
// generateStatus generates status from the plugin check result.
150-
func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Status {
167+
func (c *customPluginMonitor) generateStatus(results []cpmtypes.Result) *types.Status {
151168
timestamp := time.Now()
152169
var activeProblemEvents []types.Event
153170
var inactiveProblemEvents []types.Event
154-
if result.Rule.Type == types.Temp {
155-
// For temporary error only generate event when exit status is above warning
156-
if result.ExitStatus >= cpmtypes.NonOK {
157-
activeProblemEvents = append(activeProblemEvents, types.Event{
158-
Severity: types.Warn,
159-
Timestamp: timestamp,
160-
Reason: result.Rule.Reason,
161-
Message: result.Message,
162-
})
163-
}
164-
} else {
165-
// For permanent error that changes the condition
166-
for i := range c.conditions {
167-
condition := &c.conditions[i]
168-
if condition.Type == result.Rule.Condition {
169-
// The condition reason specified in the rule and the result message
170-
// represent the problem happened. We need to know the default condition
171-
// from the config, so that we can set the new condition reason/message
172-
// back when such problem goes away.
173-
var defaultConditionReason string
174-
var defaultConditionMessage string
175-
for j := range c.config.DefaultConditions {
176-
defaultCondition := &c.config.DefaultConditions[j]
177-
if defaultCondition.Type == result.Rule.Condition {
178-
defaultConditionReason = defaultCondition.Reason
179-
defaultConditionMessage = defaultCondition.Message
180-
break
181-
}
182-
}
183171

184-
needToUpdateCondition := true
185-
var newReason string
186-
var newMessage string
187-
status := toConditionStatus(result.ExitStatus)
188-
if condition.Status == types.True && status != types.True {
189-
// Scenario 1: Condition status changes from True to False/Unknown
190-
newReason = defaultConditionReason
191-
if status == types.False {
192-
newMessage = defaultConditionMessage
193-
} else {
194-
// When status unknown, the result's message is important for debug
195-
newMessage = result.Message
172+
var unProcessedResults []cpmtypes.Result
173+
174+
for _, result := range results {
175+
status := toConditionStatus(result.ExitStatus)
176+
if result.Rule.Type == types.Temp {
177+
// For temporary error only generate event when exit status is above warning
178+
if result.ExitStatus >= cpmtypes.NonOK {
179+
activeProblemEvents = append(activeProblemEvents, types.Event{
180+
Severity: types.Warn,
181+
Timestamp: timestamp,
182+
Reason: result.Rule.Reason,
183+
Message: result.Message,
184+
})
185+
}
186+
} else {
187+
// Defer results that set the condition true, and ignore results that set the condition false/unknown for a reason other than the condition's current one.
188+
// Results that set the condition true are processed in a second pass below.
189+
if status == types.True {
190+
unProcessedResults = append(unProcessedResults, result)
191+
continue
192+
}
193+
194+
for i := range c.conditions {
195+
condition := &c.conditions[i]
196+
197+
// If the result reports false/unknown for the reason currently set on the condition, unset the condition (set it to false/unknown) first.
198+
// When there are multiple reasons per condition, this prevents ignoring a new reason that sets
199+
// the condition true (since the original condition reason takes precedence), and prevents flapping (condition set to false
200+
// by the current reason, then to true by another reason).
201+
if condition.Type == result.Rule.Condition && condition.Reason == result.Rule.Reason {
202+
203+
// The condition reason specified in the rule and the result message
204+
// represent the problem happened. We need to know the default condition
205+
// from the config, so that we can set the new condition reason/message
206+
// back when such problem goes away.
207+
var defaultConditionReason string
208+
var defaultConditionMessage string
209+
for j := range c.config.DefaultConditions {
210+
defaultCondition := &c.config.DefaultConditions[j]
211+
if defaultCondition.Type == result.Rule.Condition {
212+
defaultConditionReason = defaultCondition.Reason
213+
defaultConditionMessage = defaultCondition.Message
214+
break
215+
}
196216
}
197-
} else if condition.Status != types.True && status == types.True {
198-
// Scenario 2: Condition status changes from False/Unknown to True
199-
newReason = result.Rule.Reason
200-
newMessage = result.Message
201-
} else if condition.Status != status {
202-
// Scenario 3: Condition status changes from False to Unknown or vice versa
217+
218+
var newReason string
219+
var newMessage string
203220
newReason = defaultConditionReason
204221
if status == types.False {
205222
newMessage = defaultConditionMessage
206223
} else {
207224
// When status unknown, the result's message is important for debug
208225
newMessage = result.Message
209226
}
210-
} else if condition.Status == types.True && status == types.True &&
211-
(condition.Reason != result.Rule.Reason ||
212-
(*c.config.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate && condition.Message != result.Message)) {
213-
// Scenario 4: Condition status does not change and it stays true.
214-
// condition reason changes or
215-
// condition message changes when message based condition update is enabled.
216-
newReason = result.Rule.Reason
217-
newMessage = result.Message
218-
} else {
219-
// Scenario 5: Condition status does not change and it stays False/Unknown.
220-
// This should just be the default reason or message (as a consequence
221-
// of scenario 1 and scenario 3 above).
222-
needToUpdateCondition = false
223-
}
224227

225-
if needToUpdateCondition {
226228
condition.Transition = timestamp
227229
condition.Status = status
228230
condition.Reason = newReason
229231
condition.Message = newMessage
230232

231-
updateEvent := util.GenerateConditionChangeEvent(
232-
condition.Type,
233-
status,
234-
newReason,
235-
newMessage,
236-
timestamp,
237-
)
233+
break
234+
}
235+
}
236+
}
237+
}
238238

239-
if status == types.True {
240-
activeProblemEvents = append(activeProblemEvents, updateEvent)
241-
} else {
242-
inactiveProblemEvents = append(inactiveProblemEvents, updateEvent)
243-
}
239+
for _, result := range unProcessedResults {
240+
status := toConditionStatus(result.ExitStatus)
241+
// iterate through the results that set a condition true, possibly for different reasons;
242+
// whichever result came through the result channel first takes precedence
243+
for i := range c.conditions {
244+
condition := &c.conditions[i]
245+
if condition.Type == result.Rule.Condition {
246+
if condition.Status != types.True ||
247+
(condition.Reason == result.Rule.Reason && *c.config.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate) {
248+
// update the condition only when it is currently false/unknown, or when it is already set for this same reason and message-based condition update is enabled.
249+
// for each condition, this if-block will be reached once
250+
condition.Transition = timestamp
251+
condition.Status = status
252+
condition.Reason = result.Rule.Reason
253+
condition.Message = result.Message
244254
}
245255

246256
break
247257
}
248258
}
249259
}
260+
261+
for i := range c.conditions {
262+
// emit a condition change event for every condition: inactive when its status is not true, active otherwise
263+
condition := &c.conditions[i]
264+
updateEvent := util.GenerateConditionChangeEvent(
265+
condition.Type,
266+
condition.Status,
267+
condition.Reason,
268+
condition.Message,
269+
timestamp,
270+
)
271+
if condition.Status != types.True {
272+
inactiveProblemEvents = append(inactiveProblemEvents, updateEvent)
273+
} else {
274+
activeProblemEvents = append(activeProblemEvents, updateEvent)
275+
}
276+
}
277+
250278
if *c.config.EnableMetricsReporting {
251279
// Increment problem counter only for active problems which just got detected.
252280
for _, event := range activeProblemEvents {
@@ -266,6 +294,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
266294
}
267295
}
268296
}
297+
269298
status := &types.Status{
270299
Source: c.config.Source,
271300
// TODO(random-liu): Aggregate events and conditions and then do periodically report.
@@ -276,6 +305,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
276305
if len(activeProblemEvents) != 0 || len(inactiveProblemEvents) != 0 {
277306
glog.V(0).Infof("New status generated: %+v", status)
278307
}
308+
279309
return status
280310
}
281311

0 commit comments

Comments
 (0)