@@ -26,6 +26,7 @@ import (
26
26
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
27
27
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
28
28
"k8s.io/node-problem-detector/pkg/problemdaemon"
29
+ "k8s.io/node-problem-detector/pkg/problemmetrics"
29
30
"k8s.io/node-problem-detector/pkg/types"
30
31
"k8s.io/node-problem-detector/pkg/util"
31
32
"k8s.io/node-problem-detector/pkg/util/tomb"
@@ -80,9 +81,31 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
80
81
c .plugin = plugin .NewPlugin (c .config )
81
82
// A 1000 size channel should be big enough.
82
83
c .statusChan = make (chan * types.Status , 1000 )
84
+
85
+ if * c .config .EnableMetricsReporting {
86
+ initializeProblemMetricsOrDie (c .config .Rules )
87
+ }
83
88
return c
84
89
}
85
90
91
+ // initializeProblemMetricsOrDie creates problem metrics for all problems and set the value to 0,
92
+ // panic if error occurs.
93
+ func initializeProblemMetricsOrDie (rules []* cpmtypes.CustomRule ) {
94
+ for _ , rule := range rules {
95
+ if rule .Type == types .Perm {
96
+ err := problemmetrics .GlobalProblemMetricsManager .SetProblemGauge (rule .Condition , rule .Reason , false )
97
+ if err != nil {
98
+ glog .Fatalf ("Failed to initialize problem gauge metrics for problem %q, reason %q: %v" ,
99
+ rule .Condition , rule .Reason , err )
100
+ }
101
+ }
102
+ err := problemmetrics .GlobalProblemMetricsManager .IncrementProblemCounter (rule .Reason , 0 )
103
+ if err != nil {
104
+ glog .Fatalf ("Failed to initialize problem counter metrics for %q: %v" , rule .Reason , err )
105
+ }
106
+ }
107
+ }
108
+
86
109
func (c * customPluginMonitor ) Start () (<- chan * types.Status , error ) {
87
110
glog .Info ("Start custom plugin monitor" )
88
111
go c .plugin .Run ()
@@ -120,11 +143,12 @@ func (c *customPluginMonitor) monitorLoop() {
120
143
// generateStatus generates status from the plugin check result.
121
144
func (c * customPluginMonitor ) generateStatus (result cpmtypes.Result ) * types.Status {
122
145
timestamp := time .Now ()
123
- var events []types.Event
146
+ var activeProblemEvents []types.Event
147
+ var inactiveProblemEvents []types.Event
124
148
if result .Rule .Type == types .Temp {
125
149
// For temporary error only generate event when exit status is above warning
126
150
if result .ExitStatus >= cpmtypes .NonOK {
127
- events = append (events , types.Event {
151
+ activeProblemEvents = append (activeProblemEvents , types.Event {
128
152
Severity : types .Warn ,
129
153
Timestamp : timestamp ,
130
154
Reason : result .Rule .Reason ,
@@ -151,7 +175,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
151
175
}
152
176
}
153
177
154
- events = append (events , util .GenerateConditionChangeEvent (
178
+ inactiveProblemEvents = append (inactiveProblemEvents , util .GenerateConditionChangeEvent (
155
179
condition .Type ,
156
180
status ,
157
181
defaultConditionReason ,
@@ -165,7 +189,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
165
189
// change 2: Condition status change from False/Unknown to True
166
190
condition .Transition = timestamp
167
191
condition .Message = result .Message
168
- events = append (events , util .GenerateConditionChangeEvent (
192
+ activeProblemEvents = append (activeProblemEvents , util .GenerateConditionChangeEvent (
169
193
condition .Type ,
170
194
status ,
171
195
result .Rule .Reason ,
@@ -178,7 +202,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
178
202
// change 3: Condition status change from False to Unknown or vice versa
179
203
condition .Transition = timestamp
180
204
condition .Message = result .Message
181
- events = append (events , util .GenerateConditionChangeEvent (
205
+ inactiveProblemEvents = append (inactiveProblemEvents , util .GenerateConditionChangeEvent (
182
206
condition .Type ,
183
207
status ,
184
208
result .Rule .Reason ,
@@ -196,22 +220,46 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
196
220
condition .Transition = timestamp
197
221
condition .Reason = result .Rule .Reason
198
222
condition .Message = result .Message
199
- events = append ( events , util .GenerateConditionChangeEvent (
223
+ updateEvent := util .GenerateConditionChangeEvent (
200
224
condition .Type ,
201
225
status ,
202
226
condition .Reason ,
203
227
timestamp ,
204
- ))
228
+ )
229
+ if condition .Status == types .True {
230
+ activeProblemEvents = append (activeProblemEvents , updateEvent )
231
+ } else {
232
+ inactiveProblemEvents = append (inactiveProblemEvents , updateEvent )
233
+ }
205
234
}
206
235
207
236
break
208
237
}
209
238
}
210
239
}
240
+ if * c .config .EnableMetricsReporting {
241
+ // Increment problem counter only for active problems which just got detected.
242
+ for _ , event := range activeProblemEvents {
243
+ err := problemmetrics .GlobalProblemMetricsManager .IncrementProblemCounter (
244
+ event .Reason , 1 )
245
+ if err != nil {
246
+ glog .Errorf ("Failed to update problem counter metrics for %q: %v" ,
247
+ event .Reason , err )
248
+ }
249
+ }
250
+ for _ , condition := range c .conditions {
251
+ err := problemmetrics .GlobalProblemMetricsManager .SetProblemGauge (
252
+ condition .Type , condition .Reason , condition .Status == types .True )
253
+ if err != nil {
254
+ glog .Errorf ("Failed to update problem gauge metrics for problem %q, reason %q: %v" ,
255
+ condition .Type , condition .Reason , err )
256
+ }
257
+ }
258
+ }
211
259
return & types.Status {
212
260
Source : c .config .Source ,
213
261
// TODO(random-liu): Aggregate events and conditions and then do periodically report.
214
- Events : events ,
262
+ Events : append ( activeProblemEvents , inactiveProblemEvents ... ) ,
215
263
Conditions : c .conditions ,
216
264
}
217
265
}
0 commit comments