@@ -26,6 +26,7 @@ import (
26
26
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
27
27
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
28
28
"k8s.io/node-problem-detector/pkg/problemdaemon"
29
+ "k8s.io/node-problem-detector/pkg/problemmetrics"
29
30
"k8s.io/node-problem-detector/pkg/types"
30
31
"k8s.io/node-problem-detector/pkg/util"
31
32
"k8s.io/node-problem-detector/pkg/util/tomb"
@@ -80,9 +81,31 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
80
81
c .plugin = plugin .NewPlugin (c .config )
81
82
// A 1000 size channel should be big enough.
82
83
c .statusChan = make (chan * types.Status , 1000 )
84
+
85
+ if * c .config .EnableMetricsReporting {
86
+ initializeProblemMetricsOrDie (c .config .Rules )
87
+ }
83
88
return c
84
89
}
85
90
91
+ // initializeProblemMetricsOrDie creates problem metrics for all problems and set the value to 0,
92
+ // panic if error occurs.
93
+ func initializeProblemMetricsOrDie (rules []* cpmtypes.CustomRule ) {
94
+ for _ , rule := range rules {
95
+ if rule .Type == types .Perm {
96
+ err := problemmetrics .GlobalProblemMetricsManager .SetProblemGauge (rule .Condition , rule .Reason , false )
97
+ if err != nil {
98
+ glog .Fatalf ("Failed to initialize problem gauge metrics for problem %q, reason %q: %v" ,
99
+ rule .Condition , rule .Reason , err )
100
+ }
101
+ }
102
+ err := problemmetrics .GlobalProblemMetricsManager .IncrementProblemCounter (rule .Reason , 0 )
103
+ if err != nil {
104
+ glog .Fatalf ("Failed to initialize problem counter metrics for %q: %v" , rule .Reason , err )
105
+ }
106
+ }
107
+ }
108
+
86
109
func (c * customPluginMonitor ) Start () (<- chan * types.Status , error ) {
87
110
glog .Info ("Start custom plugin monitor" )
88
111
go c .plugin .Run ()
@@ -130,6 +153,13 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
130
153
Reason : result .Rule .Reason ,
131
154
Message : result .Message ,
132
155
})
156
+
157
+ if * c .config .EnableMetricsReporting {
158
+ err := problemmetrics .GlobalProblemMetricsManager .IncrementProblemCounter (result .Rule .Reason , 1 )
159
+ if err != nil {
160
+ glog .Errorf ("Failed to update problem counter metrics for %q: %v" , result .Rule .Reason , err )
161
+ }
162
+ }
133
163
}
134
164
} else {
135
165
// For permanent error changes the condition
@@ -161,6 +191,14 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
161
191
condition .Status = status
162
192
condition .Message = defaultConditionMessage
163
193
condition .Reason = defaultConditionReason
194
+
195
+ if * c .config .EnableMetricsReporting {
196
+ err := problemmetrics .GlobalProblemMetricsManager .SetProblemGauge (result .Rule .Condition , result .Rule .Reason , false )
197
+ if err != nil {
198
+ glog .Errorf ("Failed to update problem gauge metrics for problem %q, reason %q: %v" ,
199
+ result .Rule .Condition , result .Rule .Reason , err )
200
+ }
201
+ }
164
202
} else if condition .Status != types .True && status == types .True {
165
203
// change 2: Condition status change from False/Unknown to True
166
204
condition .Transition = timestamp
@@ -174,6 +212,18 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
174
212
175
213
condition .Status = status
176
214
condition .Reason = result .Rule .Reason
215
+
216
+ if * c .config .EnableMetricsReporting {
217
+ err := problemmetrics .GlobalProblemMetricsManager .IncrementProblemCounter (result .Rule .Reason , 1 )
218
+ if err != nil {
219
+ glog .Errorf ("Failed to update problem counter metrics for %q: %v" , result .Rule .Reason , err )
220
+ }
221
+ err = problemmetrics .GlobalProblemMetricsManager .SetProblemGauge (result .Rule .Condition , result .Rule .Reason , true )
222
+ if err != nil {
223
+ glog .Errorf ("Failed to update problem gauge metrics for problem %q, reason %q: %v" ,
224
+ result .Rule .Condition , result .Rule .Reason , err )
225
+ }
226
+ }
177
227
} else if condition .Status != status {
178
228
// change 3: Condition status change from False to Unknown or vice versa
179
229
condition .Transition = timestamp
@@ -202,6 +252,19 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
202
252
condition .Reason ,
203
253
timestamp ,
204
254
))
255
+
256
+ // update metrics only when condition reason changes.
257
+ if condition .Reason != result .Rule .Reason && * c .config .EnableMetricsReporting {
258
+ err := problemmetrics .GlobalProblemMetricsManager .IncrementProblemCounter (result .Rule .Reason , 1 )
259
+ if err != nil {
260
+ glog .Errorf ("Failed to update problem counter metrics for %q: %v" , result .Rule .Reason , err )
261
+ }
262
+ err = problemmetrics .GlobalProblemMetricsManager .SetProblemGauge (result .Rule .Condition , result .Rule .Reason , true )
263
+ if err != nil {
264
+ glog .Errorf ("Failed to update problem gauge metrics for problem %q, reason %q: %v" ,
265
+ result .Rule .Condition , result .Rule .Reason , err )
266
+ }
267
+ }
205
268
}
206
269
207
270
break
0 commit comments