Skip to content

Commit 925a622

Browse files
author
Xuewei Zhang
committed
Report metrics from custom-plugin-monitor
1 parent fbebcf3 commit 925a622

6 files changed

+74
-0
lines changed

config/custom-plugin-monitor.json

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"enable_message_change_based_condition_update": false
99
},
1010
"source": "ntp-custom-plugin-monitor",
11+
"metricsReporting": true,
1112
"conditions": [
1213
{
1314
"type": "NTPProblem",

config/kernel-monitor-counter.json

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"concurrency": 1
88
},
99
"source": "kernel-monitor",
10+
"metricsReporting": true,
1011
"conditions": [
1112
{
1213
"type": "FrequentUnregisterNetDevice",

config/network-problem-monitor.json

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"concurrency": 3
88
},
99
"source": "network-custom-plugin-monitor",
10+
"metricsReporting": true,
1011
"conditions": [],
1112
"rules": [
1213
{

config/systemd-monitor-counter.json

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"concurrency": 1
88
},
99
"source": "systemd-monitor",
10+
"metricsReporting": true,
1011
"conditions": [
1112
{
1213
"type": "FrequentKubeletRestart",

pkg/custompluginmonitor/custom_plugin_monitor.go

+63
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
2727
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
2828
"k8s.io/node-problem-detector/pkg/problemdaemon"
29+
"k8s.io/node-problem-detector/pkg/problemmetrics"
2930
"k8s.io/node-problem-detector/pkg/types"
3031
"k8s.io/node-problem-detector/pkg/util"
3132
"k8s.io/node-problem-detector/pkg/util/tomb"
@@ -80,9 +81,31 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
8081
c.plugin = plugin.NewPlugin(c.config)
8182
// A 1000 size channel should be big enough.
8283
c.statusChan = make(chan *types.Status, 1000)
84+
85+
if *c.config.EnableMetricsReporting {
86+
initializeProblemMetricsOrDie(c.config.Rules)
87+
}
8388
return c
8489
}
8590

91+
// initializeProblemMetricsOrDie creates problem metrics for all problems and sets their values to 0,
92+
// terminating the process via glog.Fatalf if an error occurs.
93+
func initializeProblemMetricsOrDie(rules []*cpmtypes.CustomRule) {
94+
for _, rule := range rules {
95+
if rule.Type == types.Perm {
96+
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
97+
if err != nil {
98+
glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
99+
rule.Condition, rule.Reason, err)
100+
}
101+
}
102+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
103+
if err != nil {
104+
glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
105+
}
106+
}
107+
}
108+
86109
func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
87110
glog.Info("Start custom plugin monitor")
88111
go c.plugin.Run()
@@ -130,6 +153,13 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
130153
Reason: result.Rule.Reason,
131154
Message: result.Message,
132155
})
156+
157+
if *c.config.EnableMetricsReporting {
158+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(result.Rule.Reason, 1)
159+
if err != nil {
160+
glog.Errorf("Failed to update problem counter metrics for %q: %v", result.Rule.Reason, err)
161+
}
162+
}
133163
}
134164
} else {
135165
// A permanent error changes the condition
@@ -161,6 +191,14 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
161191
condition.Status = status
162192
condition.Message = defaultConditionMessage
163193
condition.Reason = defaultConditionReason
194+
195+
if *c.config.EnableMetricsReporting {
196+
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(result.Rule.Condition, result.Rule.Reason, false)
197+
if err != nil {
198+
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
199+
result.Rule.Condition, result.Rule.Reason, err)
200+
}
201+
}
164202
} else if condition.Status != types.True && status == types.True {
165203
// change 2: Condition status change from False/Unknown to True
166204
condition.Transition = timestamp
@@ -174,6 +212,18 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
174212

175213
condition.Status = status
176214
condition.Reason = result.Rule.Reason
215+
216+
if *c.config.EnableMetricsReporting {
217+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(result.Rule.Reason, 1)
218+
if err != nil {
219+
glog.Errorf("Failed to update problem counter metrics for %q: %v", result.Rule.Reason, err)
220+
}
221+
err = problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(result.Rule.Condition, result.Rule.Reason, true)
222+
if err != nil {
223+
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
224+
result.Rule.Condition, result.Rule.Reason, err)
225+
}
226+
}
177227
} else if condition.Status != status {
178228
// change 3: Condition status change from False to Unknown or vice versa
179229
condition.Transition = timestamp
@@ -202,6 +252,19 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
202252
condition.Reason,
203253
timestamp,
204254
))
255+
256+
// Update metrics only when the condition reason changes.
257+
if condition.Reason != result.Rule.Reason && *c.config.EnableMetricsReporting {
258+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(result.Rule.Reason, 1)
259+
if err != nil {
260+
glog.Errorf("Failed to update problem counter metrics for %q: %v", result.Rule.Reason, err)
261+
}
262+
err = problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(result.Rule.Condition, result.Rule.Reason, true)
263+
if err != nil {
264+
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
265+
result.Rule.Condition, result.Rule.Reason, err)
266+
}
267+
}
205268
}
206269

207270
break

pkg/custompluginmonitor/types/config.go

+7
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ var (
3232
defaultMaxOutputLength = 80
3333
defaultConcurrency = 3
3434
defaultMessageChangeBasedConditionUpdate = false
35+
defaultEnableMetricsReporting = true
3536

3637
customPluginName = "custom"
3738
)
@@ -66,6 +67,8 @@ type CustomPluginConfig struct {
6667
DefaultConditions []types.Condition `json:"conditions"`
6768
// Rules are the rules custom plugin monitor will follow to parse and invoke plugins.
6869
Rules []*CustomRule `json:"rules"`
70+
// EnableMetricsReporting describes whether to report problems as metrics or not. Defaults to true when unset.
71+
EnableMetricsReporting *bool `json:"metricsReporting,omitempty"`
6972
}
7073

7174
// ApplyConfiguration applies default configurations.
@@ -112,6 +115,10 @@ func (cpc *CustomPluginConfig) ApplyConfiguration() error {
112115
}
113116
}
114117

118+
if cpc.EnableMetricsReporting == nil {
119+
cpc.EnableMetricsReporting = &defaultEnableMetricsReporting
120+
}
121+
115122
return nil
116123
}
117124

0 commit comments

Comments
 (0)