Skip to content

Commit 94af7de

Browse files
author
Xuewei Zhang
committed
Report metrics from custom-plugin-monitor
1 parent fbebcf3 commit 94af7de

8 files changed

+110
-26
lines changed

config/custom-plugin-monitor.json

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"enable_message_change_based_condition_update": false
99
},
1010
"source": "ntp-custom-plugin-monitor",
11+
"metricsReporting": true,
1112
"conditions": [
1213
{
1314
"type": "NTPProblem",

config/kernel-monitor-counter.json

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"concurrency": 1
88
},
99
"source": "kernel-monitor",
10+
"metricsReporting": true,
1011
"conditions": [
1112
{
1213
"type": "FrequentUnregisterNetDevice",

config/network-problem-monitor.json

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"concurrency": 3
88
},
99
"source": "network-custom-plugin-monitor",
10+
"metricsReporting": true,
1011
"conditions": [],
1112
"rules": [
1213
{

config/systemd-monitor-counter.json

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"concurrency": 1
88
},
99
"source": "systemd-monitor",
10+
"metricsReporting": true,
1011
"conditions": [
1112
{
1213
"type": "FrequentKubeletRestart",

pkg/custompluginmonitor/custom_plugin_monitor.go

+56-8
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
2727
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
2828
"k8s.io/node-problem-detector/pkg/problemdaemon"
29+
"k8s.io/node-problem-detector/pkg/problemmetrics"
2930
"k8s.io/node-problem-detector/pkg/types"
3031
"k8s.io/node-problem-detector/pkg/util"
3132
"k8s.io/node-problem-detector/pkg/util/tomb"
@@ -80,9 +81,31 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
8081
c.plugin = plugin.NewPlugin(c.config)
8182
// A 1000 size channel should be big enough.
8283
c.statusChan = make(chan *types.Status, 1000)
84+
85+
if *c.config.EnableMetricsReporting {
86+
initializeProblemMetricsOrDie(c.config.Rules)
87+
}
8388
return c
8489
}
8590

91+
// initializeProblemMetricsOrDie creates problem metrics for all problems and sets their values to 0,
92+
// logging a fatal error and exiting (via glog.Fatalf) if an error occurs.
93+
func initializeProblemMetricsOrDie(rules []*cpmtypes.CustomRule) {
94+
for _, rule := range rules {
95+
if rule.Type == types.Perm {
96+
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
97+
if err != nil {
98+
glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
99+
rule.Condition, rule.Reason, err)
100+
}
101+
}
102+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
103+
if err != nil {
104+
glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
105+
}
106+
}
107+
}
108+
86109
func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
87110
glog.Info("Start custom plugin monitor")
88111
go c.plugin.Run()
@@ -120,11 +143,12 @@ func (c *customPluginMonitor) monitorLoop() {
120143
// generateStatus generates status from the plugin check result.
121144
func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Status {
122145
timestamp := time.Now()
123-
var events []types.Event
146+
var activeProblemEvents []types.Event
147+
var inactiveProblemEvents []types.Event
124148
if result.Rule.Type == types.Temp {
125149
// For temporary errors, only generate an event when the exit status is NonOK or above.
126150
if result.ExitStatus >= cpmtypes.NonOK {
127-
events = append(events, types.Event{
151+
activeProblemEvents = append(activeProblemEvents, types.Event{
128152
Severity: types.Warn,
129153
Timestamp: timestamp,
130154
Reason: result.Rule.Reason,
@@ -151,7 +175,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
151175
}
152176
}
153177

154-
events = append(events, util.GenerateConditionChangeEvent(
178+
inactiveProblemEvents = append(inactiveProblemEvents, util.GenerateConditionChangeEvent(
155179
condition.Type,
156180
status,
157181
defaultConditionReason,
@@ -165,7 +189,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
165189
// change 2: Condition status change from False/Unknown to True
166190
condition.Transition = timestamp
167191
condition.Message = result.Message
168-
events = append(events, util.GenerateConditionChangeEvent(
192+
activeProblemEvents = append(activeProblemEvents, util.GenerateConditionChangeEvent(
169193
condition.Type,
170194
status,
171195
result.Rule.Reason,
@@ -178,7 +202,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
178202
// change 3: Condition status change from False to Unknown or vice versa
179203
condition.Transition = timestamp
180204
condition.Message = result.Message
181-
events = append(events, util.GenerateConditionChangeEvent(
205+
inactiveProblemEvents = append(inactiveProblemEvents, util.GenerateConditionChangeEvent(
182206
condition.Type,
183207
status,
184208
result.Rule.Reason,
@@ -196,22 +220,46 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
196220
condition.Transition = timestamp
197221
condition.Reason = result.Rule.Reason
198222
condition.Message = result.Message
199-
events = append(events, util.GenerateConditionChangeEvent(
223+
updateEvent := util.GenerateConditionChangeEvent(
200224
condition.Type,
201225
status,
202226
condition.Reason,
203227
timestamp,
204-
))
228+
)
229+
if condition.Status == types.True {
230+
activeProblemEvents = append(activeProblemEvents, updateEvent)
231+
} else {
232+
inactiveProblemEvents = append(inactiveProblemEvents, updateEvent)
233+
}
205234
}
206235

207236
break
208237
}
209238
}
210239
}
240+
if *c.config.EnableMetricsReporting {
241+
// Increment problem counter only for active problems which just got detected.
242+
for _, event := range activeProblemEvents {
243+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(
244+
event.Reason, 1)
245+
if err != nil {
246+
glog.Errorf("Failed to update problem counter metrics for %q: %v",
247+
event.Reason, err)
248+
}
249+
}
250+
for _, condition := range c.conditions {
251+
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
252+
condition.Type, condition.Reason, condition.Status == types.True)
253+
if err != nil {
254+
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
255+
condition.Type, condition.Reason, err)
256+
}
257+
}
258+
}
211259
return &types.Status{
212260
Source: c.config.Source,
213261
// TODO(random-liu): Aggregate events and conditions and then do periodically report.
214-
Events: events,
262+
Events: append(activeProblemEvents, inactiveProblemEvents...),
215263
Conditions: c.conditions,
216264
}
217265
}

pkg/custompluginmonitor/types/config.go

+7
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ var (
3232
defaultMaxOutputLength = 80
3333
defaultConcurrency = 3
3434
defaultMessageChangeBasedConditionUpdate = false
35+
defaultEnableMetricsReporting = true
3536

3637
customPluginName = "custom"
3738
)
@@ -66,6 +67,8 @@ type CustomPluginConfig struct {
6667
DefaultConditions []types.Condition `json:"conditions"`
6768
// Rules are the rules custom plugin monitor will follow to parse and invoke plugins.
6869
Rules []*CustomRule `json:"rules"`
70+
// EnableMetricsReporting describes whether to report problems as metrics or not.
71+
EnableMetricsReporting *bool `json:"metricsReporting,omitempty"`
6972
}
7073

7174
// ApplyConfiguration applies default configurations.
@@ -112,6 +115,10 @@ func (cpc *CustomPluginConfig) ApplyConfiguration() error {
112115
}
113116
}
114117

118+
if cpc.EnableMetricsReporting == nil {
119+
cpc.EnableMetricsReporting = &defaultEnableMetricsReporting
120+
}
121+
115122
return nil
116123
}
117124

pkg/custompluginmonitor/types/config_test.go

+24
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
3030
maxOutputLength := 79
3131
concurrency := 2
3232
messageChangeBasedConditionUpdate := true
33+
disableMetricsReporting := false
3334

3435
ruleTimeout := 1 * time.Second
3536
ruleTimeoutString := ruleTimeout.String()
@@ -60,6 +61,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
6061
Concurrency: &defaultConcurrency,
6162
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
6263
},
64+
EnableMetricsReporting: &defaultEnableMetricsReporting,
6365
Rules: []*CustomRule{
6466
{
6567
Path: "../plugin/test-data/ok.sh",
@@ -88,6 +90,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
8890
Concurrency: &defaultConcurrency,
8991
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
9092
},
93+
EnableMetricsReporting: &defaultEnableMetricsReporting,
9194
},
9295
},
9396
"custom default timeout": {
@@ -106,6 +109,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
106109
Concurrency: &defaultConcurrency,
107110
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
108111
},
112+
EnableMetricsReporting: &defaultEnableMetricsReporting,
109113
},
110114
},
111115
"custom max output length": {
@@ -124,6 +128,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
124128
Concurrency: &defaultConcurrency,
125129
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
126130
},
131+
EnableMetricsReporting: &defaultEnableMetricsReporting,
127132
},
128133
},
129134
"custom concurrency": {
@@ -142,6 +147,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
142147
Concurrency: &concurrency,
143148
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
144149
},
150+
EnableMetricsReporting: &defaultEnableMetricsReporting,
145151
},
146152
},
147153
"custom message change based condition update": {
@@ -160,6 +166,24 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
160166
Concurrency: &defaultConcurrency,
161167
EnableMessageChangeBasedConditionUpdate: &messageChangeBasedConditionUpdate,
162168
},
169+
EnableMetricsReporting: &defaultEnableMetricsReporting,
170+
},
171+
},
172+
"disable metrics reporting": {
173+
Orig: CustomPluginConfig{
174+
EnableMetricsReporting: &disableMetricsReporting,
175+
},
176+
Wanted: CustomPluginConfig{
177+
PluginGlobalConfig: pluginGlobalConfig{
178+
InvokeIntervalString: &defaultInvokeIntervalString,
179+
InvokeInterval: &defaultInvokeInterval,
180+
TimeoutString: &defaultGlobalTimeoutString,
181+
Timeout: &defaultGlobalTimeout,
182+
MaxOutputLength: &defaultMaxOutputLength,
183+
Concurrency: &defaultConcurrency,
184+
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
185+
},
186+
EnableMetricsReporting: &disableMetricsReporting,
163187
},
164188
},
165189
}

pkg/systemlogmonitor/log_monitor.go

+19-18
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
157157
timestamp := logs[0].Timestamp
158158
message := generateMessage(logs)
159159
var events []types.Event
160+
var changedConditions []*types.Condition
160161
if rule.Type == types.Temp {
161162
// For temporary error only generate event
162163
events = append(events, types.Event{
@@ -165,12 +166,6 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
165166
Reason: rule.Reason,
166167
Message: message,
167168
})
168-
if *l.config.EnableMetricsReporting {
169-
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 1)
170-
if err != nil {
171-
glog.Errorf("Failed to update problem counter metrics for %q: %v", rule.Reason, err)
172-
}
173-
}
174169
} else {
175170
// For permanent errors, change the condition.
176171
for i := range l.conditions {
@@ -188,26 +183,32 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
188183
rule.Reason,
189184
timestamp,
190185
))
191-
192-
if *l.config.EnableMetricsReporting {
193-
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, true)
194-
if err != nil {
195-
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
196-
rule.Condition, rule.Reason, err)
197-
}
198-
err = problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 1)
199-
if err != nil {
200-
glog.Errorf("Failed to update problem counter metrics for %q: %v", rule.Reason, err)
201-
}
202-
}
203186
}
204187
condition.Status = types.True
205188
condition.Reason = rule.Reason
189+
changedConditions = append(changedConditions, condition)
206190
break
207191
}
208192
}
209193
}
210194

195+
if *l.config.EnableMetricsReporting {
196+
for _, event := range events {
197+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(event.Reason, 1)
198+
if err != nil {
199+
glog.Errorf("Failed to update problem counter metrics for %q: %v", event.Reason, err)
200+
}
201+
}
202+
for _, condition := range changedConditions {
203+
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
204+
condition.Type, condition.Reason, condition.Status == types.True)
205+
if err != nil {
206+
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
207+
condition.Type, condition.Reason, err)
208+
}
209+
}
210+
}
211+
211212
return &types.Status{
212213
Source: l.config.Source,
213214
// TODO(random-liu): Aggregate events and conditions and then do periodically report.

0 commit comments

Comments
 (0)