Skip to content

Commit 10cf230

Browse files
jason1028krjasonjung
authored and
jasonjung
committed
test commit
1 parent 4412a2b commit 10cf230

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

81 files changed

+275
-7817
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ junit*.xml
77
debug.test
88
/output/
99
coverage.out
10+
.idea

Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,8 @@ COPY ./bin/node-problem-detector /node-problem-detector
2727
ARG LOGCOUNTER
2828
COPY ./bin/health-checker ${LOGCOUNTER} /home/kubernetes/bin/
2929

30+
COPY ./config/test_condition_fail.sh /home/kubernetes/scripts/
31+
COPY ./config/test_condition_succeed.sh /home/kubernetes/scripts/
32+
3033
COPY config /config
3134
ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json"]

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else e
3232
TAG?=$(VERSION)
3333

3434
# REGISTRY is the container registry to push into.
35-
REGISTRY?=gcr.io/k8s-staging-npd
35+
#REGISTRY?=gcr.io/k8s-staging-npd
36+
REGISTRY?=jasonjungacr.azurecr.io/k8s-staging-npd
3637

3738
# UPLOAD_PATH is the cloud storage path to upload release tar.
3839
UPLOAD_PATH?=gs://kubernetes-release
@@ -62,7 +63,7 @@ IMAGE:=$(REGISTRY)/node-problem-detector:$(TAG)
6263

6364
# ENABLE_JOURNALD enables build journald support or not. Building journald
6465
# support needs libsystemd-dev or libsystemd-journal-dev.
65-
ENABLE_JOURNALD?=1
66+
ENABLE_JOURNALD?=0
6667

6768
ifeq ($(go env GOHOSTOS), darwin)
6869
ENABLE_JOURNALD=0
@@ -260,7 +261,6 @@ build-in-docker: clean docker-builder
260261
-c 'cd /gopath/src/k8s.io/node-problem-detector/ && make build-binaries'
261262

262263
push-container: build-container
263-
gcloud auth configure-docker
264264
docker push $(IMAGE)
265265

266266
push-tar: build-tar
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "60s",
5+
"timeout": "60s"
6+
},
7+
"source": "test-custom-plugin-monitor",
8+
"metricsReporting": true,
9+
"conditions": [
10+
{
11+
"type": "TestConditionHasProblem",
12+
"reason": "TestIsOK",
13+
"message": "test condition is OK"
14+
}
15+
],
16+
"rules": [
17+
{
18+
"type": "permanent",
19+
"condition": "TestConditionHasProblem",
20+
"reason": "TestIsIntentionallyFailing",
21+
"path": "./config/plugin/test/test_condition_fail.sh",
22+
"timeout": "60s"
23+
},
24+
{
25+
"type": "permanent",
26+
"condition": "TestConditionHasProblem",
27+
"reason": "TestIsIntentionallySucceeding",
28+
"path": "./config/plugin/test/test_condition_succeed.sh",
29+
"timeout": "60s"
30+
}
31+
]
32+
}

config/test_condition_fail.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/sh
2+
exit 1

config/test_condition_succeed.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/sh
2+
exit 0

deployment/node-problem-detector-config.yaml

Lines changed: 49 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -64,25 +64,58 @@ data:
6464
}
6565
]
6666
}
67-
docker-monitor.json: |
67+
cpm-test.json: |
6868
{
69-
"plugin": "journald",
70-
"pluginConfig": {
71-
"source": "dockerd"
69+
"plugin": "custom",
70+
"pluginConfig": {
71+
"invoke_interval": "60s",
72+
"timeout": "60s"
73+
},
74+
"source": "test-custom-plugin-monitor",
75+
"metricsReporting": true,
76+
"conditions": [
77+
{
78+
"type": "TestConditionHasProblem",
79+
"reason": "TestIsOK",
80+
"message": "test condition is OK"
81+
}
82+
],
83+
"rules": [
84+
{
85+
"type": "permanent",
86+
"condition": "TestConditionHasProblem",
87+
"reason": "TestIsIntentionallyFailing",
88+
"path": "/home/kubernetes/scripts//test_condition_fail.sh",
89+
"timeout": "60s"
7290
},
73-
"logPath": "/var/log/journal",
74-
"lookback": "5m",
75-
"bufferSize": 10,
76-
"source": "docker-monitor",
77-
"conditions": [],
78-
"rules": [
79-
{
80-
"type": "temporary",
81-
"reason": "CorruptDockerImage",
82-
"pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
83-
}
84-
]
91+
{
92+
"type": "permanent",
93+
"condition": "TestConditionHasProblem",
94+
"reason": "TestIsIntentionallySucceeding",
95+
"path": "/home/kubernetes/scripts//test_condition_succeed.sh",
96+
"timeout": "60s"
97+
}
98+
]
8599
}
100+
# docker-monitor.json: |
101+
# {
102+
# "plugin": "journald",
103+
# "pluginConfig": {
104+
# "source": "dockerd"
105+
# },
106+
# "logPath": "/var/log/journal",
107+
# "lookback": "5m",
108+
# "bufferSize": 10,
109+
# "source": "docker-monitor",
110+
# "conditions": [],
111+
# "rules": [
112+
# {
113+
# "type": "temporary",
114+
# "reason": "CorruptDockerImage",
115+
# "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
116+
# }
117+
# ]
118+
# }
86119
kind: ConfigMap
87120
metadata:
88121
name: node-problem-detector-config

deployment/node-problem-detector.yaml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ spec:
2828
command:
2929
- /node-problem-detector
3030
- --logtostderr
31-
- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
32-
image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7
31+
#- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
32+
- --config.custom-plugin-monitor=/config/cpm-test.json
33+
image: jasonjungacr.azurecr.io/k8s-staging-npd/node-problem-detector:v0.8.10-10-g4412a2b-dirty
3334
resources:
3435
limits:
3536
cpu: 10m
@@ -77,8 +78,10 @@ spec:
7778
items:
7879
- key: kernel-monitor.json
7980
path: kernel-monitor.json
80-
- key: docker-monitor.json
81-
path: docker-monitor.json
81+
# - key: docker-monitor.json
82+
# path: docker-monitor.json
83+
- key: cpm-test.json
84+
path: cpm-test.json
8285
tolerations:
8386
- effect: NoSchedule
8487
operator: Exists

pkg/custompluginmonitor/custom_plugin_monitor.go

Lines changed: 172 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,17 +126,35 @@ func (c *customPluginMonitor) monitorLoop() {
126126

127127
resultChan := c.plugin.GetResultChan()
128128

129+
// runRules done for the interval
130+
//intervalEndChan := c.plugin.GetIntervalEndChan()
131+
var intervalResults []cpmtypes.Result
132+
129133
for {
130134
select {
131135
case result, ok := <-resultChan:
132136
if !ok {
133137
glog.Errorf("Result channel closed: %s", c.configPath)
134138
return
135139
}
140+
136141
glog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
137-
status := c.generateStatus(result)
138-
glog.V(3).Infof("New status generated: %+v", status)
139-
c.statusChan <- status
142+
// gather results for single rule interval loop
143+
intervalResults = append(intervalResults)
144+
145+
//glog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
146+
//status := c.generateStatus(result)
147+
//glog.V(3).Infof("New status generated: %+v", status)
148+
//c.statusChan <- status
149+
150+
case _, ok := <-intervalEndChan:
151+
if !ok {
152+
glog.Errorf("Interval End Channel closed: %s", c.configPath)
153+
return
154+
}
155+
//status := c.generateStatus(intervalResults)
156+
//glog.V(3).Infof("New status generated: %+v", status)
157+
//c.statusChan <- status
140158
case <-c.tomb.Stopping():
141159
c.plugin.Stop()
142160
glog.Infof("Custom plugin monitor stopped: %s", c.configPath)
@@ -146,8 +164,158 @@ func (c *customPluginMonitor) monitorLoop() {
146164
}
147165
}
148166

167+
func (c *customPluginMonitor) generateStatus(results []cpmtypes.Result) *types.Status {
168+
timestamp := time.Now()
169+
var activeProblemEvents []types.Event
170+
var inactiveProblemEvents []types.Event
171+
172+
var unProcessedResults []cpmtypes.Result
173+
174+
for _, result := range results {
175+
status := toConditionStatus(result.ExitStatus)
176+
if result.Rule.Type == types.Temp {
177+
// For temporary error only generate event when exit status is above warning
178+
if result.ExitStatus >= cpmtypes.NonOK {
179+
activeProblemEvents = append(activeProblemEvents, types.Event{
180+
Severity: types.Warn,
181+
Timestamp: timestamp,
182+
Reason: result.Rule.Reason,
183+
Message: result.Message,
184+
})
185+
}
186+
} else {
187+
// we skip result that sets condition true, and result that sets condition false/unknown but with a different reason
188+
if status == types.True {
189+
unProcessedResults = append(unProcessedResults, result)
190+
continue
191+
}
192+
193+
for i := range c.conditions {
194+
condition := &c.conditions[i]
195+
196+
// if appropriate (current condition's reason changes to false/unknown), unset(set to false/unknown) condition first.
197+
// In case there are multiple reasons per condition, this will prevent ignoring new reason that sets
198+
// condition true (since original condition reason takes precedence) or flapping (condition set to false
199+
// by current reason, then to true by another reason)
200+
if condition.Type == result.Rule.Condition && condition.Reason == result.Rule.Reason {
201+
// The condition reason specified in the rule and the result message
202+
// represent the problem happened. We need to know the default condition
203+
// from the config, so that we can set the new condition reason/message
204+
// back when such problem goes away.
205+
var defaultConditionReason string
206+
var defaultConditionMessage string
207+
for j := range c.config.DefaultConditions {
208+
defaultCondition := &c.config.DefaultConditions[j]
209+
if defaultCondition.Type == result.Rule.Condition {
210+
defaultConditionReason = defaultCondition.Reason
211+
defaultConditionMessage = defaultCondition.Message
212+
break
213+
}
214+
}
215+
216+
var newReason string
217+
var newMessage string
218+
newReason = defaultConditionReason
219+
if status == types.False {
220+
newMessage = defaultConditionMessage
221+
} else {
222+
// When status unknown, the result's message is important for debug
223+
newMessage = result.Message
224+
}
225+
226+
condition.Transition = timestamp
227+
condition.Status = status
228+
condition.Reason = newReason
229+
condition.Message = newMessage
230+
231+
break
232+
}
233+
}
234+
}
235+
}
236+
237+
for _, result := range unProcessedResults {
238+
status := toConditionStatus(result.ExitStatus)
239+
// we iterate through results that sets condition true for different reasons
240+
// whatever result that went through result channel first takes precedence
241+
for i := range c.conditions {
242+
condition := &c.conditions[i]
243+
if condition.Type == result.Rule.Condition {
244+
if condition.Status != types.True ||
245+
(condition.Reason == result.Rule.Reason && *c.config.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate) {
246+
// update condition only when condition is currently false, or message based condition update is enabled.
247+
// for each condition, this if-block will be reached once
248+
condition.Transition = timestamp
249+
condition.Status = status
250+
condition.Reason = result.Rule.Reason
251+
condition.Message = result.Message
252+
253+
updateEvent := util.GenerateConditionChangeEvent(
254+
condition.Type,
255+
status,
256+
result.Rule.Reason,
257+
timestamp,
258+
)
259+
260+
activeProblemEvents = append(activeProblemEvents, updateEvent)
261+
}
262+
263+
break
264+
}
265+
}
266+
}
267+
268+
for i := range c.conditions {
269+
// check for conditions that are still false/unknown
270+
condition := &c.conditions[i]
271+
if condition.Status != types.True {
272+
updateEvent := util.GenerateConditionChangeEvent(
273+
condition.Type,
274+
condition.Status,
275+
condition.Reason,
276+
timestamp,
277+
)
278+
279+
inactiveProblemEvents = append(inactiveProblemEvents, updateEvent)
280+
}
281+
}
282+
283+
if *c.config.EnableMetricsReporting {
284+
// Increment problem counter only for active problems which just got detected.
285+
for _, event := range activeProblemEvents {
286+
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(
287+
event.Reason, 1)
288+
if err != nil {
289+
glog.Errorf("Failed to update problem counter metrics for %q: %v",
290+
event.Reason, err)
291+
}
292+
}
293+
for _, condition := range c.conditions {
294+
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
295+
condition.Type, condition.Reason, condition.Status == types.True)
296+
if err != nil {
297+
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
298+
condition.Type, condition.Reason, err)
299+
}
300+
}
301+
}
302+
status := &types.Status{
303+
Source: c.config.Source,
304+
// TODO(random-liu): Aggregate events and conditions and then do periodically report.
305+
Events: append(activeProblemEvents, inactiveProblemEvents...),
306+
Conditions: c.conditions,
307+
}
308+
// Log only if condition has changed
309+
if len(activeProblemEvents) != 0 || len(inactiveProblemEvents) != 0 {
310+
glog.V(0).Infof("New status generated: %+v", status)
311+
}
312+
return status
313+
}
314+
315+
/*
149316
// generateStatus generates status from the plugin check result.
150317
func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Status {
318+
151319
timestamp := time.Now()
152320
var activeProblemEvents []types.Event
153321
var inactiveProblemEvents []types.Event
@@ -277,6 +445,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
277445
}
278446
return status
279447
}
448+
*/
280449

281450
func toConditionStatus(s cpmtypes.Status) types.ConditionStatus {
282451
switch s {

pkg/custompluginmonitor/custom_plugin_monitor_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,7 @@ func TestRegistration(t *testing.T) {
2929
func() { problemdaemon.GetProblemDaemonHandlerOrDie("custom-plugin-monitor") },
3030
"Custom plugin monitor failed to register itself as a problem daemon.")
3131
}
32+
33+
// TestGenerateStatus is a placeholder for generateStatus coverage. It is
// skipped explicitly so the missing coverage is visible in test output
// instead of being reported as a passing test that asserts nothing.
func TestGenerateStatus(t *testing.T) {
	t.Skip("TODO: add test cases for generateStatus")
}

0 commit comments

Comments
 (0)