Skip to content

Commit bcba3af

Browse files
committed
separate linux/windows health checker files.
1 parent 857754c commit bcba3af

13 files changed

+462
-140
lines changed

cmd/healthchecker/health_checker.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ func main() {
5555
os.Exit(int(types.Unknown))
5656
}
5757
if !healthy {
58-
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
58+
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.Service, hco.EnableRepair)
5959
os.Exit(int(types.NonOK))
6060
}
6161
os.Exit(int(types.OK))

cmd/healthchecker/options/options.go

+15-9
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package options
1919
import (
2020
"flag"
2121
"fmt"
22+
"runtime"
2223
"time"
2324

2425
"github.com/spf13/pflag"
@@ -34,7 +35,7 @@ func NewHealthCheckerOptions() *HealthCheckerOptions {
3435
// HealthCheckerOptions are the options used to configure the health checker.
3536
type HealthCheckerOptions struct {
3637
Component string
37-
SystemdService string
38+
Service string
3839
EnableRepair bool
3940
CriCtlPath string
4041
CriSocketPath string
@@ -47,8 +48,13 @@ type HealthCheckerOptions struct {
4748
func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
4849
fs.StringVar(&hco.Component, "component", types.KubeletComponent,
4950
"The component to check health for. Supports kubelet, docker and cri")
50-
fs.StringVar(&hco.SystemdService, "systemd-service", "",
51-
"The underlying systemd service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
51+
// Deprecated: "systemd-service" is retained only for backward compatibility on Linux; use "service" instead going forward.
52+
if runtime.GOOS == "linux" {
53+
fs.StringVar(&hco.Service, "systemd-service", "",
54+
"The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
55+
}
56+
fs.StringVar(&hco.Service, "service", "",
57+
"The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
5258
fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component.")
5359
fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl,
5460
"The path to the crictl binary. This is used to check health of cri component.")
@@ -69,9 +75,9 @@ func (hco *HealthCheckerOptions) IsValid() error {
6975
if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent && hco.Component != types.CRIComponent {
7076
return fmt.Errorf("the component specified is not supported. Supported components are : <kubelet/docker/cri>")
7177
}
72-
// Make sure the systemd service is specified if repair is enabled.
73-
if hco.EnableRepair && hco.SystemdService == "" {
74-
return fmt.Errorf("systemd-service cannot be empty when repair is enabled")
78+
// Make sure the service is specified if repair is enabled.
79+
if hco.EnableRepair && hco.Service == "" {
80+
return fmt.Errorf("service cannot be empty when repair is enabled")
7581
}
7682
// Skip checking further if the component is not cri.
7783
if hco.Component != types.CRIComponent {
@@ -90,14 +96,14 @@ func (hco *HealthCheckerOptions) IsValid() error {
9096

9197
// SetDefaults sets the default values for the dependent flags.
9298
func (hco *HealthCheckerOptions) SetDefaults() {
93-
if hco.SystemdService != "" {
99+
if hco.Service != "" {
94100
return
95101
}
96102
if hco.Component != types.CRIComponent {
97-
hco.SystemdService = hco.Component
103+
hco.Service = hco.Component
98104
return
99105
}
100-
hco.SystemdService = types.ContainerdService
106+
hco.Service = types.ContainerdService
101107
}
102108

103109
func init() {

cmd/healthchecker/options/options_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@ func TestIsValid(t *testing.T) {
5656
{
5757
name: "empty systemd-service and repair enabled",
5858
hco: HealthCheckerOptions{
59-
Component: types.KubeletComponent,
60-
EnableRepair: true,
61-
SystemdService: "",
59+
Component: types.KubeletComponent,
60+
EnableRepair: true,
61+
Service: "",
6262
},
6363
expectError: true,
6464
},
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "10s",
5+
"timeout": "3m",
6+
"max_output_length": 80,
7+
"concurrency": 1
8+
},
9+
"source": "health-checker",
10+
"metricsReporting": true,
11+
"conditions": [
12+
{
13+
"type": "ContainerRuntimeUnhealthy",
14+
"reason": "ContainerRuntimeIsHealthy",
15+
"message": "Container runtime on the node is functioning properly"
16+
}
17+
],
18+
"rules": [
19+
{
20+
"type": "permanent",
21+
"condition": "ContainerRuntimeUnhealthy",
22+
"reason": "ContainerdUnhealthy",
23+
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
24+
"args": [
25+
"--component=cri",
26+
"--enable-repair=true",
27+
"--cooldown-time=2m",
28+
"--health-check-timeout=60s"
29+
],
30+
"timeout": "3m"
31+
}
32+
]
33+
}
34+
+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "10s",
5+
"timeout": "3m",
6+
"max_output_length": 80,
7+
"concurrency": 1
8+
},
9+
"source": "health-checker",
10+
"metricsReporting": true,
11+
"conditions": [
12+
{
13+
"type": "ContainerRuntimeUnhealthy",
14+
"reason": "ContainerRuntimeIsHealthy",
15+
"message": "Container runtime on the node is functioning properly"
16+
}
17+
],
18+
"rules": [
19+
{
20+
"type": "permanent",
21+
"condition": "ContainerRuntimeUnhealthy",
22+
"reason": "DockerUnhealthy",
23+
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
24+
"args": [
25+
"--component=docker",
26+
"--enable-repair=true",
27+
"--cooldown-time=2m",
28+
"--health-check-timeout=60s"
29+
],
30+
"timeout": "3m"
31+
}
32+
]
33+
}
34+
+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "10s",
5+
"timeout": "3m",
6+
"max_output_length": 80,
7+
"concurrency": 1
8+
},
9+
"source": "health-checker",
10+
"metricsReporting": true,
11+
"conditions": [
12+
{
13+
"type": "KubeletUnhealthy",
14+
"reason": "KubeletIsHealthy",
15+
"message": "kubelet on the node is functioning properly"
16+
}
17+
],
18+
"rules": [
19+
{
20+
"type": "permanent",
21+
"condition": "KubeletUnhealthy",
22+
"reason": "KubeletUnhealthy",
23+
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
24+
"args": [
25+
"--component=kubelet",
26+
"--enable-repair=true",
27+
"--cooldown-time=1m",
28+
"--health-check-timeout=10s"
29+
],
30+
"timeout": "3m"
31+
}
32+
]
33+
}
34+

pkg/healthchecker/health_checker.go

+4-121
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,16 @@ limitations under the License.
1717
package healthchecker
1818

1919
import (
20-
"context"
21-
"errors"
22-
"net/http"
23-
"os/exec"
24-
"strconv"
25-
"strings"
2620
"time"
2721

2822
"github.com/golang/glog"
29-
3023
"k8s.io/node-problem-detector/cmd/healthchecker/options"
3124
"k8s.io/node-problem-detector/pkg/healthchecker/types"
3225
)
3326

3427
type healthChecker struct {
3528
component string
36-
systemdService string
29+
service string
3730
enableRepair bool
3831
healthCheckFunc func() (bool, error)
3932
// The repair is "best-effort" and ignores the error from the underlying actions.
@@ -54,96 +47,23 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
5447
crictlPath: hco.CriCtlPath,
5548
healthCheckTimeout: hco.HealthCheckTimeout,
5649
coolDownTime: hco.CoolDownTime,
57-
systemdService: hco.SystemdService,
50+
service: hco.Service,
5851
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
5952
}
6053
hc.healthCheckFunc = getHealthCheckFunc(hco)
6154
hc.repairFunc = getRepairFunc(hco)
62-
hc.uptimeFunc = getUptimeFunc(hco.SystemdService)
55+
hc.uptimeFunc = getUptimeFunc(hco.Service)
6356
return hc, nil
6457
}
6558

66-
// getUptimeFunc returns the time for which the given service has been running.
67-
func getUptimeFunc(service string) func() (time.Duration, error) {
68-
return func() (time.Duration, error) {
69-
// Using InactiveExitTimestamp to capture the exact time when systemd tried starting the service. The service will
70-
// transition from inactive -> activating and the timestamp is captured.
71-
// Source : https://www.freedesktop.org/wiki/Software/systemd/dbus/
72-
// Using ActiveEnterTimestamp resulted in race condition where the service was repeatedly killed by plugin when
73-
// RestartSec of systemd and invoke interval of plugin got in sync. The service was repeatedly killed in
74-
// activating state and hence ActiveEnterTimestamp was never updated.
75-
out, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=InactiveExitTimestamp")
76-
if err != nil {
77-
return time.Duration(0), err
78-
}
79-
val := strings.Split(out, "=")
80-
if len(val) < 2 {
81-
return time.Duration(0), errors.New("could not parse the service uptime time correctly")
82-
}
83-
t, err := time.Parse(types.UptimeTimeLayout, val[1])
84-
if err != nil {
85-
return time.Duration(0), err
86-
}
87-
return time.Since(t), nil
88-
}
89-
}
90-
91-
// getRepairFunc returns the repair function based on the component.
92-
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
93-
switch hco.Component {
94-
case types.DockerComponent:
95-
// Use "docker ps" for docker health check. Not using crictl for docker to remove
96-
// dependency on the kubelet.
97-
return func() {
98-
execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd")
99-
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService)
100-
}
101-
default:
102-
// Just kill the service for all other components
103-
return func() {
104-
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService)
105-
}
106-
}
107-
}
108-
109-
// getHealthCheckFunc returns the health check function based on the component.
110-
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
111-
switch hco.Component {
112-
case types.KubeletComponent:
113-
return func() (bool, error) {
114-
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
115-
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
116-
if err != nil || response.StatusCode != http.StatusOK {
117-
return false, nil
118-
}
119-
return true, nil
120-
}
121-
case types.DockerComponent:
122-
return func() (bool, error) {
123-
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
124-
return false, nil
125-
}
126-
return true, nil
127-
}
128-
case types.CRIComponent:
129-
return func() (bool, error) {
130-
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
131-
return false, nil
132-
}
133-
return true, nil
134-
}
135-
}
136-
return nil
137-
}
138-
13959
// CheckHealth checks for the health of the component and tries to repair if enabled.
14060
// Returns true if healthy, false otherwise.
14161
func (hc *healthChecker) CheckHealth() (bool, error) {
14262
healthy, err := hc.healthCheckFunc()
14363
if err != nil {
14464
return healthy, err
14565
}
146-
logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, hc.logPatternsToCheck)
66+
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
14767
if err != nil {
14868
return logPatternHealthy, err
14969
}
@@ -167,19 +87,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
16787
return false, nil
16888
}
16989

170-
// execCommand runs the given command with the given timeout and returns its output with the trailing newline trimmed; it returns an error if the command fails or the timeout expires.
171-
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
172-
ctx, cancel := context.WithTimeout(context.Background(), timeout)
173-
defer cancel()
174-
cmd := exec.CommandContext(ctx, command, args...)
175-
out, err := cmd.Output()
176-
if err != nil {
177-
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
178-
return "", err
179-
}
180-
return strings.TrimSuffix(string(out), "\n"), nil
181-
}
182-
18390
// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
18491
// Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
18592
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
@@ -203,27 +110,3 @@ func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (b
203110
}
204111
return true, nil
205112
}
206-
207-
// checkForPattern returns (false, nil) if logPattern occurs at least logCountThreshold times in the service logs
208-
// since logStartTime, and (true, nil) otherwise. If the log query or count parsing fails it returns (true, err).
209-
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
210-
out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
211-
// Query service logs since the logStartTime
212-
`journalctl --unit "`+service+`" --since "`+logStartTime+
213-
// Grep the pattern
214-
`" | grep -i "`+logPattern+
215-
// Get the count of occurrences
216-
`" | wc -l`)
217-
if err != nil {
218-
return true, err
219-
}
220-
occurrences, err := strconv.Atoi(out)
221-
if err != nil {
222-
return true, err
223-
}
224-
if occurrences >= logCountThreshold {
225-
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
226-
return false, nil
227-
}
228-
return true, nil
229-
}

0 commit comments

Comments
 (0)