@@ -17,23 +17,16 @@ limitations under the License.
17
17
package healthchecker
18
18
19
19
import (
20
- "context"
21
- "errors"
22
- "net/http"
23
- "os/exec"
24
- "strconv"
25
- "strings"
26
20
"time"
27
21
28
22
"github.com/golang/glog"
29
-
30
23
"k8s.io/node-problem-detector/cmd/healthchecker/options"
31
24
"k8s.io/node-problem-detector/pkg/healthchecker/types"
32
25
)
33
26
34
27
type healthChecker struct {
35
28
component string
36
- systemdService string
29
+ service string
37
30
enableRepair bool
38
31
healthCheckFunc func () (bool , error )
39
32
// The repair is "best-effort" and ignores the error from the underlying actions.
@@ -54,96 +47,23 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
54
47
crictlPath : hco .CriCtlPath ,
55
48
healthCheckTimeout : hco .HealthCheckTimeout ,
56
49
coolDownTime : hco .CoolDownTime ,
57
- systemdService : hco .SystemdService ,
50
+ service : hco .Service ,
58
51
logPatternsToCheck : hco .LogPatterns .GetLogPatternCountMap (),
59
52
}
60
53
hc .healthCheckFunc = getHealthCheckFunc (hco )
61
54
hc .repairFunc = getRepairFunc (hco )
62
- hc .uptimeFunc = getUptimeFunc (hco .SystemdService )
55
+ hc .uptimeFunc = getUptimeFunc (hco .Service )
63
56
return hc , nil
64
57
}
65
58
66
- // getUptimeFunc returns the time for which the given service has been running.
67
- func getUptimeFunc (service string ) func () (time.Duration , error ) {
68
- return func () (time.Duration , error ) {
69
- // Using InactiveExitTimestamp to capture the exact time when systemd tried starting the service. The service will
70
- // transition from inactive -> activating and the timestamp is captured.
71
- // Source : https://www.freedesktop.org/wiki/Software/systemd/dbus/
72
- // Using ActiveEnterTimestamp resulted in race condition where the service was repeatedly killed by plugin when
73
- // RestartSec of systemd and invoke interval of plugin got in sync. The service was repeatedly killed in
74
- // activating state and hence ActiveEnterTimestamp was never updated.
75
- out , err := execCommand (types .CmdTimeout , "systemctl" , "show" , service , "--property=InactiveExitTimestamp" )
76
- if err != nil {
77
- return time .Duration (0 ), err
78
- }
79
- val := strings .Split (out , "=" )
80
- if len (val ) < 2 {
81
- return time .Duration (0 ), errors .New ("could not parse the service uptime time correctly" )
82
- }
83
- t , err := time .Parse (types .UptimeTimeLayout , val [1 ])
84
- if err != nil {
85
- return time .Duration (0 ), err
86
- }
87
- return time .Since (t ), nil
88
- }
89
- }
90
-
91
- // getRepairFunc returns the repair function based on the component.
92
- func getRepairFunc (hco * options.HealthCheckerOptions ) func () {
93
- switch hco .Component {
94
- case types .DockerComponent :
95
- // Use "docker ps" for docker health check. Not using crictl for docker to remove
96
- // dependency on the kubelet.
97
- return func () {
98
- execCommand (types .CmdTimeout , "pkill" , "-SIGUSR1" , "dockerd" )
99
- execCommand (types .CmdTimeout , "systemctl" , "kill" , "--kill-who=main" , hco .SystemdService )
100
- }
101
- default :
102
- // Just kill the service for all other components
103
- return func () {
104
- execCommand (types .CmdTimeout , "systemctl" , "kill" , "--kill-who=main" , hco .SystemdService )
105
- }
106
- }
107
- }
108
-
109
- // getHealthCheckFunc returns the health check function based on the component.
110
- func getHealthCheckFunc (hco * options.HealthCheckerOptions ) func () (bool , error ) {
111
- switch hco .Component {
112
- case types .KubeletComponent :
113
- return func () (bool , error ) {
114
- httpClient := http.Client {Timeout : hco .HealthCheckTimeout }
115
- response , err := httpClient .Get (types .KubeletHealthCheckEndpoint )
116
- if err != nil || response .StatusCode != http .StatusOK {
117
- return false , nil
118
- }
119
- return true , nil
120
- }
121
- case types .DockerComponent :
122
- return func () (bool , error ) {
123
- if _ , err := execCommand (hco .HealthCheckTimeout , "docker" , "ps" ); err != nil {
124
- return false , nil
125
- }
126
- return true , nil
127
- }
128
- case types .CRIComponent :
129
- return func () (bool , error ) {
130
- if _ , err := execCommand (hco .HealthCheckTimeout , hco .CriCtlPath , "--runtime-endpoint=" + hco .CriSocketPath , "--image-endpoint=" + hco .CriSocketPath , "pods" ); err != nil {
131
- return false , nil
132
- }
133
- return true , nil
134
- }
135
- }
136
- return nil
137
- }
138
-
139
59
// CheckHealth checks for the health of the component and tries to repair if enabled.
140
60
// Returns true if healthy, false otherwise.
141
61
func (hc * healthChecker ) CheckHealth () (bool , error ) {
142
62
healthy , err := hc .healthCheckFunc ()
143
63
if err != nil {
144
64
return healthy , err
145
65
}
146
- logPatternHealthy , err := logPatternHealthCheck (hc .systemdService , hc .logPatternsToCheck )
66
+ logPatternHealthy , err := logPatternHealthCheck (hc .service , hc .logPatternsToCheck )
147
67
if err != nil {
148
68
return logPatternHealthy , err
149
69
}
@@ -167,19 +87,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
167
87
return false , nil
168
88
}
169
89
170
- // execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
171
- func execCommand (timeout time.Duration , command string , args ... string ) (string , error ) {
172
- ctx , cancel := context .WithTimeout (context .Background (), timeout )
173
- defer cancel ()
174
- cmd := exec .CommandContext (ctx , command , args ... )
175
- out , err := cmd .Output ()
176
- if err != nil {
177
- glog .Infof ("command %v failed: %v, %v\n " , cmd , err , out )
178
- return "" , err
179
- }
180
- return strings .TrimSuffix (string (out ), "\n " ), nil
181
- }
182
-
183
90
// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
184
91
// Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
185
92
func logPatternHealthCheck (service string , logPatternsToCheck map [string ]int ) (bool , error ) {
@@ -203,27 +110,3 @@ func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (b
203
110
}
204
111
return true , nil
205
112
}
206
-
207
- // checkForPattern returns (true, nil) if logPattern occurs atleast logCountThreshold number of times since last
208
- // service restart. (false, nil) otherwise.
209
- func checkForPattern (service , logStartTime , logPattern string , logCountThreshold int ) (bool , error ) {
210
- out , err := execCommand (types .CmdTimeout , "/bin/sh" , "-c" ,
211
- // Query service logs since the logStartTime
212
- `journalctl --unit "` + service + `" --since "` + logStartTime +
213
- // Grep the pattern
214
- `" | grep -i "` + logPattern +
215
- // Get the count of occurrences
216
- `" | wc -l` )
217
- if err != nil {
218
- return true , err
219
- }
220
- occurrences , err := strconv .Atoi (out )
221
- if err != nil {
222
- return true , err
223
- }
224
- if occurrences >= logCountThreshold {
225
- glog .Infof ("%s failed log pattern check, %s occurrences: %v" , service , logPattern , occurrences )
226
- return false , nil
227
- }
228
- return true , nil
229
- }
0 commit comments