Skip to content

Commit 89a8128

Browse files
authored
Fix OOM status reports for Task/Batch APIs (#1807)
1 parent bed4556 commit 89a8128

File tree

5 files changed

+83
-74
lines changed

5 files changed

+83
-74
lines changed

pkg/lib/k8s/pod.go

+34-46
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ import (
2020
"bytes"
2121
"context"
2222
"regexp"
23-
"strings"
2423
"time"
2524

2625
"github.com/cortexlabs/cortex/pkg/lib/errors"
@@ -38,7 +37,12 @@ var _podTypeMeta = kmeta.TypeMeta{
3837
Kind: "Pod",
3938
}
4039

41-
const ReasonEvicted = "Evicted"
40+
// pod termination reasons
41+
// https://github.com/kubernetes/kube-state-metrics/blob/master/docs/pod-metrics.md
42+
const (
43+
ReasonEvicted = "Evicted"
44+
ReasonOOMKilled = "OOMKilled"
45+
)
4246

4347
type PodStatus string
4448

@@ -148,22 +152,14 @@ func WasPodOOMKilled(pod *kcore.Pod) bool {
148152
return true
149153
}
150154
for _, containerStatus := range pod.Status.ContainerStatuses {
155+
var reason string
151156
if containerStatus.LastTerminationState.Terminated != nil {
152-
exitCode := containerStatus.LastTerminationState.Terminated.ExitCode
153-
reason := strings.ToLower(containerStatus.LastTerminationState.Terminated.Reason)
154-
if _killStatuses[exitCode] {
155-
if strings.Contains(reason, "oom") {
156-
return true
157-
}
158-
}
157+
reason = containerStatus.LastTerminationState.Terminated.Reason
159158
} else if containerStatus.State.Terminated != nil {
160-
exitCode := containerStatus.State.Terminated.ExitCode
161-
reason := strings.ToLower(containerStatus.State.Terminated.Reason)
162-
if _killStatuses[exitCode] {
163-
if strings.Contains(reason, "oom") {
164-
return true
165-
}
166-
}
159+
reason = containerStatus.State.Terminated.Reason
160+
}
161+
if reason == ReasonOOMKilled {
162+
return true
167163
}
168164
}
169165

@@ -194,25 +190,21 @@ func GetPodStatus(pod *kcore.Pod) PodStatus {
194190
}
195191

196192
for _, containerStatus := range pod.Status.ContainerStatuses {
193+
var reason string
194+
var exitCode int32
197195
if containerStatus.LastTerminationState.Terminated != nil {
198-
exitCode := containerStatus.LastTerminationState.Terminated.ExitCode
199-
reason := strings.ToLower(containerStatus.LastTerminationState.Terminated.Reason)
200-
if _killStatuses[exitCode] {
201-
if strings.Contains(reason, "oom") {
202-
return PodStatusKilledOOM
203-
}
204-
return PodStatusKilled
205-
}
196+
reason = containerStatus.LastTerminationState.Terminated.Reason
197+
exitCode = containerStatus.LastTerminationState.Terminated.ExitCode
206198
} else if containerStatus.State.Terminated != nil {
207-
exitCode := containerStatus.State.Terminated.ExitCode
208-
reason := strings.ToLower(containerStatus.State.Terminated.Reason)
209-
if _killStatuses[exitCode] {
210-
if strings.Contains(reason, "oom") {
211-
return PodStatusKilledOOM
212-
}
213-
return PodStatusKilled
214-
}
199+
reason = containerStatus.State.Terminated.Reason
200+
exitCode = containerStatus.State.Terminated.ExitCode
215201
}
202+
if reason == ReasonOOMKilled {
203+
return PodStatusKilledOOM
204+
} else if _killStatuses[exitCode] {
205+
return PodStatusKilled
206+
}
207+
216208
}
217209
return PodStatusFailed
218210
case kcore.PodRunning:
@@ -245,29 +237,25 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P
245237
numRunning++
246238
} else if containerStatus.State.Terminated != nil {
247239
exitCode := containerStatus.State.Terminated.ExitCode
248-
reason := strings.ToLower(containerStatus.State.Terminated.Reason)
249-
if exitCode == 0 {
240+
reason := containerStatus.State.Terminated.Reason
241+
if reason == ReasonOOMKilled {
242+
numKilledOOM++
243+
} else if exitCode == 0 {
250244
numSucceeded++
251245
} else if _killStatuses[exitCode] {
252-
if strings.Contains(reason, "oom") {
253-
numKilledOOM++
254-
} else {
255-
numKilled++
256-
}
246+
numKilled++
257247
} else {
258248
numFailed++
259249
}
260250
} else if containerStatus.LastTerminationState.Terminated != nil {
261251
exitCode := containerStatus.LastTerminationState.Terminated.ExitCode
262-
reason := strings.ToLower(containerStatus.LastTerminationState.Terminated.Reason)
263-
if exitCode == 0 {
252+
reason := containerStatus.LastTerminationState.Terminated.Reason
253+
if reason == ReasonOOMKilled {
254+
numKilledOOM++
255+
} else if exitCode == 0 {
264256
numSucceeded++
265257
} else if _killStatuses[exitCode] {
266-
if strings.Contains(reason, "oom") {
267-
numKilledOOM++
268-
} else {
269-
numKilled++
270-
}
258+
numKilled++
271259
} else {
272260
numFailed++
273261
}

pkg/operator/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func main() {
4747

4848
telemetry.Event("operator.init", map[string]interface{}{"provider": config.Provider})
4949

50-
cron.Run(operator.DeleteEvictedPods, operator.ErrorHandler("delete evicted pods"), 12*time.Hour)
50+
cron.Run(operator.DeleteEvictedPods, operator.ErrorHandler("delete evicted pods"), time.Hour)
5151

5252
switch config.Provider {
5353
case types.AWSProviderType:

pkg/operator/operator/cron.go

+10-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/cortexlabs/cortex/pkg/lib/aws"
2323
"github.com/cortexlabs/cortex/pkg/lib/errors"
2424
"github.com/cortexlabs/cortex/pkg/lib/k8s"
25+
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
2526
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
2627
"github.com/cortexlabs/cortex/pkg/operator/config"
2728
"github.com/cortexlabs/cortex/pkg/operator/lib/logging"
@@ -30,6 +31,7 @@ import (
3031
)
3132

3233
var operatorLogger = logging.GetOperatorLogger()
34+
var previousListOfEvictedPods = strset.New()
3335

3436
func DeleteEvictedPods() error {
3537
failedPods, err := config.K8s.ListPods(&kmeta.ListOptions{
@@ -40,14 +42,21 @@ func DeleteEvictedPods() error {
4042
}
4143

4244
var errs []error
45+
currentEvictedPods := strset.New()
4346
for _, pod := range failedPods {
44-
if pod.Status.Reason == k8s.ReasonEvicted {
47+
if pod.Status.Reason != k8s.ReasonEvicted {
48+
continue
49+
}
50+
if previousListOfEvictedPods.Has(pod.Name) {
4551
_, err := config.K8s.DeletePod(pod.Name)
4652
if err != nil {
4753
errs = append(errs, err)
4854
}
55+
continue
4956
}
57+
currentEvictedPods.Add(pod.Name)
5058
}
59+
previousListOfEvictedPods = currentEvictedPods
5160

5261
if errors.HasError(errs) {
5362
return errors.FirstError(errs...)

pkg/operator/resources/job/batchapi/cron.go

+22-14
Original file line numberDiff line numberDiff line change
@@ -284,8 +284,9 @@ func reconcileInProgressJob(jobState *job.State, queueURL *string, k8sJob *kbatc
284284
}
285285

286286
func checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job) error {
287-
if int(k8sJob.Status.Failed) > 0 {
288-
return investigateJobFailure(jobKey)
287+
jobFailed, err := checkForJobFailure(jobKey, k8sJob)
288+
if err != nil || jobFailed {
289+
return err
289290
}
290291

291292
queueMessages, err := getQueueMetricsFromURL(queueURL)
@@ -349,19 +350,18 @@ func checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job
349350
return nil
350351
}
351352

352-
func investigateJobFailure(jobKey spec.JobKey) error {
353-
reasonFound := false
354-
353+
func checkForJobFailure(jobKey spec.JobKey, k8sJob *kbatch.Job) (bool, error) {
355354
jobLogger, err := operator.GetJobLogger(jobKey)
356355
if err != nil {
357-
return err
356+
return false, err
358357
}
359358

359+
reasonFound := false
360360
pods, _ := config.K8s.ListPodsByLabel("jobID", jobKey.ID)
361361
for _, pod := range pods {
362362
if k8s.WasPodOOMKilled(&pod) {
363363
jobLogger.Error("at least one worker was killed because it ran out of out of memory")
364-
return errors.FirstError(
364+
return true, errors.FirstError(
365365
job.SetWorkerOOMStatus(jobKey),
366366
deleteJobRuntimeResources(jobKey),
367367
)
@@ -382,13 +382,21 @@ func investigateJobFailure(jobKey spec.JobKey) error {
382382
}
383383
}
384384

385-
if !reasonFound {
386-
jobLogger.Error("workers were killed for unknown reason")
385+
if int(k8sJob.Status.Failed) > 0 {
386+
if !reasonFound {
387+
jobLogger.Error("workers were killed for unknown reason")
388+
}
389+
return true, errors.FirstError(
390+
job.SetWorkerErrorStatus(jobKey),
391+
deleteJobRuntimeResources(jobKey),
392+
)
393+
} else if int(k8sJob.Status.Succeeded) == 1 && len(pods) == 0 {
394+
// really unexpected situation which doesn't hurt if we check
395+
return true, errors.FirstError(
396+
job.SetUnexpectedErrorStatus(jobKey),
397+
deleteJobRuntimeResources(jobKey),
398+
)
387399
}
388400

389-
return errors.FirstError(
390-
err,
391-
job.SetWorkerErrorStatus(jobKey),
392-
deleteJobRuntimeResources(jobKey),
393-
)
401+
return false, nil
394402
}

pkg/operator/resources/job/taskapi/cron.go

+16-12
Original file line numberDiff line numberDiff line change
@@ -202,27 +202,31 @@ func reconcileInProgressJob(jobState *job.State, k8sJob *kbatch.Job) (status.Job
202202
}
203203

204204
func checkIfJobCompleted(jobKey spec.JobKey, k8sJob *kbatch.Job) error {
205-
if int(k8sJob.Status.Failed) == 1 {
206-
pods, _ := config.K8s.ListPodsByLabel("jobID", jobKey.ID)
207-
for _, pod := range pods {
208-
if k8s.WasPodOOMKilled(&pod) {
209-
return errors.FirstError(
210-
job.SetWorkerOOMStatus(jobKey),
211-
deleteJobRuntimeResources(jobKey),
212-
)
213-
}
205+
pods, _ := config.K8s.ListPodsByLabel("jobID", jobKey.ID)
206+
for _, pod := range pods {
207+
if k8s.WasPodOOMKilled(&pod) {
208+
return errors.FirstError(
209+
job.SetWorkerOOMStatus(jobKey),
210+
deleteJobRuntimeResources(jobKey),
211+
)
214212
}
213+
}
214+
if int(k8sJob.Status.Failed) == 1 {
215215
return errors.FirstError(
216216
job.SetWorkerErrorStatus(jobKey),
217217
deleteJobRuntimeResources(jobKey),
218218
)
219-
}
220-
221-
if int(k8sJob.Status.Succeeded) == 1 {
219+
} else if int(k8sJob.Status.Succeeded) == 1 && len(pods) > 0 {
222220
return errors.FirstError(
223221
job.SetSucceededStatus(jobKey),
224222
deleteJobRuntimeResources(jobKey),
225223
)
224+
} else if int(k8sJob.Status.Succeeded) == 1 && len(pods) == 0 {
225+
// really unexpected situation which doesn't hurt if we check
226+
return errors.FirstError(
227+
job.SetUnexpectedErrorStatus(jobKey),
228+
deleteJobRuntimeResources(jobKey),
229+
)
226230
}
227231

228232
return nil

0 commit comments

Comments
 (0)