Skip to content

Commit 9821c22

Browse files
authored
Evict pods that consume too much memory (#426)
1 parent 95e295a commit 9821c22

File tree

3 files changed

+59
-0
lines changed

3 files changed

+59
-0
lines changed

manager/eks.yaml

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,3 +32,16 @@ nodeGroups:
3232
iam:
3333
withAddonPolicies:
3434
autoScaler: true
35+
kubeletExtraConfig:
36+
kubeReserved:
37+
cpu: 150m
38+
memory: 300Mi
39+
ephemeral-storage: 1Gi
40+
kubeReservedCgroup: /kube-reserved
41+
systemReserved:
42+
cpu: 150m
43+
memory: 300Mi
44+
ephemeral-storage: 1Gi
45+
evictionHard:
46+
memory.available: 200Mi
47+
nodefs.available: 5%

pkg/lib/k8s/pod.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package k8s
1818

1919
import (
20+
"regexp"
2021
"time"
2122

2223
kcore "k8s.io/api/core/v1"
@@ -32,6 +33,8 @@ var podTypeMeta = kmeta.TypeMeta{
3233
Kind: "Pod",
3334
}
3435

36+
// ReasonEvicted is the value compared against pod.Status.Reason to recognise
// pods that were evicted.
const ReasonEvicted = "Evicted"
37+
3538
type PodStatus string
3639

3740
const (
@@ -130,6 +133,8 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time {
130133
return nil
131134
}
132135

136+
// evictedMemoryMessageRegex matches, case-insensitively, the words
// "low on resource memory" with one or more non-word characters between each
// word. GetPodStatus applies it to pod.Status.Message of evicted pods to decide
// whether to report PodStatusKilledOOM.
var evictedMemoryMessageRegex = regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`)
137+
133138
func GetPodStatus(pod *kcore.Pod) PodStatus {
134139
if pod == nil {
135140
return PodStatusUnknown
@@ -145,6 +150,10 @@ func GetPodStatus(pod *kcore.Pod) PodStatus {
145150
case kcore.PodSucceeded:
146151
return PodStatusSucceeded
147152
case kcore.PodFailed:
153+
if pod.Status.Reason == ReasonEvicted && evictedMemoryMessageRegex.MatchString(pod.Status.Message) {
154+
return PodStatusKilledOOM
155+
}
156+
148157
for _, containerStatus := range pod.Status.ContainerStatuses {
149158
if containerStatus.LastTerminationState.Terminated != nil {
150159
exitCode := containerStatus.LastTerminationState.Terminated.ExitCode

pkg/operator/workloads/cron.go

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,9 +19,12 @@ package workloads
1919
import (
2020
"time"
2121

22+
kcore "k8s.io/api/core/v1"
2223
kmeta "k8s.io/apimachinery/pkg/apis/meta/v1"
2324

2425
"github.com/cortexlabs/cortex/pkg/lib/errors"
26+
"github.com/cortexlabs/cortex/pkg/lib/k8s"
27+
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
2528
"github.com/cortexlabs/cortex/pkg/operator/config"
2629
)
2730

@@ -60,6 +63,7 @@ func runCron() {
6063
"workloadType": workloadTypeAPI,
6164
"userFacing": "true",
6265
})
66+
6367
if err != nil {
6468
config.Telemetry.ReportError(err)
6569
errors.PrintError(err)
@@ -73,11 +77,14 @@ func runCron() {
7377
failedPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{
7478
FieldSelector: "status.phase=Failed",
7579
})
80+
7681
if err != nil {
7782
config.Telemetry.ReportError(err)
7883
errors.PrintError(err)
7984
}
8085

86+
deleteEvictedPods(failedPods)
87+
8188
if err := updateDataWorkloadErrors(failedPods); err != nil {
8289
config.Telemetry.ReportError(err)
8390
errors.PrintError(err)
@@ -93,3 +100,33 @@ func reportAndRecover(strs ...string) error {
93100
}
94101
return nil
95102
}
103+
104+
func deleteEvictedPods(failedPods []kcore.Pod) {
105+
evictedPods := []kcore.Pod{}
106+
for _, pod := range failedPods {
107+
if pod.Status.Reason == k8s.ReasonEvicted {
108+
evictedPods = append(evictedPods, pod)
109+
}
110+
}
111+
112+
if len(evictedPods) > 0 {
113+
savedEvictedPods := map[string]kcore.Pod{}
114+
currentWorkloadIDs := strset.New()
115+
for _, ctx := range CurrentContexts() {
116+
currentWorkloadIDs.Merge(ctx.ComputedResourceWorkloadIDs())
117+
}
118+
119+
for _, pod := range evictedPods {
120+
if currentWorkloadIDs.Has(pod.Labels["workloadID"]) {
121+
if _, ok := savedEvictedPods[pod.Labels["resourceID"]]; !ok {
122+
savedEvictedPods[pod.Labels["resourceID"]] = pod
123+
continue
124+
}
125+
}
126+
_, err := config.Kubernetes.DeletePod(pod.Name)
127+
if err != nil {
128+
errors.PrintError(err)
129+
}
130+
}
131+
}
132+
}

0 commit comments

Comments
 (0)