diff --git a/cli/cmd/get.go b/cli/cmd/get.go index 586ea2019a..7c69aeba19 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -236,12 +236,12 @@ func describeAPI(name string, resourcesRes *schema.GetResourcesResponse, flagVer row := []interface{}{ groupStatus.Message(), - s.Int32(groupStatus.ReadyUpdated), - s.Int32(groupStatus.Available()), - s.Int32(groupStatus.Requested), - s.Int32(groupStatus.ReadyStaleCompute), - s.Int32(groupStatus.ReadyStaleModel), - s.Int32(groupStatus.FailedUpdated), + groupStatus.ReadyUpdated, + groupStatus.Available(), + groupStatus.Requested, + groupStatus.ReadyStaleCompute, + groupStatus.ReadyStaleModel, + groupStatus.FailedUpdated, libtime.Since(updatedAt), } @@ -550,6 +550,7 @@ func apiResourceTable(apiGroupStatuses map[string]*resource.APIGroupStatus) stri rows = append(rows, []interface{}{ name, + groupStatus.Message(), groupStatus.ReadyUpdated, groupStatus.Available(), groupStatus.Requested, @@ -563,6 +564,7 @@ func apiResourceTable(apiGroupStatuses map[string]*resource.APIGroupStatus) stri t := table.Table{ Headers: []table.Header{ {Title: resource.APIType.UserFacing()}, + {Title: "status"}, {Title: "up-to-date"}, {Title: "available"}, {Title: "requested"}, diff --git a/docs/deployments/statuses.md b/docs/deployments/statuses.md index a30c36af5e..3661b6ac63 100644 --- a/docs/deployments/statuses.md +++ b/docs/deployments/statuses.md @@ -4,13 +4,14 @@ | Status | Meaning | |----------------------|---| -| live | API is deployed and ready to serve prediction requests (at least one replica is running) | -| pending | API is waiting for another resource to be ready | -| creating | API is being created | -| stopping | API is stopping | -| stopped | API is stopped | -| error | API was not created due to an error; run `cortex logs <name>` to view the logs | -| skipped | API was not created due to an error in another resource | -| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may 
be ready | -| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready | -| compute unavailable | API could not start due to insufficient memory, CPU, or GPU in the cluster; some replicas may be ready | +| live | API is deployed and ready to serve prediction requests (at least one replica is running) | +| pending | API is waiting for another resource to be ready | +| creating | API is being created | +| stopping | API is stopping | +| stopped | API is stopped | +| error | API was not created due to an error; run `cortex logs <name>` to view the logs | +| skipped | API was not created due to an error in another resource | +| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready | +| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready | +| error (out of memory) | API was terminated due to excessive memory usage; try allocating more memory to the API and re-deploying | +| compute unavailable | API could not start due to insufficient memory, CPU, or GPU in the cluster; some replicas may be ready | diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index fde5c1650c..bbd5fe9da3 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -182,13 +182,16 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P numSucceeded := 0 numFailed := 0 numKilled := 0 + numKilledOOM := 0 for _, containerStatus := range containerStatuses { - if containerStatus.State.Running != nil { + if containerStatus.State.Running != nil && containerStatus.RestartCount == 0 { numRunning++ } else if containerStatus.State.Terminated != nil { exitCode := containerStatus.State.Terminated.ExitCode if exitCode == 0 { numSucceeded++ + } else if exitCode == 137 { + numKilledOOM++ } else if killStatuses[exitCode] { numKilled++ } else { @@ -198,6 +201,8 @@ 
func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P exitCode := containerStatus.LastTerminationState.Terminated.ExitCode if exitCode == 0 { numSucceeded++ + } else if exitCode == 137 { + numKilledOOM++ } else if killStatuses[exitCode] { numKilled++ } else { @@ -208,7 +213,9 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P numWaiting++ } } - if numKilled > 0 { + if numKilledOOM > 0 { + return PodStatusKilledOOM + } else if numKilled > 0 { return PodStatusKilled } else if numFailed > 0 { return PodStatusFailed diff --git a/pkg/operator/api/resource/status.go b/pkg/operator/api/resource/status.go index cacfdffcc3..a54c2d88f7 100644 --- a/pkg/operator/api/resource/status.go +++ b/pkg/operator/api/resource/status.go @@ -16,6 +16,10 @@ limitations under the License. package resource +import ( + "github.com/cortexlabs/cortex/pkg/lib/k8s" +) + type DataStatus struct { DataSavedStatus Code StatusCode `json:"status_code"` @@ -30,7 +34,8 @@ type APIStatus struct { InitReplicas int32 `json:"init_replicas"` TargetCPUUtilization int32 `json:"target_cpu_utilization"` ReplicaCounts `json:"replica_counts"` - Code StatusCode `json:"status_code"` + PodStatuses []k8s.PodStatus `json:"pod_statuses"` + Code StatusCode `json:"status_code"` } type ReplicaCounts struct { @@ -142,14 +147,14 @@ var _ = [1]int{}[int(StatusStopped)-(len(statusCodes)-1)] // Ensure list length var statusCodeMessages = []string{ "unknown", // StatusUnknown - "pending", // StatusPending - "compute unavailable", // StatusPendingCompute - "pending", // StatusWaiting - "skipped", // StatusSkipped - "error", // StatusError - "upstream error", // StatusParentFailed - "upstream termination", // StatusParentKilled - "terminated (out of mem)", // StatusDataOOM + "pending", // StatusPending + "compute unavailable", // StatusPendingCompute + "pending", // StatusWaiting + "skipped", // StatusSkipped + "error", // StatusError + "upstream error", // 
StatusParentFailed + "upstream termination", // StatusParentKilled + "error (out of memory)", // StatusKilledOOM "running", // StatusRunning "ready", // StatusSucceeded diff --git a/pkg/operator/workloads/api_status.go b/pkg/operator/workloads/api_status.go index d3d484d235..5821ffc345 100644 --- a/pkg/operator/workloads/api_status.go +++ b/pkg/operator/workloads/api_status.go @@ -68,7 +68,7 @@ func getCurrentAPIStatuses( return nil, errors.Wrap(err, "api statuses", ctx.App.Name) } - replicaCountsMap := getReplicaCountsMap(podList, deployments, ctx) + replicaCountsMap, podStatusMap := getReplicaCountsMap(podList, deployments, ctx) currentResourceWorkloadIDs := ctx.APIResourceWorkloadIDs() @@ -117,6 +117,7 @@ func getCurrentAPIStatuses( for resourceID, apiStatus := range apiStatuses { apiStatus.Path = context.APIPath(apiStatus.APIName, apiStatus.AppName) apiStatus.ReplicaCounts = replicaCountsMap[resourceID] + apiStatus.PodStatuses = podStatusMap[resourceID] apiStatus.Code = apiStatusCode(apiStatus) } @@ -135,7 +136,7 @@ func getReplicaCountsMap( podList []kcore.Pod, deployments map[string]*kapps.Deployment, // api.Name -> deployment ctx *context.Context, -) map[string]resource.ReplicaCounts { +) (map[string]resource.ReplicaCounts, map[string][]k8s.PodStatus) { apiComputeIDMap := make(map[string]string) for _, api := range ctx.APIs { @@ -149,6 +150,7 @@ func getReplicaCountsMap( } replicaCountsMap := make(map[string]resource.ReplicaCounts) + podStatusMap := make(map[string][]k8s.PodStatus) for _, pod := range podList { resourceID := pod.Labels["resourceID"] podAPIComputeID := APIPodComputeID(pod.Spec.Containers) @@ -170,7 +172,7 @@ func getReplicaCountsMap( replicaCounts.ReadyStaleCompute++ } } - if podStatus == k8s.PodStatusFailed { + if podStatus == k8s.PodStatusFailed || podStatus == k8s.PodStatusKilled || podStatus == k8s.PodStatusKilledOOM { if computeMatches { replicaCounts.FailedUpdatedCompute++ } else { @@ -179,6 +181,7 @@ func getReplicaCountsMap( } 
replicaCountsMap[resourceID] = replicaCounts + podStatusMap[resourceID] = append(podStatusMap[resourceID], podStatus) } for _, deployment := range deployments { @@ -191,7 +194,7 @@ func getReplicaCountsMap( replicaCountsMap[resourceID] = replicaCounts } - return replicaCountsMap + return replicaCountsMap, podStatusMap } func numUpdatedReadyReplicas(ctx *context.Context, api *context.API) (int32, error) { @@ -229,6 +232,18 @@ func apiStatusCode(apiStatus *resource.APIStatus) resource.StatusCode { } if apiStatus.TotalFailed() > 0 { + for _, podStatus := range apiStatus.PodStatuses { + if podStatus == k8s.PodStatusKilledOOM { + return resource.StatusKilledOOM + } + } + + for _, podStatus := range apiStatus.PodStatuses { + if podStatus == k8s.PodStatusKilled { + return resource.StatusKilled + } + } + return resource.StatusError } diff --git a/pkg/operator/workloads/api_workload.go b/pkg/operator/workloads/api_workload.go index f34e251634..c41b45ec7b 100644 --- a/pkg/operator/workloads/api_workload.go +++ b/pkg/operator/workloads/api_workload.go @@ -218,7 +218,8 @@ func (aw *APIWorkload) IsFailed(ctx *context.Context) (bool, error) { } for _, pod := range pods { - if k8s.GetPodStatus(&pod) == k8s.PodStatusFailed { + podStatus := k8s.GetPodStatus(&pod) + if podStatus == k8s.PodStatusFailed || podStatus == k8s.PodStatusKilled || podStatus == k8s.PodStatusKilledOOM { return true, nil } }