diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 26b301341c..bb5ccf5cfa 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -933,7 +933,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { numAPIInstances := len(infoResponse.NodeInfos) var totalReplicas int - var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncAPIs bool + var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool for _, nodeInfo := range infoResponse.NodeInfos { totalReplicas += nodeInfo.NumReplicas if nodeInfo.ComputeUserCapacity.GPU > 0 { @@ -943,7 +943,10 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { doesClusterHaveInfs = true } if nodeInfo.NumAsyncGatewayReplicas > 0 { - doesClusterHaveAsyncAPIs = true + doesClusterHaveAsyncGateways = true + } + if nodeInfo.NumEnqueuerReplicas > 0 { + doesClusterHaveEnqueuers = true } } @@ -962,7 +965,8 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { {Title: "instance type"}, {Title: "lifecycle"}, {Title: "replicas"}, - {Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncAPIs}, + {Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncGateways}, + {Title: "batch enqueuer replicas", Hidden: !doesClusterHaveEnqueuers}, {Title: "CPU (requested / total allocatable)"}, {Title: "memory (requested / total allocatable)"}, {Title: "GPU (requested / total allocatable)", Hidden: !doesClusterHaveGPUs}, @@ -980,7 +984,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { memStr := nodeInfo.ComputeUserRequested.Mem.String() + " / " + nodeInfo.ComputeUserCapacity.Mem.String() gpuStr := s.Int64(nodeInfo.ComputeUserRequested.GPU) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.GPU) infStr := s.Int64(nodeInfo.ComputeUserRequested.Inf) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.Inf) - rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, cpuStr, memStr, gpuStr, infStr}) + rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, nodeInfo.NumEnqueuerReplicas, cpuStr, memStr, gpuStr, infStr}) } t := table.Table{ diff --git a/pkg/operator/endpoints/info.go b/pkg/operator/endpoints/info.go index 56e8ea1617..b83a6374e0 100644 --- a/pkg/operator/endpoints/info.go +++ b/pkg/operator/endpoints/info.go @@ -104,8 +104,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { for i := range pods { pod := pods[i] + if pod.Status.Phase == kcore.PodSucceeded || pod.Status.Phase == kcore.PodFailed { + // note: pending pods can be scheduled on nodes (image pull in progress) + continue + } + _, isAPIPod := pod.Labels["apiName"] - asyncDeploymentType, isAsyncPod := pod.Labels["cortex.dev/async"] + asyncPodType, isAsyncPod := pod.Labels["cortex.dev/async"] + batchPodType, isBatchPod := pod.Labels["cortex.dev/batch"] if pod.Spec.NodeName == "" && isAPIPod { numPendingReplicas++ @@ -118,8 +124,10 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { } if isAPIPod { - if isAsyncPod && asyncDeploymentType == "gateway" { + if isAsyncPod && asyncPodType == "gateway" { node.NumAsyncGatewayReplicas++ + } else if isBatchPod && batchPodType == "enqueuer" { + node.NumEnqueuerReplicas++ } else { node.NumReplicas++ } diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 401686fee3..18988fa277 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -38,6 +38,7 @@ type NodeInfo struct { Price float64 `json:"price" yaml:"price"` NumReplicas int `json:"num_replicas" yaml:"num_replicas"` NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"` + NumEnqueuerReplicas int `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"` ComputeUserCapacity userconfig.Compute `json:"compute_user_capacity" yaml:"compute_user_capacity"` // the total resources available to the user on a node ComputeAvailable userconfig.Compute `json:"compute_available" yaml:"compute_unavailable"` // unused resources on a node ComputeUserRequested userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node