Skip to content

Improve cluster info output for batch jobs #2270

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -933,7 +933,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
numAPIInstances := len(infoResponse.NodeInfos)

var totalReplicas int
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncAPIs bool
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool
for _, nodeInfo := range infoResponse.NodeInfos {
totalReplicas += nodeInfo.NumReplicas
if nodeInfo.ComputeUserCapacity.GPU > 0 {
Expand All @@ -943,7 +943,10 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
doesClusterHaveInfs = true
}
if nodeInfo.NumAsyncGatewayReplicas > 0 {
doesClusterHaveAsyncAPIs = true
doesClusterHaveAsyncGateways = true
}
if nodeInfo.NumEnqueuerReplicas > 0 {
doesClusterHaveEnqueuers = true
}
}

Expand All @@ -962,7 +965,8 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
{Title: "instance type"},
{Title: "lifecycle"},
{Title: "replicas"},
{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncAPIs},
{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncGateways},
{Title: "batch enqueuer replicas", Hidden: !doesClusterHaveEnqueuers},
{Title: "CPU (requested / total allocatable)"},
{Title: "memory (requested / total allocatable)"},
{Title: "GPU (requested / total allocatable)", Hidden: !doesClusterHaveGPUs},
Expand All @@ -980,7 +984,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
memStr := nodeInfo.ComputeUserRequested.Mem.String() + " / " + nodeInfo.ComputeUserCapacity.Mem.String()
gpuStr := s.Int64(nodeInfo.ComputeUserRequested.GPU) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.GPU)
infStr := s.Int64(nodeInfo.ComputeUserRequested.Inf) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.Inf)
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, cpuStr, memStr, gpuStr, infStr})
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, nodeInfo.NumEnqueuerReplicas, cpuStr, memStr, gpuStr, infStr})
}

t := table.Table{
Expand Down
12 changes: 10 additions & 2 deletions pkg/operator/endpoints/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
for i := range pods {
pod := pods[i]

if pod.Status.Phase == kcore.PodSucceeded || pod.Status.Phase == kcore.PodFailed {
// note: only terminated (succeeded/failed) pods are skipped; pending pods are still counted because they may already be scheduled on a node (e.g. image pull in progress)
continue
}

_, isAPIPod := pod.Labels["apiName"]
asyncDeploymentType, isAsyncPod := pod.Labels["cortex.dev/async"]
asyncPodType, isAsyncPod := pod.Labels["cortex.dev/async"]
batchPodType, isBatchPod := pod.Labels["cortex.dev/batch"]

if pod.Spec.NodeName == "" && isAPIPod {
numPendingReplicas++
Expand All @@ -118,8 +124,10 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
}

if isAPIPod {
if isAsyncPod && asyncDeploymentType == "gateway" {
if isAsyncPod && asyncPodType == "gateway" {
node.NumAsyncGatewayReplicas++
} else if isBatchPod && batchPodType == "enqueuer" {
node.NumEnqueuerReplicas++
} else {
node.NumReplicas++
}
Expand Down
1 change: 1 addition & 0 deletions pkg/operator/schema/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ type NodeInfo struct {
Price float64 `json:"price" yaml:"price"`
NumReplicas int `json:"num_replicas" yaml:"num_replicas"`
NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"`
NumEnqueuerReplicas int `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"`
ComputeUserCapacity userconfig.Compute `json:"compute_user_capacity" yaml:"compute_user_capacity"` // the total resources available to the user on a node
ComputeAvailable userconfig.Compute `json:"compute_available" yaml:"compute_unavailable"` // unused resources on a node
ComputeUserRequested userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node
Expand Down