Skip to content

Show error if resource requested is larger than any node size #379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Aug 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/lib/k8s/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ type Client struct {
clientset *kclientset.Clientset
dynamicClient kclientdynamic.Interface
podClient kclientcore.PodInterface
nodeClient kclientcore.NodeInterface
serviceClient kclientcore.ServiceInterface
configMapClient kclientcore.ConfigMapInterface
deploymentClient kclientapps.DeploymentInterface
Expand Down Expand Up @@ -87,6 +88,7 @@ func New(namespace string, inCluster bool) (*Client, error) {
}

client.podClient = client.clientset.CoreV1().Pods(namespace)
client.nodeClient = client.clientset.CoreV1().Nodes()
client.serviceClient = client.clientset.CoreV1().Services(namespace)
client.configMapClient = client.clientset.CoreV1().ConfigMaps(namespace)
client.deploymentClient = client.clientset.AppsV1().Deployments(namespace)
Expand Down
35 changes: 35 additions & 0 deletions pkg/lib/k8s/node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
Copyright 2019 Cortex Labs, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package k8s

import (
"github.com/cortexlabs/cortex/pkg/lib/errors"

kcore "k8s.io/api/core/v1"
kmeta "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func (c *Client) ListNodes(opts *kmeta.ListOptions) ([]kcore.Node, error) {
if opts == nil {
opts = &kmeta.ListOptions{}
}
nodeList, err := c.nodeClient.List(*opts)
if err != nil {
return nil, errors.WithStack(err)
}
return nodeList.Items, nil
}
6 changes: 6 additions & 0 deletions pkg/operator/endpoints/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ func Deploy(w http.ResponseWriter, r *http.Request) {
fullCtxMatch = true
}

err = workloads.ValidateDeploy(ctx)
if err != nil {
RespondError(w, err)
return
}

deploymentStatus, err := workloads.GetDeploymentStatus(ctx.App.Name)
if err != nil {
RespondError(w, err)
Expand Down
1 change: 0 additions & 1 deletion pkg/operator/workloads/api_workload.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ func (aw *APIWorkload) Start(ctx *context.Context) error {
desiredReplicas := getRequestedReplicasFromDeployment(api, k8sDeloyment, hpa)

var deploymentSpec *kapps.Deployment

switch api.ModelFormat {
case userconfig.TensorFlowModelFormat:
deploymentSpec = tfAPISpec(ctx, api, aw.WorkloadID, desiredReplicas)
Expand Down
17 changes: 16 additions & 1 deletion pkg/operator/workloads/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ limitations under the License.

package workloads

import "fmt"

type ErrorKind int

const (
Expand All @@ -25,6 +27,7 @@ const (
ErrLoadBalancerInitializing
ErrNotFound
ErrAPIInitializing
ErrNoAvailableNodeComputeLimit
)

var errorKinds = []string{
Expand All @@ -34,9 +37,10 @@ var errorKinds = []string{
"err_load_balancer_initializing",
"err_not_found",
"err_api_initializing",
"err_no_available_node_compute_limit",
}

var _ = [1]int{}[int(ErrAPIInitializing)-(len(errorKinds)-1)] // Ensure list length matches
var _ = [1]int{}[int(ErrNoAvailableNodeComputeLimit)-(len(errorKinds)-1)] // Ensure list length matches

func (t ErrorKind) String() string {
return errorKinds[t]
Expand Down Expand Up @@ -115,3 +119,14 @@ func ErrorAPIInitializing() error {
message: "api is still initializing",
}
}

func ErrorNoAvailableNodeComputeLimit(resource string, reqStr string, maxStr string) error {
message := fmt.Sprintf("no available nodes can satisfy the requested %s quantity - requested %s %s but nodes only have %s %s", resource, reqStr, resource, maxStr, resource)
if maxStr == "0" {
message = fmt.Sprintf("no available nodes can satisfy the requested %s quantity - requested %s %s but nodes don't have any %s", resource, reqStr, resource, resource)
}
return Error{
Kind: ErrNoAvailableNodeComputeLimit,
message: message,
}
}
51 changes: 51 additions & 0 deletions pkg/operator/workloads/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@ limitations under the License.
package workloads

import (
"fmt"
"path/filepath"

kresource "k8s.io/apimachinery/pkg/api/resource"

"github.com/cortexlabs/cortex/pkg/consts"
"github.com/cortexlabs/cortex/pkg/lib/errors"
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
"github.com/cortexlabs/cortex/pkg/operator/api/context"
"github.com/cortexlabs/cortex/pkg/operator/api/resource"
"github.com/cortexlabs/cortex/pkg/operator/api/userconfig"
"github.com/cortexlabs/cortex/pkg/operator/config"
)

Expand Down Expand Up @@ -294,3 +298,50 @@ func GetDeploymentStatus(appName string) (resource.DeploymentStatus, error) {
}
return resource.UpdatedDeploymentStatus, nil
}

func ValidateDeploy(ctx *context.Context) error {
nodes, err := config.Kubernetes.ListNodes(nil)
if err != nil {
return err
}

var maxCPU, maxMem kresource.Quantity
var maxGPU int64
for _, node := range nodes {
curCPU := node.Status.Capacity.Cpu()
curMem := node.Status.Capacity.Memory()

var curGPU int64
if GPUQuantity, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
curGPU, _ = GPUQuantity.AsInt64()
}

if curCPU != nil && maxCPU.Cmp(*curCPU) < 0 {
maxCPU = *curCPU
}

if curMem != nil && maxMem.Cmp(*curMem) < 0 {
maxMem = *curMem
}

if curGPU > maxGPU {
maxGPU = curGPU
}
}

for _, api := range ctx.APIs {
if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 {
return errors.Wrap(ErrorNoAvailableNodeComputeLimit("CPU", api.Compute.CPU.String(), maxCPU.String()), userconfig.Identify(api))
}
if api.Compute.Mem != nil {
if maxMem.Cmp(api.Compute.Mem.Quantity) < 0 {
return errors.Wrap(ErrorNoAvailableNodeComputeLimit("Memory", api.Compute.Mem.String(), maxMem.String()), userconfig.Identify(api))
}
}
gpu := api.Compute.GPU
if gpu > maxGPU {
return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU)), userconfig.Identify(api))
}
}
return nil
}