diff --git a/Makefile b/Makefile index 4a81c2af75..9ce8df0052 100644 --- a/Makefile +++ b/Makefile @@ -21,10 +21,7 @@ SHELL := /bin/bash # Cortex devstart: - @kill $(shell pgrep -f rerun); ./dev/operator_local.sh || true - -killdev: - @kill $(shell pgrep -f rerun) + @./dev/operator_local.sh || true kubectl: @eksctl utils write-kubeconfig --name="cortex" @@ -138,8 +135,6 @@ ci-build-images: @./build/build-image.sh images/fluentd fluentd @./build/build-image.sh images/nginx-controller nginx-controller @./build/build-image.sh images/nginx-backend nginx-backend - @./build/build-image.sh images/argo-controller argo-controller - @./build/build-image.sh images/argo-executor argo-executor @./build/build-image.sh images/python-packager python-packager @./build/build-image.sh images/cluster-autoscaler cluster-autoscaler @./build/build-image.sh images/nvidia nvidia @@ -160,8 +155,6 @@ ci-push-images: @./build/push-image.sh fluentd @./build/push-image.sh nginx-controller @./build/push-image.sh nginx-backend - @./build/push-image.sh argo-controller - @./build/push-image.sh argo-executor @./build/push-image.sh python-packager @./build/push-image.sh cluster-autoscaler @./build/push-image.sh nvidia diff --git a/cortex.sh b/cortex.sh index d8502e2132..3b10cd2fbd 100755 --- a/cortex.sh +++ b/cortex.sh @@ -117,8 +117,6 @@ export CORTEX_NODES_MAX="${CORTEX_NODES_MAX:-5}" export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}" export CORTEX_IMAGE_MANAGER="${CORTEX_IMAGE_MANAGER:-cortexlabs/manager:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_ARGO_CONTROLLER="${CORTEX_IMAGE_ARGO_CONTROLLER:-cortexlabs/argo-controller:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_ARGO_EXECUTOR="${CORTEX_IMAGE_ARGO_EXECUTOR:-cortexlabs/argo-executor:$CORTEX_VERSION_STABLE}" export CORTEX_IMAGE_FLUENTD="${CORTEX_IMAGE_FLUENTD:-cortexlabs/fluentd:$CORTEX_VERSION_STABLE}" export CORTEX_IMAGE_NGINX_BACKEND="${CORTEX_IMAGE_NGINX_BACKEND:-cortexlabs/nginx-backend:$CORTEX_VERSION_STABLE}" export CORTEX_IMAGE_NGINX_CONTROLLER="${CORTEX_IMAGE_NGINX_CONTROLLER:-cortexlabs/nginx-controller:$CORTEX_VERSION_STABLE}" @@ -177,8 +175,6 @@ function install_cortex() { -e CORTEX_NODE_TYPE=$CORTEX_NODE_TYPE \ -e CORTEX_LOG_GROUP=$CORTEX_LOG_GROUP \ -e CORTEX_BUCKET=$CORTEX_BUCKET \ - -e CORTEX_IMAGE_ARGO_CONTROLLER=$CORTEX_IMAGE_ARGO_CONTROLLER \ - -e CORTEX_IMAGE_ARGO_EXECUTOR=$CORTEX_IMAGE_ARGO_EXECUTOR \ -e CORTEX_IMAGE_FLUENTD=$CORTEX_IMAGE_FLUENTD \ -e CORTEX_IMAGE_NGINX_BACKEND=$CORTEX_IMAGE_NGINX_BACKEND \ -e CORTEX_IMAGE_NGINX_CONTROLLER=$CORTEX_IMAGE_NGINX_CONTROLLER \ diff --git a/dev/operator_local.sh b/dev/operator_local.sh index c4d223b8c8..e8dd4ae50d 100755 --- a/dev/operator_local.sh +++ b/dev/operator_local.sh @@ -26,6 +26,9 @@ export CONST_OPERATOR_TRANSFORMERS_DIR=$ROOT/pkg/transformers export CONST_OPERATOR_ESTIMATORS_DIR=$ROOT/pkg/estimators export CONST_OPERATOR_IN_CLUSTER=false +kill $(pgrep -f rerun) >/dev/null 2>&1 || true + rerun -watch $ROOT/pkg $ROOT/cli -ignore $ROOT/vendor $ROOT/bin -run sh -c \ "go build -o $ROOT/bin/operator $ROOT/pkg/operator && go build -installsuffix cgo -o $ROOT/bin/cortex $ROOT/cli && $ROOT/bin/operator" -# go run -race $ROOT/pkg/operator/operator.go + +# go run -race $ROOT/pkg/operator/operator.go # Check for race conditions. Doesn't seem to catch them all? 
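The separate `killdev` target is no longer needed: `dev/operator_local.sh` now terminates any previous `rerun` watcher before starting a new one, so `make devstart` only has to invoke the script. A minimal sketch of that idempotent restart idiom (not part of the patch; it assumes the `rerun` file-watcher binary is on the PATH and paths are relative to the repo root):

    #!/bin/bash
    # Stop any watcher left over from a previous `make devstart`; redirect
    # output and append `|| true` so nothing fails when no process exists.
    kill $(pgrep -f rerun) >/dev/null 2>&1 || true

    # Rebuild and restart the operator whenever watched sources change.
    rerun -watch ./pkg ./cli -ignore ./vendor ./bin -run sh -c \
      "go build -o ./bin/operator ./pkg/operator && ./bin/operator"
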
diff --git a/dev/registry.sh b/dev/registry.sh index e9efbdd3ce..068012f352 100755 --- a/dev/registry.sh +++ b/dev/registry.sh @@ -36,8 +36,6 @@ function ecr_login() { function create_registry() { aws ecr create-repository --repository-name=cortexlabs/manager --region=$REGISTRY_REGION || true - aws ecr create-repository --repository-name=cortexlabs/argo-controller --region=$REGISTRY_REGION || true - aws ecr create-repository --repository-name=cortexlabs/argo-executor --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/fluentd --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/nginx-backend --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/nginx-controller --region=$REGISTRY_REGION || true @@ -138,8 +136,6 @@ elif [ "$cmd" = "update" ]; then build_and_push $ROOT/images/nginx-controller nginx-controller latest build_and_push $ROOT/images/nginx-backend nginx-backend latest build_and_push $ROOT/images/fluentd fluentd latest - build_and_push $ROOT/images/argo-controller argo-controller latest - build_and_push $ROOT/images/argo-executor argo-executor latest build_and_push $ROOT/images/tf-serve tf-serve latest build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest build_and_push $ROOT/images/onnx-serve-gpu onnx-serve-gpu latest diff --git a/docs/cluster/config.md b/docs/cluster/config.md index e4000960e8..07a2fe0411 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -40,8 +40,6 @@ export CORTEX_NAMESPACE="cortex" # Image paths export CORTEX_IMAGE_MANAGER="cortexlabs/manager:master" -export CORTEX_IMAGE_ARGO_CONTROLLER="cortexlabs/argo-controller:master" -export CORTEX_IMAGE_ARGO_EXECUTOR="cortexlabs/argo-executor:master" export CORTEX_IMAGE_FLUENTD="cortexlabs/fluentd:master" export CORTEX_IMAGE_NGINX_BACKEND="cortexlabs/nginx-backend:master" export CORTEX_IMAGE_NGINX_CONTROLLER="cortexlabs/nginx-controller:master" diff --git a/docs/cluster/development.md b/docs/cluster/development.md index 0748b193f6..5f95801977 100644 --- a/docs/cluster/development.md +++ b/docs/cluster/development.md @@ -56,8 +56,6 @@ export CORTEX_NODES_MAX="5" export CORTEX_NAMESPACE="cortex" export CORTEX_IMAGE_MANAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/manager:latest" -export CORTEX_IMAGE_ARGO_CONTROLLER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/argo-controller:latest" -export CORTEX_IMAGE_ARGO_EXECUTOR="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/argo-executor:latest" export CORTEX_IMAGE_FLUENTD="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/fluentd:latest" export CORTEX_IMAGE_NGINX_BACKEND="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/nginx-backend:latest" export CORTEX_IMAGE_NGINX_CONTROLLER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/nginx-controller:latest" diff --git a/go.mod b/go.mod index eac3486535..902fd7445a 100644 --- a/go.mod +++ b/go.mod @@ -4,11 +4,10 @@ // go mod tidy // replace these lines in go.mod: // github.com/GoogleCloudPlatform/spark-on-k8s-operator v1alpha1-0.5-2.4.0 -// github.com/argoproj/argo v2.3.0 // github.com/cortexlabs/yaml v2.2.4 -// k8s.io/client-go v10.0.0 -// k8s.io/api 89a74a8d264df0e993299876a8cde88379b940ee -// k8s.io/apimachinery 2b1284ed4c93a43499e781493253e2ac5959c4fd +// k8s.io/client-go v12.0.0 +// k8s.io/api 7525909cc6da +// k8s.io/apimachinery 1799e75a0719 // (note: go to the commit for the client-go release and browse to Godeps/Godeps.json to find the SHAs for k8s.io/api and 
k8s.io/apimachinery) // go mod tidy // check the diff in this file @@ -19,39 +18,23 @@ go 1.12 require ( github.com/GoogleCloudPlatform/spark-on-k8s-operator v0.0.0-20181208011959-62db1d66dafa - github.com/argoproj/argo v2.3.0+incompatible - github.com/aws/aws-sdk-go v1.20.12 + github.com/aws/aws-sdk-go v1.20.20 github.com/cortexlabs/yaml v0.0.0-20190626164117-202ab3a3d475 github.com/davecgh/go-spew v1.1.1 - github.com/emicklei/go-restful v2.9.6+incompatible // indirect - github.com/ghodss/yaml v1.0.0 - github.com/go-openapi/spec v0.19.2 // indirect - github.com/gogo/protobuf v1.2.1 // indirect - github.com/google/btree v1.0.0 // indirect - github.com/google/gofuzz v1.0.0 // indirect - github.com/googleapis/gnostic v0.3.0 // indirect github.com/gorilla/mux v1.7.3 github.com/gorilla/websocket v1.4.0 - github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/imdario/mergo v0.3.7 // indirect - github.com/json-iterator/go v1.1.6 // indirect github.com/mitchellh/go-homedir v1.1.0 - github.com/modern-go/reflect2 v1.0.1 // indirect - github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/errors v0.8.1 github.com/spf13/cobra v0.0.5 github.com/stretchr/testify v1.3.0 github.com/tcnksm/go-input v0.0.0-20180404061846-548a7d7a8ee8 - github.com/ugorji/go/codec v1.1.5-pre + github.com/ugorji/go/codec v1.1.7 github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 // indirect golang.org/x/time v0.0.0-20190308202827-9d24e82272b4 // indirect - gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5 - k8s.io/api v0.0.0-20181204000039-89a74a8d264d - k8s.io/apimachinery v0.0.0-20181127025237-2b1284ed4c93 - k8s.io/client-go v10.0.0+incompatible - k8s.io/klog v0.3.0 // indirect - k8s.io/kube-openapi v0.0.0-20190603182131-db7b694dc208 // indirect - sigs.k8s.io/yaml v1.1.0 // indirect + k8s.io/api v0.0.0-20190620084959-7cf5895f2711 + k8s.io/apimachinery v0.0.0-20190612205821-1799e75a0719 + k8s.io/client-go v0.0.0-20190620085101-78d2af792bab + k8s.io/utils v0.0.0-20190712204705-3dccf664f023 // indirect ) diff --git a/go.sum b/go.sum index fb2c8abb42..36fb0e3dc2 100644 --- a/go.sum +++ b/go.sum @@ -1,193 +1,149 @@ cloud.google.com/go v0.34.0 h1:eOI3/cP2VTU6uZLDYAoic+eyzzB9YyGmJ7eIjl8rOPg= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +github.com/Azure/go-autorest v11.1.2+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/GoogleCloudPlatform/spark-on-k8s-operator v0.0.0-20181208011959-62db1d66dafa h1:+7sR1qfswfQkw01erHTK74SP1RLDwo8TSUh5C8AJgmo= github.com/GoogleCloudPlatform/spark-on-k8s-operator v0.0.0-20181208011959-62db1d66dafa/go.mod h1:6PnrZv6zUDkrNMw0mIoGRmGBR7i9LulhKPmxFq4rUiM= -github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= -github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= -github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= -github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= -github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= -github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= 
-github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= -github.com/argoproj/argo v2.3.0+incompatible h1:L1OYZ86Q7NK19ahdl/eJOq78Mlf52wUKGmp7VDNQVz8= -github.com/argoproj/argo v2.3.0+incompatible/go.mod h1:KJ0MB+tuhtAklR4jkPM10mIZXfRA0peTYJ1sLUnFLVU= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= -github.com/aws/aws-sdk-go v1.20.12 h1:xV7xfLSkiqd7JOnLlfER+Jz8kI98rAGJvtXssYkCRs4= -github.com/aws/aws-sdk-go v1.20.12/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= +github.com/aws/aws-sdk-go v1.20.20 h1:OAR/GtjMOhenkp1NNKr1N1FgIP3mQXHeGbRhvVIAQp0= +github.com/aws/aws-sdk-go v1.20.20/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/cortexlabs/yaml v0.0.0-20190626164117-202ab3a3d475 h1:N+pms5TCPH2F/DIX6a+2dgJI/CHweh45pEhGW/+stxQ= github.com/cortexlabs/yaml v0.0.0-20190626164117-202ab3a3d475/go.mod h1:nuzR4zMPuiBWg1HyZo9bzSZmtdSVjKfn8+RyO7egs0c= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= -github.com/davecgh/go-spew v0.0.0-20151105211317-5215b55f46b2/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= -github.com/emicklei/go-restful v2.9.6+incompatible h1:tfrHha8zJ01ywiOEC1miGY8st1/igzWB8OmvPgoYX7w= -github.com/emicklei/go-restful v2.9.6+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= +github.com/dgrijalva/jwt-go v0.0.0-20160705203006-01aeca54ebda/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= +github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM= +github.com/elazarl/goproxy v0.0.0-20170405201442-c4fc26588b6e/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= +github.com/evanphx/json-patch v0.0.0-20190203023257-5858425f7550/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= -github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/go-openapi/jsonpointer v0.0.0-20160704185906-46af16f9f7b1/go.mod h1:+35s3my2LFTysnkMfxsJBAMHj/DoqoB9knIWoYG/Vk0= -github.com/go-openapi/jsonpointer v0.19.2 h1:A9+F4Dc/MCNB5jibxf6rRvOvR/iFgQdyNx9eIhnGqq0= -github.com/go-openapi/jsonpointer v0.19.2/go.mod h1:3akKfEdA7DF1sugOqz1dVQHBcuDBPKZGEoHC/NkiQRg= -github.com/go-openapi/jsonreference v0.0.0-20160704190145-13c6e3589ad9/go.mod h1:W3Z9FmVs9qj+KR4zFKmDPGiLdk1D9Rlm7cyMvf57TTg= -github.com/go-openapi/jsonreference v0.19.2 h1:o20suLFB4Ri0tuzpWtyHlh7E7HnkqTNLq6aR6WVNS1w= -github.com/go-openapi/jsonreference v0.19.2/go.mod 
h1:jMjeRr2HHw6nAVajTXJ4eiUwohSTlpa0o73RUL1owJc= -github.com/go-openapi/spec v0.0.0-20160808142527-6aced65f8501/go.mod h1:J8+jY1nAiCcj+friV/PDoE1/3eeccG9LYBs0tYvLOWc= -github.com/go-openapi/spec v0.19.2 h1:SStNd1jRcYtfKCN7R0laGNs80WYYvn5CbBjM2sOmCrE= -github.com/go-openapi/spec v0.19.2/go.mod h1:sCxk3jxKgioEJikev4fgkNmwS+3kuYdJtcsZsD5zxMY= -github.com/go-openapi/swag v0.0.0-20160704191624-1d0bd113de87/go.mod h1:DXUve3Dpr1UfpPtxFw+EFuQ41HhCWZfha5jSVRG7C7I= -github.com/go-openapi/swag v0.19.2 h1:jvO6bCMBEilGwMfHhrd61zIID4oIFdwb76V17SM88dE= -github.com/go-openapi/swag v0.19.2/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= -github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= -github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= -github.com/golang/protobuf v0.0.0-20161109072736-4bd1920723d7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/gogo/protobuf v0.0.0-20171007142547-342cbe0a0415 h1:WSBJMqJbLxsn+bTCPyPYZfqHdJmc8MK4wrBjMft6BAM= +github.com/gogo/protobuf v0.0.0-20171007142547-342cbe0a0415/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo= -github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI= -github.com/google/gofuzz v1.0.0 h1:A8PeW59pxE9IoFRqBp37U+mSNaQoZ46F1f0f863XSXw= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/googleapis/gnostic v0.0.0-20170426233943-68f4ded48ba9/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= -github.com/googleapis/gnostic v0.3.0 h1:CcQijm0XKekKjP/YCz28LXVSpgguuB+nCxaSjCe09y0= -github.com/googleapis/gnostic v0.3.0/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= +github.com/google/btree v0.0.0-20160524151835-7d79101e329e/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/gofuzz v0.0.0-20170612174753-24818f796faf h1:+RRA9JqSOZFfKrOeqr2z77+8R2RKyh8PG66dcu1V0ck= +github.com/google/gofuzz v0.0.0-20170612174753-24818f796faf/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI= +github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d h1:7XGaL1e6bYS1yIonGp9761ExpPPV1ui0SAC59Yube9k= +github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= +github.com/gophercloud/gophercloud v0.0.0-20190126172459-c818fa66e4c8/go.mod h1:3WdhXV3rUYy9p6AUW8d94kr+HS62Y4VL9mBnFxsD8q4= github.com/gorilla/mux v1.7.3 h1:gnP5JzjVOuiZD07fKKToCAOjS0yOpj/qPETTXCCS6hw= github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/websocket v1.4.0 h1:WDFjx/TMzVgy9VdMMQi2K2Emtwi2QcUQsztZ/zLaH/Q= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= -github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 
h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= -github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/gregjones/httpcache v0.0.0-20170728041850-787624de3eb7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/imdario/mergo v0.3.7 h1:Y+UAYTZ7gDEuOfhxKWy+dvb5dRQ6rJjFSdX2HZY1/gI= github.com/imdario/mergo v0.3.7/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af h1:pmfjZENx5imkbgOkpRUYLnmbU7UEFbjtDA2hxJ1ichM= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= -github.com/json-iterator/go v0.0.0-20180612202835-f2b4162afba3/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= -github.com/json-iterator/go v1.1.6 h1:MrUvLMLTMxbqFJ9kzlvat/rYZqZnW3u4wkLzWTaFwKs= -github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= -github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= -github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/json-iterator/go v0.0.0-20180701071628-ab8a2e0c74be h1:AHimNtVIpiBjPUhEF5KNCkrUyqTSA5zWUl8sQ2bfGBE= +github.com/json-iterator/go v0.0.0-20180701071628-ab8a2e0c74be/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= -github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63 h1:nTT4s92Dgz2HlrB2NaMgvlfqHH39OgMhA7z3PK7PGD4= -github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1 
h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v0.0.0-20190113212917-5533ce8a0da3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= -github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= +github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cobra v0.0.5 h1:f0B+LkLX6DtmRH1isoNA9VTtNUK9K8xYd28JNNfOv/s= github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= -github.com/spf13/pflag v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/spf13/pflag v1.0.1/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= -github.com/stretchr/testify v0.0.0-20151208002404-e3a8ff8ce365/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/tcnksm/go-input v0.0.0-20180404061846-548a7d7a8ee8 h1:RB0v+/pc8oMzPsN97aZYEwNuJ6ouRJ2uhjxemJ9zvrY= github.com/tcnksm/go-input v0.0.0-20180404061846-548a7d7a8ee8/go.mod h1:IlWNj9v/13q7xFbaK4mbyzMNwrZLaWSHx/aibKIZuIg= -github.com/ugorji/go v1.1.5-pre h1:jyJKFOSEbdOc2HODrf2qcCkYOdq7zzXqA9bhW5oV4fM= -github.com/ugorji/go v1.1.5-pre/go.mod 
h1:FwP/aQVg39TXzItUBMwnWp9T9gPQnXw4Poh4/oBQZ/0= +github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo= +github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= -github.com/ugorji/go/codec v1.1.5-pre h1:5YV9PsFAN+ndcCtTM7s60no7nY7eTG3LPtxhSwuxzCs= -github.com/ugorji/go/codec v1.1.5-pre/go.mod h1:tULtS6Gy1AE1yCENaw4Vb//HLH5njI2tfCQDUqRd8fI= +github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs= +github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca h1:1CFlNzQhALwjS9mBAUkycX616GzgsuYUOCHA5+HSlXI= github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca/go.mod h1:ce1O1j6UtZfjr22oyGxGLbauSBp2YVXpARAosm7dHBg= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= +golang.org/x/crypto v0.0.0-20181025213731-e84da0312774/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9 h1:mKdxBk7AujPs8kU4m80U72y/zjbZ3UcXC7dClwKbUI0= golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8 h1:1wopBVtVdWnn03fZelqdXTqk7U7zPQCb+T4rbU9ZEoU= -golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/net v0.0.0-20170114055629-f2499483f923/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190613194153-d28f0bde5980 h1:dfGZHvZk057jK2MCeWus/TowKpJ8y4AmooUzdBSR9GU= -golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190206173232-65e2d4e15006 h1:bfLnR+k0tq5Lqt6dflRLcZiz6UaXCMt3vhYJ1l4FQ80= +golang.org/x/net v0.0.0-20190206173232-65e2d4e15006/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/oauth2 v0.0.0-20190402181905-9f3314589c9a/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58 
h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20170830134202-bb24a47a89ea/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f h1:25KHgbfyiSm6vwQLbM3zZIe1v9p/3ea4Rz+nnM5K/i4= -golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/sys v0.0.0-20190312061237-fead79001313 h1:pczuHS43Cp2ktBEEmLwScxgjWsBSzdaQiKzUyf3DTTc= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db h1:6/JqlYfC1CCaLnGceQTI+sDGhC9UBSPAsBqI0Gun6kU= +golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/time v0.0.0-20161028155119-f51c12702a4d/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4 h1:SvFZT6jyqRaOeXpc5h/JSfZenJ2O330aBsf7JfSUXmQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20181011042414-1f849cf54d09/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190614205625-5aca471b1d59/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0 h1:KxkO13IPW4Lslp2bz+KHP2E3gtFlrIGNThxkZQ3g+4c= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= -gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5 h1:E846t8CnR+lv5nE+VuiKTDG/v1U2stad0QzddfJC7kY= -gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5/go.mod 
h1:hiOFpYm0ZJbusNj2ywpbrXowU3G8U6GIQzqn2mw1UIE= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/inf.v0 v0.9.0 h1:3zYtXIO92bvsdS3ggAdA8Gb4Azj0YU+TVY1uGYNFA8o= +gopkg.in/inf.v0 v0.9.0/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -k8s.io/api v0.0.0-20181204000039-89a74a8d264d h1:HQoGWsWUe/FmRcX9BU440AAMnzBFEf+DBo4nbkQlNzs= -k8s.io/api v0.0.0-20181204000039-89a74a8d264d/go.mod h1:iuAfoD4hCxJ8Onx9kaTIt30j7jUFS00AXQi6QMi99vA= -k8s.io/apimachinery v0.0.0-20181127025237-2b1284ed4c93 h1:tT6oQBi0qwLbbZSfDkdIsb23EwaLY85hoAV4SpXfdao= -k8s.io/apimachinery v0.0.0-20181127025237-2b1284ed4c93/go.mod h1:ccL7Eh7zubPUSh9A3USN90/OzHNSVN6zxzde07TDCL0= -k8s.io/client-go v10.0.0+incompatible h1:F1IqCqw7oMBzDkqlcBymRq1450wD0eNqLE9jzUrIi34= -k8s.io/client-go v10.0.0+incompatible/go.mod h1:7vJpHMYJwNQCWgzmNV+VYUl1zCObLyodBc8nIyt8L5s= -k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= -k8s.io/klog v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= -k8s.io/klog v0.3.0 h1:0VPpR+sizsiivjIfIAQH/rl8tan6jvWkS7lU+0di3lE= +k8s.io/api v0.0.0-20190620084959-7cf5895f2711 h1:BblVYz/wE5WtBsD/Gvu54KyBUTJMflolzc5I2DTvh50= +k8s.io/api v0.0.0-20190620084959-7cf5895f2711/go.mod h1:TBhBqb1AWbBQbW3XRusr7n7E4v2+5ZY8r8sAMnyFC5A= +k8s.io/apimachinery v0.0.0-20190612205821-1799e75a0719 h1:uV4S5IB5g4Nvi+TBVNf3e9L4wrirlwYJ6w88jUQxTUw= +k8s.io/apimachinery v0.0.0-20190612205821-1799e75a0719/go.mod h1:I4A+glKBHiTgiEjQiCCQfCAIcIMFGt291SmsvcrFzJA= +k8s.io/client-go v0.0.0-20190620085101-78d2af792bab h1:E8Fecph0qbNsAbijJJQryKu4Oi9QTp5cVpjTE+nqg6g= +k8s.io/client-go v0.0.0-20190620085101-78d2af792bab/go.mod h1:E95RaSlHr79aHaX0aGSwcPNfygDiPKOVXdmivCIZT0k= k8s.io/klog v0.3.0/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= -k8s.io/kube-openapi v0.0.0-20190603182131-db7b694dc208 h1:5sW+fEHvlJI3Ngolx30CmubFulwH28DhKjGf70Xmtco= -k8s.io/kube-openapi v0.0.0-20190603182131-db7b694dc208/go.mod h1:nfDlWeOsu3pUf4yWGL+ERqohP4YsZcBJXWMK+gkzOA4= -sigs.k8s.io/structured-merge-diff v0.0.0-20190525122527-15d366b2352e/go.mod h1:wWxsB5ozmmv/SG7nM11ayaAW51xMvak/t1r0CSlcokI= +k8s.io/klog v0.3.1 h1:RVgyDHY/kFKtLqh67NvEWIgkMneNoIrdkN0CxDSQc68= +k8s.io/klog v0.3.1/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= +k8s.io/kube-openapi v0.0.0-20190228160746-b3a7cee44a30/go.mod h1:BXM9ceUBTj2QnfH2MK1odQs778ajze1RxcmP6S8RVVc= +k8s.io/utils v0.0.0-20190221042446-c2654d5206da/go.mod h1:8k8uAuAQ0rXslZKaEWd0c3oVhZz7sSzSiPnVZayjIX0= +k8s.io/utils v0.0.0-20190712204705-3dccf664f023 h1:1H4Jyzb0z2X0GfBMTwRjnt5ejffRHrGftUgJcV/ZfDc= +k8s.io/utils v0.0.0-20190712204705-3dccf664f023/go.mod h1:sZAwmy6armz5eXlNoLmJcl4F1QuKu7sr+mFQ0byX7Ew= sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= diff --git a/images/argo-controller/Dockerfile b/images/argo-controller/Dockerfile deleted file mode 100644 index c9a245331e..0000000000 --- a/images/argo-controller/Dockerfile +++ /dev/null @@ -1 +0,0 @@ -FROM argoproj/workflow-controller:v2.3.0 diff --git a/images/argo-executor/Dockerfile 
b/images/argo-executor/Dockerfile deleted file mode 100644 index a0fe2c43bc..0000000000 --- a/images/argo-executor/Dockerfile +++ /dev/null @@ -1 +0,0 @@ -FROM argoproj/argoexec:v2.3.0 diff --git a/manager/install_cortex.sh b/manager/install_cortex.sh index 6076749a9c..a0d3860104 100755 --- a/manager/install_cortex.sh +++ b/manager/install_cortex.sh @@ -171,7 +171,6 @@ setup_configmap setup_secrets envsubst < manifests/spark.yaml | kubectl apply -f - >/dev/null -envsubst < manifests/argo.yaml | kubectl apply -f - >/dev/null envsubst < manifests/nginx.yaml | kubectl apply -f - >/dev/null envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null diff --git a/manager/manifests/argo.yaml b/manager/manifests/argo.yaml deleted file mode 100644 index b4f6271583..0000000000 --- a/manager/manifests/argo.yaml +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright 2019 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: argo-executor - namespace: $CORTEX_NAMESPACE ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: argo-executor - namespace: $CORTEX_NAMESPACE -subjects: -- kind: ServiceAccount - name: argo-executor - namespace: $CORTEX_NAMESPACE -roleRef: - kind: ClusterRole - name: cluster-admin - apiGroup: rbac.authorization.k8s.io ---- - -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: workflows.argoproj.io - namespace: $CORTEX_NAMESPACE -spec: - group: argoproj.io - names: - kind: Workflow - plural: workflows - shortNames: - - wf - scope: Namespaced - version: v1alpha1 ---- - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: argo-controller - namespace: $CORTEX_NAMESPACE ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: argo-controller - namespace: $CORTEX_NAMESPACE -rules: -- apiGroups: [""] - resources: [pods, pods/exec] - verbs: [create, get, list, watch, update, patch, delete] -- apiGroups: [""] - resources: [configmaps] - verbs: [get, watch, list] -- apiGroups: [""] - resources: [persistentvolumeclaims] - verbs: [create, delete] -- apiGroups: [argoproj.io] - resources: [workflows, workflows/finalizers] - verbs: [get, list, watch, update, patch, delete] ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: argo - namespace: $CORTEX_NAMESPACE -subjects: -- kind: ServiceAccount - name: argo-controller - namespace: $CORTEX_NAMESPACE -roleRef: - kind: Role - name: argo-controller - apiGroup: rbac.authorization.k8s.io ---- - -apiVersion: v1 -kind: ConfigMap -metadata: - name: argo-controller - namespace: $CORTEX_NAMESPACE -data: - config: | - namespace: $CORTEX_NAMESPACE ---- - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: argo-controller - namespace: $CORTEX_NAMESPACE -spec: - selector: - matchLabels: - app: argo-controller - template: - metadata: - labels: - app: argo-controller - spec: - containers: - - 
args: - - --configmap - - argo-controller - - --executor-image - - $CORTEX_IMAGE_ARGO_EXECUTOR - - --executor-image-pull-policy - - Always - command: - - workflow-controller - image: $CORTEX_IMAGE_ARGO_CONTROLLER - imagePullPolicy: Always - name: argo-controller - serviceAccountName: argo-controller diff --git a/manager/manifests/operator.yaml b/manager/manifests/operator.yaml index 16669dd2b1..70e4b63a19 100644 --- a/manager/manifests/operator.yaml +++ b/manager/manifests/operator.yaml @@ -45,11 +45,11 @@ spec: replicas: 1 selector: matchLabels: - workloadId: operator + workloadID: operator template: metadata: labels: - workloadId: operator + workloadID: operator workloadType: operator spec: containers: @@ -86,7 +86,7 @@ metadata: workloadType: operator spec: selector: - workloadId: operator + workloadID: operator ports: - port: 8888 targetPort: 8888 diff --git a/manager/uninstall_cortex.sh b/manager/uninstall_cortex.sh index 720cb22ff0..cac7c0687b 100755 --- a/manager/uninstall_cortex.sh +++ b/manager/uninstall_cortex.sh @@ -30,7 +30,6 @@ fi kubectl delete --ignore-not-found=true customresourcedefinition scheduledsparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 kubectl delete --ignore-not-found=true customresourcedefinition sparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 -kubectl delete --ignore-not-found=true customresourcedefinition workflows.argoproj.io >/dev/null 2>&1 kubectl delete --ignore-not-found=true namespace $CORTEX_NAMESPACE >/dev/null 2>&1 echo "✓ Uninstalled Cortex" diff --git a/pkg/lib/argo/argo.go b/pkg/lib/argo/argo.go deleted file mode 100644 index 5f53d8911a..0000000000 --- a/pkg/lib/argo/argo.go +++ /dev/null @@ -1,319 +0,0 @@ -/* -Copyright 2019 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package argo - -import ( - "strings" - "time" - - argowf "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1" - argoclientset "github.com/argoproj/argo/pkg/client/clientset/versioned" - argoclientapi "github.com/argoproj/argo/pkg/client/clientset/versioned/typed/workflow/v1alpha1" - kerrors "k8s.io/apimachinery/pkg/api/errors" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - kclientrest "k8s.io/client-go/rest" - - "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/maps" - "github.com/cortexlabs/cortex/pkg/lib/pointer" - "github.com/cortexlabs/cortex/pkg/lib/sets/strset" - "github.com/cortexlabs/cortex/pkg/lib/slices" -) - -var ( - doneStates = []string{ - string(argowf.NodeSucceeded), - string(argowf.NodeSkipped), - string(argowf.NodeFailed), - string(argowf.NodeError), - } - runningStates = []string{ - string(argowf.NodeRunning), - } -) - -type Client struct { - workflowClient argoclientapi.WorkflowInterface - namespace string -} - -func New(restConfig *kclientrest.Config, namespace string) *Client { - client := &Client{ - namespace: namespace, - } - wfcs := argoclientset.NewForConfigOrDie(restConfig) - client.workflowClient = wfcs.ArgoprojV1alpha1().Workflows(namespace) - return client -} - -type WorkflowTask struct { - Name string - Action string - Manifest string - SuccessCondition string - FailureCondition string - Dependencies []string - Labels map[string]string -} - -func (c *Client) NewWorkflow(name string, labels ...map[string]string) *argowf.Workflow { - name = "argo-" + name - if !strings.HasSuffix(name, "-") && !strings.HasSuffix(name, "_") { - name = name + "-" - } - allLabels := maps.MergeStrMaps(labels...) - - return &argowf.Workflow{ - ObjectMeta: kmeta.ObjectMeta{ - GenerateName: name, - Namespace: c.namespace, - Labels: allLabels, - }, - Spec: argowf.WorkflowSpec{ - ServiceAccountName: "argo-executor", - Entrypoint: "DAG", - Templates: []argowf.Template{ - { - Name: "DAG", - DAG: &argowf.DAGTemplate{ - Tasks: []argowf.DAGTask{}, - }, - }, - }, - }, - } -} - -func AddTask(wf *argowf.Workflow, task *WorkflowTask) *argowf.Workflow { - if task == nil { - return wf - } - - DAGTask := argowf.DAGTask{ - Name: task.Name, - Template: task.Name, - Dependencies: slices.RemoveEmptiesAndUnique(task.Dependencies), - } - - // All tasks are added to the DAG template which is first - wf.Spec.Templates[0].DAG.Tasks = append(wf.Spec.Templates[0].DAG.Tasks, DAGTask) - - labels := task.Labels - labels["argo"] = "true" - - template := argowf.Template{ - Name: task.Name, - Resource: &argowf.ResourceTemplate{ - Action: task.Action, - Manifest: task.Manifest, - SuccessCondition: task.SuccessCondition, - FailureCondition: task.FailureCondition, - }, - Metadata: argowf.Metadata{ - Labels: labels, - }, - } - - wf.Spec.Templates = append(wf.Spec.Templates, template) - - return wf -} - -func EnableGC(spec kmeta.Object) { - ownerReferences := spec.GetOwnerReferences() - ownerReferences = append(ownerReferences, kmeta.OwnerReference{ - APIVersion: "argoproj.io/v1alpha1", - Kind: "Workflow", - Name: "{{workflow.name}}", - UID: "{{workflow.uid}}", - BlockOwnerDeletion: pointer.Bool(false), - }) - spec.SetOwnerReferences(ownerReferences) -} - -func NumTasks(wf *argowf.Workflow) int { - if wf == nil || len(wf.Spec.Templates) == 0 { - return 0 - } - return len(wf.Spec.Templates[0].DAG.Tasks) -} - -func (c *Client) Run(wf *argowf.Workflow) error { - _, err := c.workflowClient.Create(wf) - if err != nil { - return 
errors.WithStack(err) - } - return nil -} - -func (c *Client) List(opts *kmeta.ListOptions) ([]argowf.Workflow, error) { - if opts == nil { - opts = &kmeta.ListOptions{} - } - wfList, err := c.workflowClient.List(*opts) - if err != nil { - return nil, errors.WithStack(err) - } - return wfList.Items, nil -} - -func (c *Client) ListByLabels(labels map[string]string) ([]argowf.Workflow, error) { - opts := &kmeta.ListOptions{ - LabelSelector: k8s.LabelSelector(labels), - } - return c.List(opts) -} - -func (c *Client) ListByLabel(labelKey string, labelValue string) ([]argowf.Workflow, error) { - return c.ListByLabels(map[string]string{labelKey: labelValue}) -} - -func (c *Client) ListRunning(labels ...map[string]string) ([]argowf.Workflow, error) { - wfs, err := c.ListByLabels(maps.MergeStrMaps(labels...)) - if err != nil { - return wfs, err - } - runningWfs := []argowf.Workflow{} - for _, wf := range wfs { - if IsRunning(&wf) { - runningWfs = append(runningWfs, wf) - } - } - return runningWfs, nil -} - -func (c *Client) ListDone(labels ...map[string]string) ([]argowf.Workflow, error) { - wfs, err := c.ListByLabels(maps.MergeStrMaps(labels...)) - if err != nil { - return wfs, err - } - doneWfs := []argowf.Workflow{} - for _, wf := range wfs { - if IsDone(&wf) { - doneWfs = append(doneWfs, wf) - } - } - return doneWfs, nil -} - -func (c *Client) Delete(wfName string) (bool, error) { - err := c.workflowClient.Delete(wfName, &kmeta.DeleteOptions{}) - if kerrors.IsNotFound(err) { - return false, nil - } else if err != nil { - return false, errors.WithStack(err) - } - return true, nil -} - -func (c *Client) DeleteMultiple(wfs []argowf.Workflow) error { - errs := []error{} - for _, wf := range wfs { - _, err := c.Delete(wf.Name) - errs = append(errs, err) - } - return errors.FirstError(errs...) 
-} - -func IsDone(wf *argowf.Workflow) bool { - if wf == nil { - return true - } - return slices.HasString(doneStates, string(wf.Status.Phase)) -} - -func IsRunning(wf *argowf.Workflow) bool { - if wf == nil { - return false - } - return slices.HasString(runningStates, string(wf.Status.Phase)) -} - -type WorkflowItem struct { - Task *argowf.DAGTask - Template *argowf.Template - NodeStatus *argowf.NodeStatus - Labels map[string]string -} - -// ParseWorkflow returns task name -> *WorkflowItem -func ParseWorkflow(wf *argowf.Workflow) map[string]*WorkflowItem { - if wf == nil { - return nil - } - pWf := make(map[string]*WorkflowItem) - for _, task := range wf.Spec.Templates[0].DAG.Tasks { - initTask(task, pWf) - } - for i, template := range wf.Spec.Templates { - if i != 0 { - addTemplate(template, pWf) - } - } - for _, nodeStatus := range wf.Status.Nodes { - addNodeStatus(nodeStatus, pWf) - } - return pWf -} - -func initTask(task argowf.DAGTask, pWf map[string]*WorkflowItem) { - pWf[task.Name] = &WorkflowItem{ - Task: &task, - } -} - -func addTemplate(template argowf.Template, pWf map[string]*WorkflowItem) { - pWf[template.Name].Template = &template - pWf[template.Name].Labels = template.Metadata.Labels -} - -func addNodeStatus(nodeStatus argowf.NodeStatus, pWf map[string]*WorkflowItem) { - if nodeStatus.Type != argowf.NodeTypePod { - return - } - pWf[nodeStatus.TemplateName].NodeStatus = &nodeStatus -} - -func (wfItem *WorkflowItem) StartedAt() *time.Time { - if wfItem.NodeStatus != nil && !wfItem.NodeStatus.StartedAt.Time.IsZero() { - return &wfItem.NodeStatus.StartedAt.Time - } - return nil -} - -func (wfItem *WorkflowItem) FinishedAt() *time.Time { - if wfItem.NodeStatus != nil && !wfItem.NodeStatus.FinishedAt.Time.IsZero() { - return &wfItem.NodeStatus.FinishedAt.Time - } - return nil -} - -func (wfItem *WorkflowItem) Phase() *argowf.NodePhase { - if wfItem.NodeStatus != nil { - return &wfItem.NodeStatus.Phase - } - return nil -} - -func (wfItem *WorkflowItem) Dependencies() strset.Set { - if wfItem.Task != nil && wfItem.Task.Dependencies != nil { - return strset.New(wfItem.Task.Dependencies...) - } - - return strset.New() -} diff --git a/pkg/lib/debug/debug.go b/pkg/lib/debug/debug.go index a1c860d46b..f5a939ea3a 100644 --- a/pkg/lib/debug/debug.go +++ b/pkg/lib/debug/debug.go @@ -20,6 +20,7 @@ import ( "encoding/json" "fmt" + "github.com/cortexlabs/yaml" "github.com/davecgh/go-spew/spew" "github.com/cortexlabs/cortex/pkg/lib/errors" @@ -51,3 +52,11 @@ func Ppj(obj interface{}) { } fmt.Println(string(b)) } + +func Ppy(obj interface{}) { + b, err := yaml.Marshal(obj) + if err != nil { + errors.PrintError(err) + } + fmt.Println(string(b)) +} diff --git a/pkg/lib/k8s/configmap.go b/pkg/lib/k8s/configmap.go new file mode 100644 index 0000000000..fb6b4f8a62 --- /dev/null +++ b/pkg/lib/k8s/configmap.go @@ -0,0 +1,157 @@ +/* +Copyright 2019 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package k8s + +import ( + kcore "k8s.io/api/core/v1" + kerrors "k8s.io/apimachinery/pkg/api/errors" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/cortexlabs/cortex/pkg/lib/errors" +) + +var configMapTypeMeta = kmeta.TypeMeta{ + APIVersion: "v1", + Kind: "ConfigMap", +} + +type ConfigMapSpec struct { + Name string + Namespace string + Data map[string]string + Labels map[string]string +} + +func ConfigMap(spec *ConfigMapSpec) *kcore.ConfigMap { + if spec.Namespace == "" { + spec.Namespace = "default" + } + configMap := &kcore.ConfigMap{ + TypeMeta: configMapTypeMeta, + ObjectMeta: kmeta.ObjectMeta{ + Name: spec.Name, + Namespace: spec.Namespace, + Labels: spec.Labels, + }, + Data: spec.Data, + } + return configMap +} + +func (c *Client) CreateConfigMap(configMap *kcore.ConfigMap) (*kcore.ConfigMap, error) { + configMap.TypeMeta = configMapTypeMeta + configMap, err := c.configMapClient.Create(configMap) + if err != nil { + return nil, errors.WithStack(err) + } + return configMap, nil +} + +func (c *Client) updateConfigMap(configMap *kcore.ConfigMap) (*kcore.ConfigMap, error) { + configMap.TypeMeta = configMapTypeMeta + configMap, err := c.configMapClient.Update(configMap) + if err != nil { + return nil, errors.WithStack(err) + } + return configMap, nil +} + +func (c *Client) ApplyConfigMap(configMap *kcore.ConfigMap) (*kcore.ConfigMap, error) { + existing, err := c.GetConfigMap(configMap.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.CreateConfigMap(configMap) + } + return c.updateConfigMap(configMap) +} + +func (c *Client) GetConfigMap(name string) (*kcore.ConfigMap, error) { + configMap, err := c.configMapClient.Get(name, kmeta.GetOptions{}) + if kerrors.IsNotFound(err) { + return nil, nil + } + if err != nil { + return nil, errors.WithStack(err) + } + configMap.TypeMeta = configMapTypeMeta + return configMap, nil +} + +func (c *Client) GetConfigMapData(name string) (map[string]string, error) { + configMap, err := c.GetConfigMap(name) + if err != nil { + return nil, err + } + if configMap == nil { + return nil, nil + } + return configMap.Data, nil +} + +func (c *Client) DeleteConfigMap(name string) (bool, error) { + err := c.configMapClient.Delete(name, deleteOpts) + if kerrors.IsNotFound(err) { + return false, nil + } + if err != nil { + return false, errors.WithStack(err) + } + return true, nil +} + +func (c *Client) ConfigMapExists(name string) (bool, error) { + configMap, err := c.GetConfigMap(name) + if err != nil { + return false, err + } + return configMap != nil, nil +} + +func (c *Client) ListConfigMaps(opts *kmeta.ListOptions) ([]kcore.ConfigMap, error) { + if opts == nil { + opts = &kmeta.ListOptions{} + } + configMapList, err := c.configMapClient.List(*opts) + if err != nil { + return nil, errors.WithStack(err) + } + for i := range configMapList.Items { + configMapList.Items[i].TypeMeta = configMapTypeMeta + } + return configMapList.Items, nil +} + +func (c *Client) ListConfigMapsByLabels(labels map[string]string) ([]kcore.ConfigMap, error) { + opts := &kmeta.ListOptions{ + LabelSelector: LabelSelector(labels), + } + return c.ListConfigMaps(opts) +} + +func (c *Client) ListConfigMapsByLabel(labelKey string, labelValue string) ([]kcore.ConfigMap, error) { + return c.ListConfigMapsByLabels(map[string]string{labelKey: labelValue}) +} + +func ConfigMapMap(configMaps []kcore.ConfigMap) map[string]kcore.ConfigMap { + configMapMap := map[string]kcore.ConfigMap{} + for _, configMap := range configMaps { + 
configMapMap[configMap.Name] = configMap + } + return configMapMap +} diff --git a/pkg/lib/k8s/deployment.go b/pkg/lib/k8s/deployment.go index aafa0a0b1f..3501ff7100 100644 --- a/pkg/lib/k8s/deployment.go +++ b/pkg/lib/k8s/deployment.go @@ -32,8 +32,6 @@ var deploymentTypeMeta = kmeta.TypeMeta{ Kind: "Deployment", } -const DeploymentSuccessConditionAll = "!status.unavailableReplicas" - type DeploymentSpec struct { Name string Namespace string @@ -82,15 +80,17 @@ func Deployment(spec *DeploymentSpec) *kapps.Deployment { return deployment } -func (c *Client) CreateDeployment(spec *DeploymentSpec) (*kapps.Deployment, error) { - deployment, err := c.deploymentClient.Create(Deployment(spec)) +func (c *Client) CreateDeployment(deployment *kapps.Deployment) (*kapps.Deployment, error) { + deployment.TypeMeta = deploymentTypeMeta + deployment, err := c.deploymentClient.Create(deployment) if err != nil { return nil, errors.WithStack(err) } return deployment, nil } -func (c *Client) UpdateDeployment(deployment *kapps.Deployment) (*kapps.Deployment, error) { +func (c *Client) updateDeployment(deployment *kapps.Deployment) (*kapps.Deployment, error) { + deployment.TypeMeta = deploymentTypeMeta deployment, err := c.deploymentClient.Update(deployment) if err != nil { return nil, errors.WithStack(err) @@ -98,6 +98,17 @@ func (c *Client) UpdateDeployment(deployment *kapps.Deployment) (*kapps.Deployme return deployment, nil } +func (c *Client) ApplyDeployment(deployment *kapps.Deployment) (*kapps.Deployment, error) { + existing, err := c.GetDeployment(deployment.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.CreateDeployment(deployment) + } + return c.updateDeployment(deployment) +} + func (c *Client) GetDeployment(name string) (*kapps.Deployment, error) { deployment, err := c.deploymentClient.Get(name, kmeta.GetOptions{}) if kerrors.IsNotFound(err) { diff --git a/pkg/lib/k8s/hpa.go b/pkg/lib/k8s/hpa.go index 134170e1ac..dee3ab7379 100644 --- a/pkg/lib/k8s/hpa.go +++ b/pkg/lib/k8s/hpa.go @@ -63,15 +63,17 @@ func HPA(spec *HPASpec) *kautoscaling.HorizontalPodAutoscaler { return hpa } -func (c *Client) CreateHPA(spec *HPASpec) (*kautoscaling.HorizontalPodAutoscaler, error) { - hpa, err := c.hpaClient.Create(HPA(spec)) +func (c *Client) CreateHPA(hpa *kautoscaling.HorizontalPodAutoscaler) (*kautoscaling.HorizontalPodAutoscaler, error) { + hpa.TypeMeta = hpaTypeMeta + hpa, err := c.hpaClient.Create(hpa) if err != nil { return nil, errors.WithStack(err) } return hpa, nil } -func (c *Client) UpdateHPA(hpa *kautoscaling.HorizontalPodAutoscaler) (*kautoscaling.HorizontalPodAutoscaler, error) { +func (c *Client) updateHPA(hpa *kautoscaling.HorizontalPodAutoscaler) (*kautoscaling.HorizontalPodAutoscaler, error) { + hpa.TypeMeta = hpaTypeMeta hpa, err := c.hpaClient.Update(hpa) if err != nil { return nil, errors.WithStack(err) @@ -79,6 +81,17 @@ func (c *Client) UpdateHPA(hpa *kautoscaling.HorizontalPodAutoscaler) (*kautosca return hpa, nil } +func (c *Client) ApplyHPA(hpa *kautoscaling.HorizontalPodAutoscaler) (*kautoscaling.HorizontalPodAutoscaler, error) { + existing, err := c.GetHPA(hpa.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.CreateHPA(hpa) + } + return c.updateHPA(hpa) +} + func (c *Client) GetHPA(name string) (*kautoscaling.HorizontalPodAutoscaler, error) { hpa, err := c.hpaClient.Get(name, kmeta.GetOptions{}) if kerrors.IsNotFound(err) { diff --git a/pkg/lib/k8s/ingress.go b/pkg/lib/k8s/ingress.go index fc29a815b9..56f5f77de3 100644 --- 
a/pkg/lib/k8s/ingress.go +++ b/pkg/lib/k8s/ingress.go @@ -80,15 +80,17 @@ func Ingress(spec *IngressSpec) *kextensions.Ingress { return ingress } -func (c *Client) CreateIngress(spec *IngressSpec) (*kextensions.Ingress, error) { - ingress, err := c.ingressClient.Create(Ingress(spec)) +func (c *Client) CreateIngress(ingress *kextensions.Ingress) (*kextensions.Ingress, error) { + ingress.TypeMeta = ingressTypeMeta + ingress, err := c.ingressClient.Create(ingress) if err != nil { return nil, errors.WithStack(err) } return ingress, nil } -func (c *Client) UpdateIngress(ingress *kextensions.Ingress) (*kextensions.Ingress, error) { +func (c *Client) updateIngress(ingress *kextensions.Ingress) (*kextensions.Ingress, error) { + ingress.TypeMeta = ingressTypeMeta ingress, err := c.ingressClient.Update(ingress) if err != nil { return nil, errors.WithStack(err) @@ -96,6 +98,17 @@ func (c *Client) UpdateIngress(ingress *kextensions.Ingress) (*kextensions.Ingre return ingress, nil } +func (c *Client) ApplyIngress(ingress *kextensions.Ingress) (*kextensions.Ingress, error) { + existing, err := c.GetIngress(ingress.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.CreateIngress(ingress) + } + return c.updateIngress(ingress) +} + func (c *Client) GetIngress(name string) (*kextensions.Ingress, error) { ingress, err := c.ingressClient.Get(name, kmeta.GetOptions{}) if kerrors.IsNotFound(err) { diff --git a/pkg/lib/k8s/job.go b/pkg/lib/k8s/job.go index 0745f684d5..e96f665c4a 100644 --- a/pkg/lib/k8s/job.go +++ b/pkg/lib/k8s/job.go @@ -25,9 +25,6 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" ) -const JobSuccessCondition = "status.succeeded > 0" -const JobFailureCondition = "status.failed > 0" - var jobTypeMeta = kmeta.TypeMeta{ APIVersion: "batch/v1", Kind: "Job", @@ -79,15 +76,17 @@ func Job(spec *JobSpec) *kbatch.Job { return job } -func (c *Client) CreateJob(spec *JobSpec) (*kbatch.Job, error) { - job, err := c.jobClient.Create(Job(spec)) +func (c *Client) CreateJob(job *kbatch.Job) (*kbatch.Job, error) { + job.TypeMeta = jobTypeMeta + job, err := c.jobClient.Create(job) if err != nil { return nil, errors.WithStack(err) } return job, nil } -func (c *Client) UpdateJob(job *kbatch.Job) (*kbatch.Job, error) { +func (c *Client) updateJob(job *kbatch.Job) (*kbatch.Job, error) { + job.TypeMeta = jobTypeMeta job, err := c.jobClient.Update(job) if err != nil { return nil, errors.WithStack(err) @@ -95,6 +94,17 @@ func (c *Client) UpdateJob(job *kbatch.Job) (*kbatch.Job, error) { return job, nil } +func (c *Client) ApplyJob(job *kbatch.Job) (*kbatch.Job, error) { + existing, err := c.GetJob(job.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.CreateJob(job) + } + return c.updateJob(job) +} + func (c *Client) GetJob(name string) (*kbatch.Job, error) { job, err := c.jobClient.Get(name, kmeta.GetOptions{}) if kerrors.IsNotFound(err) { @@ -158,3 +168,14 @@ func JobMap(jobs []kbatch.Job) map[string]kbatch.Job { } return jobMap } + +func (c *Client) IsJobRunning(name string) (bool, error) { + job, err := c.GetJob(name) + if err != nil { + return false, err + } + if job == nil { + return false, nil + } + return job.Status.CompletionTime == nil, nil +} diff --git a/pkg/lib/k8s/k8s.go b/pkg/lib/k8s/k8s.go index a1e202ab7d..4e9758c1da 100644 --- a/pkg/lib/k8s/k8s.go +++ b/pkg/lib/k8s/k8s.go @@ -50,6 +50,7 @@ type Client struct { clientset *kclientset.Clientset podClient kclientcore.PodInterface serviceClient kclientcore.ServiceInterface + configMapClient 
kclientcore.ConfigMapInterface deploymentClient kclientapps.DeploymentInterface jobClient kclientbatch.JobInterface ingressClient kclientextensions.IngressInterface @@ -80,6 +81,7 @@ func New(namespace string, inCluster bool) (*Client, error) { client.podClient = client.clientset.CoreV1().Pods(namespace) client.serviceClient = client.clientset.CoreV1().Services(namespace) + client.configMapClient = client.clientset.CoreV1().ConfigMaps(namespace) client.deploymentClient = client.clientset.AppsV1().Deployments(namespace) client.jobClient = client.clientset.BatchV1().Jobs(namespace) client.ingressClient = client.clientset.ExtensionsV1beta1().Ingresses(namespace) diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index 5f258abe7c..7e132bc2e3 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -75,15 +75,17 @@ func Pod(spec *PodSpec) *kcore.Pod { return pod } -func (c *Client) CreatePod(spec *PodSpec) (*kcore.Pod, error) { - pod, err := c.podClient.Create(Pod(spec)) +func (c *Client) CreatePod(pod *kcore.Pod) (*kcore.Pod, error) { + pod.TypeMeta = podTypeMeta + pod, err := c.podClient.Create(pod) if err != nil { return nil, errors.WithStack(err) } return pod, nil } -func (c *Client) UpdatePod(pod *kcore.Pod) (*kcore.Pod, error) { +func (c *Client) updatePod(pod *kcore.Pod) (*kcore.Pod, error) { + pod.TypeMeta = podTypeMeta pod, err := c.podClient.Update(pod) if err != nil { return nil, errors.WithStack(err) @@ -91,6 +93,17 @@ func (c *Client) UpdatePod(pod *kcore.Pod) (*kcore.Pod, error) { return pod, nil } +func (c *Client) ApplyPod(pod *kcore.Pod) (*kcore.Pod, error) { + existing, err := c.GetPod(pod.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.CreatePod(pod) + } + return c.updatePod(pod) +} + func GetPodLastContainerStartTime(pod *kcore.Pod) *time.Time { var startTime *time.Time for _, containerStatus := range pod.Status.ContainerStatuses { diff --git a/pkg/lib/k8s/service.go b/pkg/lib/k8s/service.go index 63f2d9f924..8bd4149979 100644 --- a/pkg/lib/k8s/service.go +++ b/pkg/lib/k8s/service.go @@ -66,15 +66,17 @@ func Service(spec *ServiceSpec) *kcore.Service { return service } -func (c *Client) CreateService(spec *ServiceSpec) (*kcore.Service, error) { - service, err := c.serviceClient.Create(Service(spec)) +func (c *Client) CreateService(service *kcore.Service) (*kcore.Service, error) { + service.TypeMeta = serviceTypeMeta + service, err := c.serviceClient.Create(service) if err != nil { return nil, errors.WithStack(err) } return service, nil } -func (c *Client) UpdateService(service *kcore.Service) (*kcore.Service, error) { +func (c *Client) updateService(service *kcore.Service) (*kcore.Service, error) { + service.TypeMeta = serviceTypeMeta service, err := c.serviceClient.Update(service) if err != nil { return nil, errors.WithStack(err) @@ -82,6 +84,20 @@ func (c *Client) UpdateService(service *kcore.Service) (*kcore.Service, error) { return service, nil } +func (c *Client) ApplyService(service *kcore.Service) (*kcore.Service, error) { + existing, err := c.GetService(service.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.CreateService(service) + } + + service.Spec.ClusterIP = existing.Spec.ClusterIP + service.ResourceVersion = existing.ResourceVersion + return c.updateService(service) +} + func (c *Client) GetService(name string) (*kcore.Service, error) { service, err := c.serviceClient.Get(name, kmeta.GetOptions{}) if kerrors.IsNotFound(err) { diff --git a/pkg/lib/spark/spark.go b/pkg/lib/spark/spark.go index 
dd65bf9bbf..19c64e6650 100644 --- a/pkg/lib/spark/spark.go +++ b/pkg/lib/spark/spark.go @@ -17,8 +17,6 @@ limitations under the License. package spark import ( - "strings" - sparkop "github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/apis/sparkoperator.k8s.io/v1alpha1" sparkopclientset "github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/client/clientset/versioned" sparkopclientapi "github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/client/clientset/versioned/typed/sparkoperator.k8s.io/v1alpha1" @@ -28,7 +26,6 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/slices" ) type Client struct { @@ -36,55 +33,117 @@ type Client struct { sparkClient sparkopclientapi.SparkApplicationInterface } -var ( - doneStates = []string{ - string(sparkop.CompletedState), - string(sparkop.FailedState), - string(sparkop.FailedSubmissionState), - string(sparkop.UnknownState), +func New(restConfig *kclientrest.Config, namespace string) (*Client, error) { + var err error + client := &Client{} + client.sparkClientset, err = sparkopclientset.NewForConfig(restConfig) + if err != nil { + return nil, errors.Wrap(err, "spark", "kubeconfig") } - runningStates = []string{ - string(sparkop.NewState), - string(sparkop.SubmittedState), - string(sparkop.RunningState), + client.sparkClient = client.sparkClientset.SparkoperatorV1alpha1().SparkApplications(namespace) + return client, nil +} + +var sparkAppTypeMeta = kmeta.TypeMeta{ + APIVersion: "sparkoperator.k8s.io/v1alpha1", + Kind: "SparkApplication", +} + +type Spec struct { + Name string + Namespace string + Spec sparkop.SparkApplicationSpec + Labels map[string]string +} + +func App(spec *Spec) *sparkop.SparkApplication { + if spec.Namespace == "" { + spec.Namespace = "default" } + return &sparkop.SparkApplication{ + TypeMeta: sparkAppTypeMeta, + ObjectMeta: kmeta.ObjectMeta{ + Name: spec.Name, + Namespace: spec.Namespace, + Labels: spec.Labels, + }, + Spec: spec.Spec, + } +} - successStates = []string{ - string(sparkop.CompletedState), +func (c *Client) Create(sparkApp *sparkop.SparkApplication) (*sparkop.SparkApplication, error) { + sparkApp.TypeMeta = sparkAppTypeMeta + sparkApp, err := c.sparkClient.Create(sparkApp) + if err != nil { + return nil, errors.WithStack(err) } + return sparkApp, nil +} - failureStates = []string{ - string(sparkop.FailedState), - string(sparkop.FailedSubmissionState), - string(sparkop.UnknownState), +func (c *Client) update(sparkApp *sparkop.SparkApplication) (*sparkop.SparkApplication, error) { + sparkApp.TypeMeta = sparkAppTypeMeta + sparkApp, err := c.sparkClient.Update(sparkApp) + if err != nil { + return nil, errors.WithStack(err) } + return sparkApp, nil +} - SuccessCondition = "status.applicationState.state in (" + strings.Join(successStates, ",") + ")" - FailureCondition = "status.applicationState.state in (" + strings.Join(failureStates, ",") + ")" -) +func (c *Client) Apply(sparkApp *sparkop.SparkApplication) (*sparkop.SparkApplication, error) { + existing, err := c.Get(sparkApp.Name) + if err != nil { + return nil, err + } + if existing == nil { + return c.Create(sparkApp) + } + return c.update(sparkApp) +} -func New(restConfig *kclientrest.Config, namespace string) (*Client, error) { - var err error - client := &Client{} - client.sparkClientset, err = sparkopclientset.NewForConfig(restConfig) +func (c *Client) Get(name string) (*sparkop.SparkApplication, error) { + sparkApp, err := c.sparkClient.Get(name, kmeta.GetOptions{}) 
+ if kerrors.IsNotFound(err) { + return nil, nil + } if err != nil { - return nil, errors.Wrap(err, "spark", "kubeconfig") + return nil, errors.WithStack(err) } + sparkApp.TypeMeta = sparkAppTypeMeta + return sparkApp, nil +} - client.sparkClient = client.sparkClientset.SparkoperatorV1alpha1().SparkApplications(namespace) - return client, nil +func (c *Client) Delete(appName string) (bool, error) { + err := c.sparkClient.Delete(appName, &kmeta.DeleteOptions{}) + if kerrors.IsNotFound(err) { + return false, nil + } + if err != nil { + return false, errors.WithStack(err) + } + return true, nil +} + +func (c *Client) Exists(name string) (bool, error) { + sparkApp, err := c.Get(name) + if err != nil { + return false, err + } + return sparkApp != nil, nil } func (c *Client) List(opts *kmeta.ListOptions) ([]sparkop.SparkApplication, error) { if opts == nil { opts = &kmeta.ListOptions{} } - sparkList, err := c.sparkClient.List(*opts) + sparkAppList, err := c.sparkClient.List(*opts) if err != nil { return nil, errors.WithStack(err) } - return sparkList.Items, nil + for i := range sparkAppList.Items { + sparkAppList.Items[i].TypeMeta = sparkAppTypeMeta + } + return sparkAppList.Items, nil } func (c *Client) ListByLabels(labels map[string]string) ([]sparkop.SparkApplication, error) { @@ -98,17 +157,21 @@ func (c *Client) ListByLabel(labelKey string, labelValue string) ([]sparkop.Spar return c.ListByLabels(map[string]string{labelKey: labelValue}) } -func (c *Client) Delete(appName string) (bool, error) { - err := c.sparkClient.Delete(appName, &kmeta.DeleteOptions{}) - if kerrors.IsNotFound(err) { - return false, nil +func Map(services []sparkop.SparkApplication) map[string]sparkop.SparkApplication { + sparkAppMap := map[string]sparkop.SparkApplication{} + for _, sparkApp := range services { + sparkAppMap[sparkApp.Name] = sparkApp } - if err != nil { - return false, errors.WithStack(err) - } - return true, nil + return sparkAppMap } -func IsDone(sparkApp *sparkop.SparkApplication) bool { - return slices.HasString(doneStates, string(sparkApp.Status.AppState.State)) +func (c *Client) IsRunning(name string) (bool, error) { + sparkApp, err := c.Get(name) + if err != nil { + return false, err + } + if sparkApp == nil { + return false, nil + } + return sparkApp.Status.CompletionTime.IsZero(), nil } diff --git a/pkg/operator/api/context/context.go b/pkg/operator/api/context/context.go index 7dafe70b25..0aef70158b 100644 --- a/pkg/operator/api/context/context.go +++ b/pkg/operator/api/context/context.go @@ -196,15 +196,6 @@ func (ctx *Context) AllResourcesByName(name string) []Resource { return resources } -// Overwrites any existing workload IDs -func (ctx *Context) PopulateWorkloadIDs(resourceWorkloadIDs map[string]string) { - for _, res := range ctx.ComputedResources() { - if workloadID, ok := resourceWorkloadIDs[res.GetID()]; ok { - res.SetWorkloadID(workloadID) - } - } -} - func (ctx *Context) CheckAllWorkloadIDsPopulated() error { for _, res := range ctx.ComputedResources() { if res.GetWorkloadID() == "" { diff --git a/pkg/operator/api/context/dependencies.go b/pkg/operator/api/context/dependencies.go index af10369d79..bc2de00a32 100644 --- a/pkg/operator/api/context/dependencies.go +++ b/pkg/operator/api/context/dependencies.go @@ -27,9 +27,13 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/api/resource" ) -func (ctx *Context) AllComputedResourceDependencies(resourceID string) strset.Set { +// Get all dependencies for resourceID(s). 
Note: provided resourceIDs are not included in the dependency set +func (ctx *Context) AllComputedResourceDependencies(resourceIDs ...string) strset.Set { allDependencies := strset.New() - ctx.allComputedResourceDependenciesHelper(resourceID, allDependencies) + for _, resourceID := range resourceIDs { + ctx.allComputedResourceDependenciesHelper(resourceID, allDependencies) + } + allDependencies.Remove(resourceIDs...) return allDependencies } @@ -43,42 +47,46 @@ func (ctx *Context) allComputedResourceDependenciesHelper(resourceID string, all } } -func (ctx *Context) DirectComputedResourceDependencies(resourceID string) strset.Set { - for _, pythonPackage := range ctx.PythonPackages { - if pythonPackage.GetID() == resourceID { - return ctx.pythonPackageDependencies(pythonPackage) - } - } - for _, rawColumn := range ctx.RawColumns { - if rawColumn.GetID() == resourceID { - return ctx.rawColumnDependencies(rawColumn) +// Get all dependencies for resourceID(s). Note: provided resourceIDs are not included in the dependency set +func (ctx *Context) DirectComputedResourceDependencies(resourceIDs ...string) strset.Set { + allDependencies := strset.New() + for _, resourceID := range resourceIDs { + for _, pythonPackage := range ctx.PythonPackages { + if pythonPackage.GetID() == resourceID { + allDependencies.Merge(ctx.pythonPackageDependencies(pythonPackage)) + } } - } - for _, aggregate := range ctx.Aggregates { - if aggregate.ID == resourceID { - return ctx.aggregatesDependencies(aggregate) + for _, rawColumn := range ctx.RawColumns { + if rawColumn.GetID() == resourceID { + allDependencies.Merge(ctx.rawColumnDependencies(rawColumn)) + } } - } - for _, transformedColumn := range ctx.TransformedColumns { - if transformedColumn.ID == resourceID { - return ctx.transformedColumnDependencies(transformedColumn) + for _, aggregate := range ctx.Aggregates { + if aggregate.ID == resourceID { + allDependencies.Merge(ctx.aggregatesDependencies(aggregate)) + } } - } - for _, model := range ctx.Models { - if model.ID == resourceID { - return ctx.modelDependencies(model) + for _, transformedColumn := range ctx.TransformedColumns { + if transformedColumn.ID == resourceID { + allDependencies.Merge(ctx.transformedColumnDependencies(transformedColumn)) + } } - if model.Dataset.ID == resourceID { - return ctx.trainingDatasetDependencies(model) + for _, model := range ctx.Models { + if model.ID == resourceID { + allDependencies.Merge(ctx.modelDependencies(model)) + } + if model.Dataset.ID == resourceID { + allDependencies.Merge(ctx.trainingDatasetDependencies(model)) + } } - } - - for _, api := range ctx.APIs { - if api.ID == resourceID { - return ctx.apiDependencies(api) + for _, api := range ctx.APIs { + if api.ID == resourceID { + allDependencies.Merge(ctx.apiDependencies(api)) + } } } - return strset.New() + allDependencies.Remove(resourceIDs...) 
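
Note on the dependencies.go hunk above: the helpers now take a variadic list of resource IDs, merge every result into a single set, and finally remove the query IDs themselves. A minimal, self-contained sketch of that pattern follows; the map-based set and the hard-coded dependency graph are stand-ins for the real strset.Set and Context types, not the actual implementation.

    package main

    import "fmt"

    // set is a minimal stand-in for the strset.Set used in the real code.
    type set map[string]struct{}

    func (s set) add(items ...string) {
        for _, i := range items {
            s[i] = struct{}{}
        }
    }

    func (s set) remove(items ...string) {
        for _, i := range items {
            delete(s, i)
        }
    }

    // direct maps a resource ID to the IDs it depends on directly (illustrative data).
    var direct = map[string][]string{
        "api":     {"model"},
        "model":   {"dataset"},
        "dataset": {"raw_column"},
    }

    // allDependencies walks the graph from every root, collects the transitive
    // closure, then removes the roots, mirroring AllComputedResourceDependencies(resourceIDs ...string).
    func allDependencies(roots ...string) set {
        out := set{}
        var walk func(id string)
        walk = func(id string) {
            for _, dep := range direct[id] {
                if _, seen := out[dep]; !seen {
                    out.add(dep)
                    walk(dep)
                }
            }
        }
        for _, r := range roots {
            walk(r)
        }
        out.remove(roots...)
        return out
    }

    func main() {
        // Prints the set containing dataset and raw_column, but not api or model.
        fmt.Println(allDependencies("api", "model"))
    }
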
+ return allDependencies } func (ctx *Context) pythonPackageDependencies(pythonPackage *PythonPackage) strset.Set { diff --git a/pkg/operator/api/resource/saved_status.go b/pkg/operator/api/resource/saved_status.go index 34c4d4afb1..e68347f655 100644 --- a/pkg/operator/api/resource/saved_status.go +++ b/pkg/operator/api/resource/saved_status.go @@ -44,6 +44,7 @@ type APISavedStatus struct { type DataExitCode string const ( + ExitCodeDataUnknown DataExitCode = "" ExitCodeDataSucceeded DataExitCode = "succeeded" ExitCodeDataFailed DataExitCode = "failed" ExitCodeDataKilled DataExitCode = "killed" diff --git a/pkg/operator/api/userconfig/compute.go b/pkg/operator/api/userconfig/compute.go index 7a270e3b84..2763ee45e3 100644 --- a/pkg/operator/api/userconfig/compute.go +++ b/pkg/operator/api/userconfig/compute.go @@ -354,6 +354,9 @@ func MaxSparkCompute(sparkComputes ...*SparkCompute) *SparkCompute { aggregated := SparkCompute{} for _, sparkCompute := range sparkComputes { + if sparkCompute == nil { + continue + } if sparkCompute.Executors > aggregated.Executors { aggregated.Executors = sparkCompute.Executors } @@ -393,6 +396,9 @@ func MaxTFCompute(tfComputes ...*TFCompute) *TFCompute { aggregated := TFCompute{} for _, tc := range tfComputes { + if tc == nil { + continue + } if tc.CPU.Cmp(aggregated.CPU.Quantity) > 0 { aggregated.CPU = tc.CPU } diff --git a/pkg/operator/config/config.go b/pkg/operator/config/config.go index 6339b48195..7636faf736 100644 --- a/pkg/operator/config/config.go +++ b/pkg/operator/config/config.go @@ -20,7 +20,6 @@ import ( "path/filepath" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/argo" "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/configreader" "github.com/cortexlabs/cortex/pkg/lib/hash" @@ -34,7 +33,6 @@ var ( AWS *aws.Client Kubernetes *k8s.Client Telemetry *telemetry.Client - Argo *argo.Client Spark *spark.Client ) @@ -93,8 +91,6 @@ func Init() error { return err } - Argo = argo.New(Kubernetes.RestConfig, Kubernetes.Namespace) - if Spark, err = spark.New(Kubernetes.RestConfig, Kubernetes.Namespace); err != nil { return err } diff --git a/pkg/operator/context/context.go b/pkg/operator/context/context.go index bf931f99e3..58794f401d 100644 --- a/pkg/operator/context/context.go +++ b/pkg/operator/context/context.go @@ -311,7 +311,7 @@ func LatestWorkloadIDKey(resourceID string, appName string) string { ) } -func WorkloadSpecKey(workloadID string, appName string) string { +func BaseWorkloadKey(workloadID string, appName string) string { return filepath.Join( consts.AppsDir, appName, diff --git a/pkg/operator/endpoints/deploy.go b/pkg/operator/endpoints/deploy.go index d7d4d3acb1..b3e8d7b3e2 100644 --- a/pkg/operator/endpoints/deploy.go +++ b/pkg/operator/endpoints/deploy.go @@ -19,7 +19,6 @@ package endpoints import ( "net/http" - "github.com/cortexlabs/cortex/pkg/lib/argo" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/files" "github.com/cortexlabs/cortex/pkg/lib/zip" @@ -42,30 +41,35 @@ func Deploy(w http.ResponseWriter, r *http.Request) { return } - newWf, err := workloads.Create(ctx) + err = workloads.PopulateWorkloadIDs(ctx) if RespondIfError(w, err) { return } - existingWf, err := workloads.GetWorkflow(ctx.App.Name) + err = workloads.ValidateDeploy(ctx) if RespondIfError(w, err) { return } - isRunning := false - if existingWf != nil { - isRunning = argo.IsRunning(existingWf) + + existingCtx := workloads.CurrentContext(ctx.App.Name) + + fullCtxMatch := 
false + if existingCtx != nil && existingCtx.ID == ctx.ID && context.APIResourcesAndComputesMatch(ctx, existingCtx) { + fullCtxMatch = true + } + + isUpdating, err := workloads.IsDeploymentUpdating(ctx.App.Name) + if RespondIfError(w, err) { + return } - if isRunning { - if newWf.Labels["ctxID"] == existingWf.Labels["ctxID"] { - prevCtx := workloads.CurrentContext(ctx.App.Name) - if context.APIResourcesAndComputesMatch(ctx, prevCtx) { - respondDeploy(w, ResDeploymentRunning) - return - } + if isUpdating { + if fullCtxMatch { + respondDeploy(w, ResDeploymentUpToDateUpdating) + return } if !force { - respondDeploy(w, ResDifferentDeploymentRunning) + respondDeploy(w, ResDifferentDeploymentUpdating) return } } @@ -75,28 +79,26 @@ func Deploy(w http.ResponseWriter, r *http.Request) { return } - err = workloads.Run(newWf, ctx, existingWf) + err = workloads.Run(ctx) if RespondIfError(w, err) { return } switch { - case isRunning && ignoreCache: - respondDeploy(w, ResDeploymentStoppedCacheDeletedDeploymentStarted) - case isRunning && !ignoreCache && argo.NumTasks(newWf) == 0: - respondDeploy(w, ResDeploymentStoppedDeploymentUpToDate) - case isRunning && !ignoreCache && argo.NumTasks(newWf) != 0: - respondDeploy(w, ResDeploymentStoppedDeploymentStarted) - case !isRunning && ignoreCache: + case isUpdating && ignoreCache: respondDeploy(w, ResCachedDeletedDeploymentStarted) - case !isRunning && !ignoreCache && argo.NumTasks(newWf) == 0: - if existingWf != nil && existingWf.Labels["ctxID"] == newWf.Labels["ctxID"] { - respondDeploy(w, ResDeploymentUpToDate) - return - } + case isUpdating && !ignoreCache: respondDeploy(w, ResDeploymentUpdated) - case !isRunning && !ignoreCache && argo.NumTasks(newWf) != 0: + case !isUpdating && ignoreCache: + respondDeploy(w, ResCachedDeletedDeploymentStarted) + case !isUpdating && !ignoreCache && existingCtx == nil: respondDeploy(w, ResDeploymentStarted) + case !isUpdating && !ignoreCache && existingCtx != nil && !fullCtxMatch: + respondDeploy(w, ResDeploymentUpdated) + case !isUpdating && !ignoreCache && existingCtx != nil && fullCtxMatch: + respondDeploy(w, ResDeploymentUpToDate) + default: + respondDeploy(w, ResDeploymentUpdated) // unexpected } } diff --git a/pkg/operator/endpoints/resources.go b/pkg/operator/endpoints/resources.go index a75cdf5e6b..af7c3a93c0 100644 --- a/pkg/operator/endpoints/resources.go +++ b/pkg/operator/endpoints/resources.go @@ -40,17 +40,7 @@ func GetResources(w http.ResponseWriter, r *http.Request) { return } - deployments, err := workloads.APIDeploymentMap(ctx.App.Name) - if RespondIfError(w, err) { - return - } - - apiStatuses, err := workloads.GetCurrentAPIStatuses(dataStatuses, deployments, ctx) - if RespondIfError(w, err) { - return - } - - apiGroupStatuses, err := workloads.GetAPIGroupStatuses(apiStatuses, deployments, ctx) + apiStatuses, apiGroupStatuses, err := workloads.GetCurrentAPIAndGroupStatuses(dataStatuses, ctx) if RespondIfError(w, err) { return } diff --git a/pkg/operator/endpoints/shared.go b/pkg/operator/endpoints/shared.go index d276e02ccb..2df4e3bc15 100644 --- a/pkg/operator/endpoints/shared.go +++ b/pkg/operator/endpoints/shared.go @@ -29,16 +29,13 @@ import ( ) const ( - ResDeploymentStarted = "Deployment started" - ResDeploymentUpdated = "Deployment updated" - ResDeploymentDeleted = "Deployment deleted" - ResDeploymentUpToDate = "Deployment is up-to-date" - ResDeploymentRunning = "Deployment is already running" - ResDifferentDeploymentRunning = "Another deployment is running, use --force to override" - 
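
Note on the deploy.go hunk above: the handler's final response reduces to a small decision over isUpdating, ignoreCache, and whether the submitted context matches the currently deployed one (the ResDeploymentUpToDateUpdating and ResDifferentDeploymentUpdating responses are returned earlier, before anything runs). A compact sketch of just that final switch, using the message strings from this diff; the surrounding HTTP plumbing is omitted.

    package main

    import "fmt"

    const (
        resDeploymentStarted              = "Deployment started"
        resDeploymentUpdated              = "Deployment updated"
        resDeploymentUpToDate             = "Deployment is up-to-date"
        resCachedDeletedDeploymentStarted = "Cache deleted, deployment started"
    )

    // deployResponse mirrors the switch at the end of the Deploy handler:
    // ignoreCache always reports the cache deletion, an unchanged context is
    // reported as up-to-date, and everything else is treated as an update.
    func deployResponse(isUpdating, ignoreCache, hasExistingCtx, fullCtxMatch bool) string {
        switch {
        case ignoreCache:
            return resCachedDeletedDeploymentStarted
        case isUpdating:
            return resDeploymentUpdated
        case !hasExistingCtx:
            return resDeploymentStarted
        case fullCtxMatch:
            return resDeploymentUpToDate
        default:
            return resDeploymentUpdated
        }
    }

    func main() {
        fmt.Println(deployResponse(false, false, false, false)) // Deployment started
        fmt.Println(deployResponse(false, false, true, true))   // Deployment is up-to-date
        fmt.Println(deployResponse(true, true, true, false))    // Cache deleted, deployment started
    }

The calls in main cover the first deploy, a no-op redeploy, and a forced redeploy with the cache cleared, matching the cases enumerated in the hunk.
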
ResCachedDeletedDeploymentStarted = "Cache deleted, deployment started" - ResDeploymentStoppedDeploymentStarted = "Running deployment stopped, new deployment started" - ResDeploymentStoppedCacheDeletedDeploymentStarted = "Running deployment stopped, cached deleted, new deployment started" - ResDeploymentStoppedDeploymentUpToDate = "Running deployment stopped, new deployment is up-to-date" + ResDeploymentStarted = "Deployment started" + ResDeploymentUpdated = "Deployment updated" + ResDeploymentDeleted = "Deployment deleted" + ResDeploymentUpToDate = "Deployment is up-to-date" + ResDeploymentUpToDateUpdating = "Deployment is already updating" + ResDifferentDeploymentUpdating = "Previous deployment is currently updating, use --force to override" + ResCachedDeletedDeploymentStarted = "Cache deleted, deployment started" ) func Respond(w http.ResponseWriter, response interface{}) { diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 4e88ae6b56..be56591c7e 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -17,38 +17,21 @@ limitations under the License. package main import ( - "fmt" "log" "net/http" "strings" - "time" "github.com/gorilla/mux" - cron "gopkg.in/robfig/cron.v2" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/sets/strset" - "github.com/cortexlabs/cortex/pkg/lib/telemetry" "github.com/cortexlabs/cortex/pkg/operator/config" "github.com/cortexlabs/cortex/pkg/operator/context" "github.com/cortexlabs/cortex/pkg/operator/endpoints" "github.com/cortexlabs/cortex/pkg/operator/workloads" ) -const ( - operatorPortStr = "8888" - workflowDeletionDelay = 60 // seconds - cronInterval = 5 // seconds -) - -var ( - awsClient *aws.Client - telemtryClient *telemetry.Client - markedWorkflows = strset.New() -) +const operatorPortStr = "8888" func main() { if err := config.Init(); err != nil { @@ -67,7 +50,6 @@ func main() { } config.Telemetry.ReportEvent("operator.init") - startCron() router := mux.NewRouter() router.Use(panicMiddleware) @@ -132,71 +114,3 @@ func apiVersionCheckMiddleware(next http.Handler) http.Handler { next.ServeHTTP(w, r) }) } - -func startCron() { - cronRunner := cron.New() - cronInterval := fmt.Sprintf("@every %ds", cronInterval) - cronRunner.AddFunc(cronInterval, runCron) - cronRunner.Start() -} - -func runCron() { - defer reportAndRecover("cron failed") - apiPods, err := config.Kubernetes.ListPodsByLabels(map[string]string{ - "workloadType": workloads.WorkloadTypeAPI, - "userFacing": "true", - }) - if err != nil { - config.Telemetry.ReportError(err) - errors.PrintError(err) - } - - if err := workloads.UpdateAPISavedStatuses(apiPods); err != nil { - config.Telemetry.ReportError(err) - errors.PrintError(err) - } - - if err := workloads.UploadLogPrefixesFromAPIPods(apiPods); err != nil { - config.Telemetry.ReportError(err) - errors.PrintError(err) - } - - failedPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{ - FieldSelector: "status.phase=Failed", - }) - if err != nil { - config.Telemetry.ReportError(err) - errors.PrintError(err) - } - - if err := workloads.UpdateDataWorkflowErrors(failedPods); err != nil { - config.Telemetry.ReportError(err) - errors.PrintError(err) - } -} - -func deleteWorkflowDelayed(wfName string) { - deletionDelay := time.Duration(workflowDeletionDelay) * time.Second - if !markedWorkflows.Has(wfName) { - markedWorkflows.Add(wfName) - 
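
Note on the operator.go hunk above: main() keeps using panicMiddleware (not shown in this diff), and the relocated cron reuses the same defer/recover idea in reportAndRecover. A generic, runnable sketch of that pattern, assuming nothing about the real telemetry client:

    package main

    import (
        "fmt"
        "log"
        "net/http"
    )

    // recoverMiddleware is a generic sketch of the defer/recover pattern behind
    // panicMiddleware and reportAndRecover: a panic in a handler is converted
    // into an error report instead of crashing the process.
    func recoverMiddleware(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            defer func() {
                if rec := recover(); rec != nil {
                    log.Printf("recovered from panic: %v", rec) // the real code reports to telemetry
                    http.Error(w, "internal error", http.StatusInternalServerError)
                }
            }()
            next.ServeHTTP(w, r)
        })
    }

    func main() {
        mux := http.NewServeMux()
        mux.HandleFunc("/panic", func(w http.ResponseWriter, r *http.Request) {
            panic("boom")
        })
        fmt.Println("listening on :8888")
        log.Fatal(http.ListenAndServe(":8888", recoverMiddleware(mux)))
    }
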
time.Sleep(deletionDelay) - config.Argo.Delete(wfName) - go deleteMarkerDelayed(markedWorkflows, wfName) - } -} - -// Wait some time before trying to delete again -func deleteMarkerDelayed(markerMap strset.Set, key string) { - time.Sleep(20 * time.Second) - markerMap.Remove(key) -} - -func reportAndRecover(strs ...string) error { - if errInterface := recover(); errInterface != nil { - err := errors.CastRecoverError(errInterface, strs...) - config.Telemetry.ReportError(err) - errors.PrintError(err) - return err - } - return nil -} diff --git a/pkg/operator/workloads/api_saved_status.go b/pkg/operator/workloads/api_saved_status.go index fdb11c5f99..57bbd58c04 100644 --- a/pkg/operator/workloads/api_saved_status.go +++ b/pkg/operator/workloads/api_saved_status.go @@ -137,7 +137,7 @@ func updateAPISavedStatusStartTime(savedStatus *resource.APISavedStatus, pods [] } } -func UpdateAPISavedStatuses(allPods []kcore.Pod) error { +func updateAPISavedStatuses(allPods []kcore.Pod) error { podMap := make(map[string][]kcore.Pod) for _, pod := range allPods { appName := pod.Labels["appName"] diff --git a/pkg/operator/workloads/api_status.go b/pkg/operator/workloads/api_status.go index 3f40262a5c..bef1767f04 100644 --- a/pkg/operator/workloads/api_status.go +++ b/pkg/operator/workloads/api_status.go @@ -20,6 +20,7 @@ import ( "time" kapps "k8s.io/api/apps/v1" + kautoscaling "k8s.io/api/autoscaling/v1" kcore "k8s.io/api/core/v1" "github.com/cortexlabs/cortex/pkg/lib/errors" @@ -30,19 +31,36 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/config" ) -func GetCurrentAPIStatuses( +func GetCurrentAPIAndGroupStatuses( dataStatuses map[string]*resource.DataStatus, - deployments map[string]*kapps.Deployment, // api.Name -> deployment ctx *context.Context, -) (map[string]*resource.APIStatus, error) { +) (map[string]*resource.APIStatus, map[string]*resource.APIGroupStatus, error) { + deployments, err := apiDeploymentMap(ctx.App.Name) + if err != nil { + return nil, nil, err + } - failedWorkloadIDs, err := getFailedArgoWorkloadIDs(ctx.App.Name) + apiStatuses, err := getCurrentAPIStatuses(dataStatuses, deployments, ctx) if err != nil { - return nil, err + return nil, nil, err } + apiGroupStatuses, err := getAPIGroupStatuses(apiStatuses, deployments, ctx) + if err != nil { + return nil, nil, err + } + + return apiStatuses, apiGroupStatuses, nil +} + +func getCurrentAPIStatuses( + dataStatuses map[string]*resource.DataStatus, + deployments map[string]*kapps.Deployment, // api.Name -> deployment + ctx *context.Context, +) (map[string]*resource.APIStatus, error) { + podList, err := config.Kubernetes.ListPodsByLabels(map[string]string{ - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "appName": ctx.App.Name, "userFacing": "true", }) @@ -99,7 +117,7 @@ func GetCurrentAPIStatuses( for resourceID, apiStatus := range apiStatuses { apiStatus.Path = context.APIPath(apiStatus.APIName, apiStatus.AppName) apiStatus.ReplicaCounts = replicaCountsMap[resourceID] - apiStatus.Code = apiStatusCode(apiStatus, failedWorkloadIDs) + apiStatus.Code = apiStatusCode(apiStatus) } for _, apiStatus := range apiStatuses { @@ -175,11 +193,30 @@ func getReplicaCountsMap( return replicaCountsMap } -func apiStatusCode(apiStatus *resource.APIStatus, failedWorkloadIDs strset.Set) resource.StatusCode { - if failedWorkloadIDs.Has(apiStatus.WorkloadID) { - return resource.StatusError +func numUpdatedReadyReplicas(ctx *context.Context, api *context.API) (int32, error) { + podList, err := 
config.Kubernetes.ListPodsByLabels(map[string]string{ + "workloadType": workloadTypeAPI, + "appName": ctx.App.Name, + "resourceID": api.ID, + "userFacing": "true", + }) + if err != nil { + return 0, errors.Wrap(err, ctx.App.Name) } + var readyReplicas int32 + apiComputeID := api.Compute.IDWithoutReplicas() + for _, pod := range podList { + podStatus := k8s.GetPodStatus(&pod) + if podStatus == k8s.PodStatusRunning && APIPodComputeID(pod.Spec.Containers) == apiComputeID { + readyReplicas++ + } + } + + return readyReplicas, nil +} + +func apiStatusCode(apiStatus *resource.APIStatus) resource.StatusCode { if apiStatus.MaxReplicas == 0 { if apiStatus.TotalReady() > 0 { return resource.StatusStopping @@ -235,7 +272,7 @@ func updateAPIStatusCodeByParents(apiStatus *resource.APIStatus, dataStatuses ma } } -func GetAPIGroupStatuses( +func getAPIGroupStatuses( apiStatuses map[string]*resource.APIStatus, deployments map[string]*kapps.Deployment, // api.Name -> deployment ctx *context.Context, @@ -340,14 +377,7 @@ func getGroupedReplicaCounts(apiStatuses []*resource.APIStatus, ctx *context.Con groupedReplicaCounts.ReadyStaleCompute = apiStatus.ReadyStaleCompute groupedReplicaCounts.FailedUpdated = apiStatus.FailedUpdatedCompute groupedReplicaCounts.FailedStaleCompute = apiStatus.FailedStaleCompute - - groupedReplicaCounts.Requested = ctxAPI.Compute.InitReplicas - if apiStatus.K8sRequested > 0 { - groupedReplicaCounts.Requested = apiStatus.K8sRequested - } - if groupedReplicaCounts.Requested < ctxAPI.Compute.MinReplicas { - groupedReplicaCounts.Requested = ctxAPI.Compute.MinReplicas - } + groupedReplicaCounts.Requested = getRequestedReplicas(ctxAPI, apiStatus.K8sRequested, nil) } else { groupedReplicaCounts.ReadyStaleModel += apiStatus.TotalReady() groupedReplicaCounts.FailedStaleModel += apiStatus.TotalFailed() @@ -357,6 +387,36 @@ func getGroupedReplicaCounts(apiStatuses []*resource.APIStatus, ctx *context.Con return groupedReplicaCounts } +func getRequestedReplicas(api *context.API, k8sRequested int32, hpa *kautoscaling.HorizontalPodAutoscaler) int32 { + // In case HPA hasn't updated the k8s deployment yet. 
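
Note on numUpdatedReadyReplicas above: it only counts pods that are both running and built from the currently requested compute spec, so stale replicas never satisfy a rollout check. A simplified sketch of that filter; the pod struct here is a stand-in, not the Kubernetes type:

    package main

    import "fmt"

    // pod keeps only the fields the counting loop looks at.
    type pod struct {
        status    string // "Running", "Pending", "Failed", ...
        computeID string // hash of the compute spec the pod was created with
    }

    // countUpdatedReady mirrors the loop: a replica counts only if it is running
    // and was created with the currently requested compute spec.
    func countUpdatedReady(pods []pod, wantComputeID string) int32 {
        var ready int32
        for _, p := range pods {
            if p.status == "Running" && p.computeID == wantComputeID {
                ready++
            }
        }
        return ready
    }

    func main() {
        pods := []pod{
            {status: "Running", computeID: "cpu1-mem2"},
            {status: "Running", computeID: "cpu2-mem4"}, // stale compute, ignored
            {status: "Pending", computeID: "cpu1-mem2"}, // not ready yet, ignored
        }
        fmt.Println(countUpdatedReady(pods, "cpu1-mem2")) // 1
    }
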
May not be common, so not necessary to pass in hpa + if hpa != nil && hpa.Spec.MinReplicas != nil && k8sRequested < *hpa.Spec.MinReplicas { + k8sRequested = *hpa.Spec.MinReplicas + } + if hpa != nil && k8sRequested > hpa.Spec.MaxReplicas { + k8sRequested = hpa.Spec.MaxReplicas + } + + requestedReplicas := api.Compute.InitReplicas + if k8sRequested > 0 { + requestedReplicas = k8sRequested + } + if requestedReplicas < api.Compute.MinReplicas { + requestedReplicas = api.Compute.MinReplicas + } + if requestedReplicas > api.Compute.MaxReplicas { + requestedReplicas = api.Compute.MaxReplicas + } + return requestedReplicas +} + +func getRequestedReplicasFromDeployment(api *context.API, k8sDeployment *kapps.Deployment, hpa *kautoscaling.HorizontalPodAutoscaler) int32 { + var k8sRequested int32 + if k8sDeployment != nil && k8sDeployment.Spec.Replicas != nil { + k8sRequested = *k8sDeployment.Spec.Replicas + } + return getRequestedReplicas(api, k8sRequested, hpa) +} + func setInsufficientComputeAPIStatusCodes(apiStatuses map[string]*resource.APIStatus, ctx *context.Context) error { stalledPods, err := config.Kubernetes.StalledPods() if err != nil { diff --git a/pkg/operator/workloads/api.go b/pkg/operator/workloads/api_workload.go similarity index 69% rename from pkg/operator/workloads/api.go rename to pkg/operator/workloads/api_workload.go index fd8fd5aca0..4c3403f6d4 100644 --- a/pkg/operator/workloads/api.go +++ b/pkg/operator/workloads/api_workload.go @@ -22,14 +22,13 @@ import ( kapps "k8s.io/api/apps/v1" kautoscaling "k8s.io/api/autoscaling/v1" kcore "k8s.io/api/core/v1" + kextensions "k8s.io/api/extensions/v1beta1" kresource "k8s.io/apimachinery/pkg/api/resource" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" intstr "k8s.io/apimachinery/pkg/util/intstr" "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/api/context" "github.com/cortexlabs/cortex/pkg/operator/api/userconfig" "github.com/cortexlabs/cortex/pkg/operator/config" @@ -38,8 +37,207 @@ import ( const ( apiContainerName = "api" tfServingContainerName = "serve" + + defaultPortInt32, defaultPortStr = int32(8888), "8888" + tfServingPortInt32, tfServingPortStr = int32(9000), "9000" ) +type APIWorkload struct { + BaseWorkload +} + +func populateAPIWorkloadIDs(ctx *context.Context, latestResourceWorkloadIDs map[string]string) { + for _, api := range ctx.APIs { + if api.WorkloadID != "" { + continue + } + if workloadID := latestResourceWorkloadIDs[api.ID]; workloadID != "" { + api.WorkloadID = workloadID + continue + } + api.WorkloadID = generateWorkloadID() + } +} + +func extractAPIWorkloads(ctx *context.Context) []Workload { + workloads := make([]Workload, 0, len(ctx.APIs)) + + for _, api := range ctx.APIs { + workloads = append(workloads, &APIWorkload{ + singleBaseWorkload(api, ctx.App.Name, workloadTypeAPI), + }) + } + + return workloads +} + +func (aw *APIWorkload) Start(ctx *context.Context) error { + api := ctx.APIs.OneByID(aw.GetSingleResourceID()) + + k8sDeloymentName := internalAPIName(api.Name, ctx.App.Name) + k8sDeloyment, err := config.Kubernetes.GetDeployment(k8sDeloymentName) + if err != nil { + return err + } + hpa, err := config.Kubernetes.GetHPA(k8sDeloymentName) + if err != nil { + return err + } + + desiredReplicas := getRequestedReplicasFromDeployment(api, k8sDeloyment, hpa) + + var deploymentSpec *kapps.Deployment + + switch api.ModelFormat { + case 
userconfig.TensorFlowModelFormat: + deploymentSpec = tfAPISpec(ctx, api, aw.WorkloadID, desiredReplicas) + case userconfig.ONNXModelFormat: + deploymentSpec = onnxAPISpec(ctx, api, aw.WorkloadID, desiredReplicas) + default: + return errors.New(api.Name, "unknown model format encountered") // unexpected + } + + _, err = config.Kubernetes.ApplyIngress(ingressSpec(ctx, api)) + if err != nil { + return err + } + + _, err = config.Kubernetes.ApplyService(serviceSpec(ctx, api)) + if err != nil { + return err + } + + _, err = config.Kubernetes.ApplyDeployment(deploymentSpec) + if err != nil { + return err + } + + _, err = config.Kubernetes.ApplyHPA(hpaSpec(ctx, api)) + if err != nil { + return err + } + + return nil +} + +func (aw *APIWorkload) IsSucceeded(ctx *context.Context) (bool, error) { + api := ctx.APIs.OneByID(aw.GetSingleResourceID()) + k8sDeloymentName := internalAPIName(api.Name, ctx.App.Name) + + k8sDeployment, err := config.Kubernetes.GetDeployment(k8sDeloymentName) + if err != nil { + return false, err + } + if k8sDeployment == nil || k8sDeployment.Labels["resourceID"] != api.ID || k8sDeployment.DeletionTimestamp != nil { + return false, nil + } + + hpa, err := config.Kubernetes.GetHPA(k8sDeloymentName) + if err != nil { + return false, err + } + + if doesAPIComputeNeedsUpdating(api, k8sDeployment, hpa) { + return false, nil + } + + updatedReplicas, err := numUpdatedReadyReplicas(ctx, api) + if err != nil { + return false, err + } + requestedReplicas := getRequestedReplicasFromDeployment(api, k8sDeployment, hpa) + if updatedReplicas < requestedReplicas { + return false, nil + } + + return true, nil +} + +func (aw *APIWorkload) IsRunning(ctx *context.Context) (bool, error) { + api := ctx.APIs.OneByID(aw.GetSingleResourceID()) + k8sDeloymentName := internalAPIName(api.Name, ctx.App.Name) + + k8sDeployment, err := config.Kubernetes.GetDeployment(k8sDeloymentName) + if err != nil { + return false, err + } + if k8sDeployment == nil || k8sDeployment.Labels["resourceID"] != api.ID || k8sDeployment.DeletionTimestamp != nil { + return false, nil + } + + hpa, err := config.Kubernetes.GetHPA(k8sDeloymentName) + if err != nil { + return false, err + } + + if doesAPIComputeNeedsUpdating(api, k8sDeployment, hpa) { + return false, nil + } + + updatedReplicas, err := numUpdatedReadyReplicas(ctx, api) + if err != nil { + return false, err + } + requestedReplicas := getRequestedReplicasFromDeployment(api, k8sDeployment, hpa) + if updatedReplicas < requestedReplicas { + return true, nil + } + + return false, nil +} + +func (aw *APIWorkload) IsStarted(ctx *context.Context) (bool, error) { + api := ctx.APIs.OneByID(aw.GetSingleResourceID()) + k8sDeloymentName := internalAPIName(api.Name, ctx.App.Name) + + k8sDeployment, err := config.Kubernetes.GetDeployment(k8sDeloymentName) + if err != nil { + return false, err + } + if k8sDeployment == nil || k8sDeployment.Labels["resourceID"] != api.ID || k8sDeployment.DeletionTimestamp != nil { + return false, nil + } + + hpa, err := config.Kubernetes.GetHPA(k8sDeloymentName) + if err != nil { + return false, err + } + + if doesAPIComputeNeedsUpdating(api, k8sDeployment, hpa) { + return false, nil + } + + return true, nil +} + +func (aw *APIWorkload) CanRun(ctx *context.Context) (bool, error) { + return areAllDataDependenciesSucceeded(ctx, aw.GetResourceIDs()) +} + +func (aw *APIWorkload) IsFailed(ctx *context.Context) (bool, error) { + api := ctx.APIs.OneByID(aw.GetSingleResourceID()) + + pods, err := config.Kubernetes.ListPodsByLabels(map[string]string{ + 
"appName": ctx.App.Name, + "workloadType": workloadTypeAPI, + "apiName": api.Name, + "resourceID": api.ID, + "workloadID": aw.GetWorkloadID(), + "userFacing": "true", + }) + if err != nil { + return false, err + } + + for _, pod := range pods { + if k8s.GetPodStatus(&pod) == k8s.PodStatusFailed { + return true, nil + } + } + + return false, nil +} + func tfAPISpec( ctx *context.Context, api *context.API, @@ -72,20 +270,20 @@ func tfAPISpec( Replicas: desiredReplicas, Labels: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, "resourceID": ctx.APIs[api.Name].ID, "workloadID": workloadID, }, Selector: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, }, PodSpec: k8s.PodSpec{ Labels: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, "resourceID": ctx.APIs[api.Name].ID, "workloadID": workloadID, @@ -191,20 +389,20 @@ func onnxAPISpec( Replicas: desiredReplicas, Labels: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, "resourceID": ctx.APIs[api.Name].ID, "workloadID": workloadID, }, Selector: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, }, PodSpec: k8s.PodSpec{ Labels: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, "resourceID": ctx.APIs[api.Name].ID, "workloadID": workloadID, @@ -255,8 +453,8 @@ func onnxAPISpec( }) } -func ingressSpec(ctx *context.Context, api *context.API) *k8s.IngressSpec { - return &k8s.IngressSpec{ +func ingressSpec(ctx *context.Context, api *context.API) *kextensions.Ingress { + return k8s.Ingress(&k8s.IngressSpec{ Name: internalAPIName(api.Name, ctx.App.Name), ServiceName: internalAPIName(api.Name, ctx.App.Name), ServicePort: defaultPortInt32, @@ -264,30 +462,30 @@ func ingressSpec(ctx *context.Context, api *context.API) *k8s.IngressSpec { IngressClass: "apis", Labels: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, }, Namespace: config.Cortex.Namespace, - } + }) } -func serviceSpec(ctx *context.Context, api *context.API) *k8s.ServiceSpec { - return &k8s.ServiceSpec{ +func serviceSpec(ctx *context.Context, api *context.API) *kcore.Service { + return k8s.Service(&k8s.ServiceSpec{ Name: internalAPIName(api.Name, ctx.App.Name), Port: defaultPortInt32, TargetPort: defaultPortInt32, Labels: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, }, Selector: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, }, Namespace: config.Cortex.Namespace, - } + }) } func hpaSpec(ctx *context.Context, api *context.API) *kautoscaling.HorizontalPodAutoscaler { @@ -298,75 +496,14 @@ func hpaSpec(ctx *context.Context, api *context.API) *kautoscaling.HorizontalPod TargetCPUUtilization: api.Compute.TargetCPUUtilization, Labels: map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, "apiName": api.Name, }, Namespace: config.Cortex.Namespace, }) } -func apiWorkloadSpecs(ctx 
*context.Context) ([]*WorkloadSpec, error) { - var workloadSpecs []*WorkloadSpec - - deployments, err := APIDeploymentMap(ctx.App.Name) - if err != nil { - return nil, err - } - - hpas, err := apiHPAMap(ctx.App.Name) - if err != nil { - return nil, err - } - - for apiName, api := range ctx.APIs { - workloadID := generateWorkloadID() - desiredReplicas := api.Compute.InitReplicas - - deployment, deploymentExists := deployments[apiName] - if deploymentExists && deployment.Labels["resourceID"] == api.ID && deployment.DeletionTimestamp == nil { - hpa := hpas[apiName] - - if !apiComputeNeedsUpdating(api, deployment, hpa) { - continue // Deployment is fully up to date (model and compute/replicas) - } - - // Reuse workloadID if just modifying compute/replicas - workloadID = deployment.Labels["workloadID"] - - // Use current replicas or min replicas - if deployment.Spec.Replicas != nil { - desiredReplicas = *deployment.Spec.Replicas - } - if hpa != nil && hpa.Spec.MinReplicas != nil && *hpa.Spec.MinReplicas > desiredReplicas { - desiredReplicas = *hpa.Spec.MinReplicas - } - } - - var spec kmeta.Object - - switch api.ModelFormat { - case userconfig.TensorFlowModelFormat: - spec = tfAPISpec(ctx, api, workloadID, desiredReplicas) - case userconfig.ONNXModelFormat: - spec = onnxAPISpec(ctx, api, workloadID, desiredReplicas) - default: - return nil, errors.New(api.Name, "unknown model format encountered") // unexpected - } - - workloadSpecs = append(workloadSpecs, &WorkloadSpec{ - WorkloadID: workloadID, - ResourceIDs: strset.New(api.ID), - K8sSpecs: []kmeta.Object{spec, hpaSpec(ctx, api)}, - K8sAction: "apply", - WorkloadType: WorkloadTypeAPI, - // SuccessCondition: k8s.DeploymentSuccessConditionAll, # Currently success conditions don't work for multi-resource config - }) - } - - return workloadSpecs, nil -} - -func apiComputeNeedsUpdating(api *context.API, deployment *kapps.Deployment, hpa *kautoscaling.HorizontalPodAutoscaler) bool { +func doesAPIComputeNeedsUpdating(api *context.API, deployment *kapps.Deployment, hpa *kautoscaling.HorizontalPodAutoscaler) bool { if hpa == nil { return true } @@ -398,7 +535,7 @@ func apiComputeNeedsUpdating(api *context.API, deployment *kapps.Deployment, hpa func deleteOldAPIs(ctx *context.Context) { ingresses, _ := config.Kubernetes.ListIngressesByLabels(map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, }) for _, ingress := range ingresses { if _, ok := ctx.APIs[ingress.Labels["apiName"]]; !ok { @@ -408,7 +545,7 @@ func deleteOldAPIs(ctx *context.Context) { services, _ := config.Kubernetes.ListServicesByLabels(map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, }) for _, service := range services { if _, ok := ctx.APIs[service.Labels["apiName"]]; !ok { @@ -418,7 +555,7 @@ func deleteOldAPIs(ctx *context.Context) { deployments, _ := config.Kubernetes.ListDeploymentsByLabels(map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, }) for _, deployment := range deployments { if _, ok := ctx.APIs[deployment.Labels["apiName"]]; !ok { @@ -428,7 +565,7 @@ func deleteOldAPIs(ctx *context.Context) { hpas, _ := config.Kubernetes.ListHPAsByLabels(map[string]string{ "appName": ctx.App.Name, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, }) for _, hpa := range hpas { if _, ok := ctx.APIs[hpa.Labels["apiName"]]; !ok { @@ -437,38 +574,11 @@ func deleteOldAPIs(ctx 
*context.Context) { } } -func createServicesAndIngresses(ctx *context.Context) error { - for _, api := range ctx.APIs { - ingressExists, err := config.Kubernetes.IngressExists(internalAPIName(api.Name, ctx.App.Name)) - if err != nil { - return errors.Wrap(err, ctx.App.Name, "ingresses", api.Name, "create") - } - if !ingressExists { - _, err = config.Kubernetes.CreateIngress(ingressSpec(ctx, api)) - if err != nil { - return errors.Wrap(err, ctx.App.Name, "ingresses", api.Name, "create") - } - } - - serviceExists, err := config.Kubernetes.ServiceExists(internalAPIName(api.Name, ctx.App.Name)) - if err != nil { - return errors.Wrap(err, ctx.App.Name, "services", api.Name, "create") - } - if !serviceExists { - _, err = config.Kubernetes.CreateService(serviceSpec(ctx, api)) - if err != nil { - return errors.Wrap(err, ctx.App.Name, "services", api.Name, "create") - } - } - } - return nil -} - // This returns map apiName -> deployment (not internalName -> deployment) -func APIDeploymentMap(appName string) (map[string]*kapps.Deployment, error) { +func apiDeploymentMap(appName string) (map[string]*kapps.Deployment, error) { deploymentList, err := config.Kubernetes.ListDeploymentsByLabels(map[string]string{ "appName": appName, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, }) if err != nil { return nil, errors.Wrap(err, appName) @@ -491,7 +601,7 @@ func addToDeploymentMap(deployments map[string]*kapps.Deployment, deployment kap func apiHPAMap(appName string) (map[string]*kautoscaling.HorizontalPodAutoscaler, error) { hpaList, err := config.Kubernetes.ListHPAsByLabels(map[string]string{ "appName": appName, - "workloadType": WorkloadTypeAPI, + "workloadType": workloadTypeAPI, }) if err != nil { return nil, errors.Wrap(err, appName) diff --git a/pkg/operator/workloads/consts.go b/pkg/operator/workloads/consts.go deleted file mode 100644 index 2115b9f154..0000000000 --- a/pkg/operator/workloads/consts.go +++ /dev/null @@ -1,29 +0,0 @@ -/* -Copyright 2019 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package workloads - -const ( - WorkloadTypeAPI = "api" - workloadTypeData = "data-job" - workloadTypeTrain = "training-job" - workloadTypePythonPackager = "python-packager" - - defaultPortInt32, defaultPortStr = int32(8888), "8888" - tfServingPortInt32, tfServingPortStr = int32(9000), "9000" - - userFacingCheckInterval = 1 // seconds -) diff --git a/pkg/operator/workloads/cron.go b/pkg/operator/workloads/cron.go new file mode 100644 index 0000000000..8a787fd41b --- /dev/null +++ b/pkg/operator/workloads/cron.go @@ -0,0 +1,100 @@ +/* +Copyright 2019 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
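
Note on deleteOldAPIs above: it follows a simple garbage-collection pattern, listing every API-typed object by label and deleting whatever no longer has a matching API in the context. A stripped-down sketch of the selection step; deletion and the Kubernetes client are left out, and the obj type is only illustrative:

    package main

    import "fmt"

    // obj is a simplified stand-in for any labeled Kubernetes object
    // (ingress, service, deployment, or HPA).
    type obj struct {
        name   string
        labels map[string]string
    }

    // staleObjects mirrors the deleteOldAPIs pattern: anything carrying an
    // apiName label that is no longer present in the current context is a
    // candidate for deletion.
    func staleObjects(existing []obj, desiredAPIs map[string]bool) []string {
        var stale []string
        for _, o := range existing {
            if !desiredAPIs[o.labels["apiName"]] {
                stale = append(stale, o.name)
            }
        }
        return stale
    }

    func main() {
        existing := []obj{
            {name: "app-classifier", labels: map[string]string{"apiName": "classifier"}},
            {name: "app-old-api", labels: map[string]string{"apiName": "old-api"}},
        }
        desired := map[string]bool{"classifier": true}
        fmt.Println(staleObjects(existing, desired)) // [app-old-api]
    }
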
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloads + +import ( + "time" + + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/operator/config" +) + +const cronInterval = 5 // seconds + +var cronChannel = make(chan struct{}, 1) + +func cronRunner() { + timer := time.NewTimer(0) + defer timer.Stop() + + for { + select { + case <-cronChannel: + runCron() + case <-timer.C: + runCron() + } + timer.Reset(5 * time.Second) + } +} + +func runCronNow() { + cronChannel <- struct{}{} +} + +func runCron() { + defer reportAndRecover("cron failed") + + if err := UpdateWorkflows(); err != nil { + config.Telemetry.ReportError(err) + errors.PrintError(err) + } + + apiPods, err := config.Kubernetes.ListPodsByLabels(map[string]string{ + "workloadType": workloadTypeAPI, + "userFacing": "true", + }) + if err != nil { + config.Telemetry.ReportError(err) + errors.PrintError(err) + } + + if err := updateAPISavedStatuses(apiPods); err != nil { + config.Telemetry.ReportError(err) + errors.PrintError(err) + } + + if err := uploadLogPrefixesFromAPIPods(apiPods); err != nil { + config.Telemetry.ReportError(err) + errors.PrintError(err) + } + + failedPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{ + FieldSelector: "status.phase=Failed", + }) + if err != nil { + config.Telemetry.ReportError(err) + errors.PrintError(err) + } + + if err := updateDataWorkloadErrors(failedPods); err != nil { + config.Telemetry.ReportError(err) + errors.PrintError(err) + } +} + +func reportAndRecover(strs ...string) error { + if errInterface := recover(); errInterface != nil { + err := errors.CastRecoverError(errInterface, strs...) + config.Telemetry.ReportError(err) + errors.PrintError(err) + return err + } + return nil +} diff --git a/pkg/operator/workloads/current_contexts.go b/pkg/operator/workloads/current_contexts.go index e62d5dacda..f5228c0def 100644 --- a/pkg/operator/workloads/current_contexts.go +++ b/pkg/operator/workloads/current_contexts.go @@ -17,11 +17,17 @@ limitations under the License. 
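
Note on the new cron.go above: it replaces the old robfig/cron scheduler with a timer plus a buffered channel, so the loop runs on its regular interval but can also be triggered immediately via runCronNow. A trimmed, runnable version of that loop with a stub task in place of the real runCron:

    package main

    import (
        "fmt"
        "time"
    )

    const cronInterval = 5 * time.Second

    var cronChannel = make(chan struct{}, 1)

    // cronRunner mirrors the loop in cron.go: the work runs every cronInterval,
    // but can also be kicked off early by a send on cronChannel; either way the
    // timer is reset afterwards so runs don't pile up.
    func cronRunner(run func()) {
        timer := time.NewTimer(0)
        defer timer.Stop()

        for {
            select {
            case <-cronChannel:
                run()
            case <-timer.C:
                run()
            }
            timer.Reset(cronInterval)
        }
    }

    // runCronNow matches the helper of the same name in the diff.
    func runCronNow() {
        cronChannel <- struct{}{}
    }

    func main() {
        go cronRunner(func() { fmt.Println("cron tick", time.Now().Format(time.Kitchen)) })
        time.Sleep(2 * time.Second)
        runCronNow() // trigger an immediate run between ticks
        time.Sleep(7 * time.Second)
    }

With a buffer of one, a call to runCronNow only blocks if another trigger is already queued, so callers such as HTTP handlers normally return immediately.
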
package workloads import ( + "fmt" "sync" + "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/operator/api/context" + "github.com/cortexlabs/cortex/pkg/operator/config" + ocontext "github.com/cortexlabs/cortex/pkg/operator/context" ) +const configMapName = "cortex-current-contexts" + // appName -> currently deployed context var currentCtxs = struct { m map[string]*context.Context @@ -37,23 +43,82 @@ func CurrentContext(appName string) *context.Context { func CurrentContexts() []*context.Context { currentCtxs.RLock() defer currentCtxs.RUnlock() - ctxs := make([]*context.Context, len(currentCtxs.m)) - i := 0 + ctxs := make([]*context.Context, 0, len(currentCtxs.m)) for _, ctx := range currentCtxs.m { - ctxs[i] = ctx - i++ + ctxs = append(ctxs, ctx) } return ctxs } -func setCurrentContext(ctx *context.Context) { +func setCurrentContext(ctx *context.Context) error { currentCtxs.Lock() defer currentCtxs.Unlock() + currentCtxs.m[ctx.App.Name] = ctx + + err := updateContextConfigMap() + if err != nil { + return err + } + + return nil } -func deleteCurrentContext(appName string) { +func deleteCurrentContext(appName string) error { currentCtxs.Lock() defer currentCtxs.Unlock() + delete(currentCtxs.m, appName) + + err := updateContextConfigMap() + if err != nil { + return err + } + + return nil +} + +func updateContextConfigMap() error { + configMapData := make(map[string]string, len(currentCtxs.m)) + for appName, ctx := range currentCtxs.m { + configMapData[appName] = ctx.ID + } + + configMap := k8s.ConfigMap(&k8s.ConfigMapSpec{ + Name: configMapName, + Namespace: config.Cortex.Namespace, + Data: configMapData, + }) + + _, err := config.Kubernetes.ApplyConfigMap(configMap) + if err != nil { + return err + } + + return nil +} + +func reloadCurrentContexts() error { + currentCtxs.Lock() + defer currentCtxs.Unlock() + + configMap, err := config.Kubernetes.GetConfigMap(configMapName) + if err != nil { + return err + } + if configMap == nil { + return nil + } + + for appName, ctxID := range configMap.Data { + ctx, err := ocontext.DownloadContext(ctxID, appName) + if err != nil { + fmt.Printf("Deleting stale workflow: %s", appName) + DeleteApp(appName, true) + } else if ctx != nil { + currentCtxs.m[appName] = ctx + } + } + + return nil } diff --git a/pkg/operator/workloads/data_saved_status.go b/pkg/operator/workloads/data_saved_status.go index 6b4e60f81a..cd338a32cd 100644 --- a/pkg/operator/workloads/data_saved_status.go +++ b/pkg/operator/workloads/data_saved_status.go @@ -19,10 +19,14 @@ package workloads import ( "time" + kcore "k8s.io/api/core/v1" + "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/lib/pointer" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/api/context" "github.com/cortexlabs/cortex/pkg/operator/api/resource" "github.com/cortexlabs/cortex/pkg/operator/config" @@ -88,7 +92,7 @@ func getDataSavedStatuses(resourceWorkloadIDs map[string]string, appName string) return nil, err } - savedStatusMap := map[string]*resource.DataSavedStatus{} + savedStatusMap := make(map[string]*resource.DataSavedStatus) for _, savedStatus := range savedStatuses { if savedStatus != nil { savedStatusMap[savedStatus.ResourceID] = savedStatus @@ -135,3 +139,86 @@ func updateKilledDataSavedStatuses(ctx *context.Context) error { } return nil } + +func 
updateDataWorkloadErrors(failedPods []kcore.Pod) error { + checkedWorkloadIDs := strset.New() + nowTime := pointer.Time(time.Now()) + + for _, pod := range failedPods { + appName, ok := pod.Labels["appName"] + if !ok { + continue + } + workloadID, ok := pod.Labels["workloadID"] + if !ok { + continue + } + + if pod.Labels["workloadType"] == workloadTypeAPI { + continue + } + + if checkedWorkloadIDs.Has(workloadID) { + continue + } + checkedWorkloadIDs.Add(workloadID) + + savedWorkload, err := getSavedBaseWorkload(workloadID, appName) + if err != nil { + return err + } + if savedWorkload == nil { + continue + } + + resourceWorkloadIDs := make(map[string]string, len(savedWorkload.Resources)) + for _, resource := range savedWorkload.Resources { + resourceWorkloadIDs[resource.ID] = workloadID + } + + savedStatuses, err := getDataSavedStatuses(resourceWorkloadIDs, appName) + if err != nil { + return err + } + + var savedStatusesToUpload []*resource.DataSavedStatus + for resourceID, res := range savedWorkload.Resources { + savedStatus := savedStatuses[resourceID] + + if savedStatus == nil { + savedStatus = &resource.DataSavedStatus{ + BaseSavedStatus: resource.BaseSavedStatus{ + ResourceID: resourceID, + ResourceType: res.ResourceType, + WorkloadID: workloadID, + AppName: appName, + }, + } + } + + if savedStatus.End == nil { + savedStatus.End = nowTime + if savedStatus.Start == nil { + savedStatus.Start = nowTime + } + + switch k8s.GetPodStatus(&pod) { + case k8s.PodStatusKilled: + savedStatus.ExitCode = resource.ExitCodeDataKilled + case k8s.PodStatusKilledOOM: + savedStatus.ExitCode = resource.ExitCodeDataOOM + default: + savedStatus.ExitCode = resource.ExitCodeDataFailed + } + + savedStatusesToUpload = append(savedStatusesToUpload, savedStatus) + } + } + + err = uploadDataSavedStatuses(savedStatusesToUpload) + if err != nil { + return err + } + } + return nil +} diff --git a/pkg/operator/workloads/errors.go b/pkg/operator/workloads/errors.go index 612d898f30..81e596cbaa 100644 --- a/pkg/operator/workloads/errors.go +++ b/pkg/operator/workloads/errors.go @@ -21,8 +21,6 @@ type ErrorKind int const ( ErrUnknown ErrorKind = iota ErrMoreThanOneWorkflow - ErrContextAppMismatch - ErrWorkflowAppMismatch ErrCortexInstallationBroken ErrLoadBalancerInitializing ErrNotFound @@ -31,8 +29,6 @@ const ( var errorKinds = []string{ "err_unknown", "err_more_than_one_workflow", - "err_context_app_mismatch", - "err_workflow_app_mismatch", "err_cortex_installation_broken", "err_load_balancer_initializing", "err_not_found", @@ -90,20 +86,6 @@ func ErrorMoreThanOneWorkflow() error { } } -func ErrorContextAppMismatch() error { - return Error{ - Kind: ErrContextAppMismatch, - message: "context deployments do not match", - } -} - -func ErrorWorkflowAppMismatch() error { - return Error{ - Kind: ErrWorkflowAppMismatch, - message: "workflow deployments do not match", - } -} - func ErrorCortexInstallationBroken() error { return Error{ Kind: ErrCortexInstallationBroken, diff --git a/pkg/operator/workloads/latest_workload_id.go b/pkg/operator/workloads/latest_workload_id.go index a354171707..e4d51f75d4 100644 --- a/pkg/operator/workloads/latest_workload_id.go +++ b/pkg/operator/workloads/latest_workload_id.go @@ -89,7 +89,7 @@ func getSavedLatestWorkloadIDs(resourceIDs strset.Set, appName string) (map[stri return nil, err } - workloadIDMap := map[string]string{} + workloadIDMap := make(map[string]string) for i := range workloadIDList { workloadIDMap[resourceIDList[i]] = workloadIDList[i] } diff --git 
a/pkg/operator/workloads/log_prefix.go b/pkg/operator/workloads/log_prefix.go index fda2d5d905..e16899cf88 100644 --- a/pkg/operator/workloads/log_prefix.go +++ b/pkg/operator/workloads/log_prefix.go @@ -89,11 +89,11 @@ func getSavedLogPrefix(workloadID string, appName string, allowNil bool) (string return logPrefix, nil } -func UploadLogPrefixesFromAPIPods(pods []kcore.Pod) error { +func uploadLogPrefixesFromAPIPods(pods []kcore.Pod) error { logPrefixInfos := []*LogPrefixInfo{} currentWorkloadIDs := make(map[string]strset.Set) for _, pod := range pods { - if pod.Labels["workloadType"] != WorkloadTypeAPI { + if pod.Labels["workloadType"] != workloadTypeAPI { continue } diff --git a/pkg/operator/workloads/logs.go b/pkg/operator/workloads/logs.go index fe8d21c542..4d60a56bd1 100644 --- a/pkg/operator/workloads/logs.go +++ b/pkg/operator/workloads/logs.go @@ -36,12 +36,14 @@ import ( ) const ( - writeWait = 10 * time.Second - closeGracePeriod = 10 * time.Second - maxMessageSize = 8192 - podCheckInterval = 5 * time.Second - maxParallelPodLogging = 5 - initLogTailLines = 100 + socketWriteDeadlineWait = 10 * time.Second + socketCloseGracePeriod = 10 * time.Second + socketMaxMessageSize = 8192 + + pendingPodCheckInterval = 1 * time.Second + newPodCheckInterval = 5 * time.Second + maxParallelPodLogging = 5 + initLogTailLines = 100 ) func ReadLogs(appName string, workloadID string, verbose bool, socket *websocket.Conn) { @@ -86,7 +88,7 @@ func ReadLogs(appName string, workloadID string, verbose bool, socket *websocket getKubectlLogs(podMap[k8s.PodStatusKilledOOM], verbose, wrotePending, false, socket) case len(podMap[k8s.PodStatusFailed]) > 0: previous := false - if pods[0].Labels["workloadType"] == WorkloadTypeAPI { + if pods[0].Labels["workloadType"] == workloadTypeAPI { previous = true } getKubectlLogs(podMap[k8s.PodStatusFailed], verbose, wrotePending, previous, socket) @@ -103,9 +105,13 @@ func ReadLogs(appName string, workloadID string, verbose bool, socket *websocket return } - wf, _ := GetWorkflow(appName) - pWf, _ := parseWorkflow(wf) - if pWf == nil || pWf.Workloads[workloadID] == nil { + isEnded, err := IsWorkloadEnded(appName, workloadID) + + if err != nil { + writeSocket(err.Error(), socket) + return + } + if isEnded { logPrefix, err := getSavedLogPrefix(workloadID, appName, true) if err != nil { writeSocket(err.Error(), socket) @@ -118,19 +124,6 @@ func ReadLogs(appName string, workloadID string, verbose bool, socket *websocket return } - failedArgoPod, err := getFailedArgoPodForWorkload(workloadID, appName) - if err != nil { - writeSocket(err.Error(), socket) - return - } - if failedArgoPod != nil { - if !writeSocket("\nFailed to start:\n", socket) { - return - } - getKubectlLogs([]kcore.Pod{*failedArgoPod}, true, false, false, socket) - return - } - if !wrotePending { if !writeSocket("\nPending", socket) { return @@ -138,7 +131,7 @@ func ReadLogs(appName string, workloadID string, verbose bool, socket *websocket wrotePending = true } - time.Sleep(time.Duration(userFacingCheckInterval) * time.Second) + time.Sleep(pendingPodCheckInterval) } } @@ -184,7 +177,7 @@ func startKubectlProcess(pod kcore.Pod, previous bool, attrs *os.ProcAttr) (*os. 
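
Note on the logs.go constants above (socketWriteDeadlineWait, socketMaxMessageSize, and the split pod-check intervals): they govern how log lines are streamed over a websocket. A minimal sketch of the write side using gorilla/websocket, which is already imported by this file; the function name and shape are illustrative, not the actual pumpStdout:

    package logstream

    import (
        "bufio"
        "io"
        "time"

        "github.com/gorilla/websocket"
    )

    const socketWriteDeadlineWait = 10 * time.Second

    // pumpLines forwards every line read from the process output to the
    // websocket; each write carries a fresh deadline so a stalled client
    // can't hang the logging goroutine indefinitely.
    func pumpLines(socket *websocket.Conn, reader io.Reader) error {
        scanner := bufio.NewScanner(reader)
        for scanner.Scan() {
            socket.SetWriteDeadline(time.Now().Add(socketWriteDeadlineWait))
            if err := socket.WriteMessage(websocket.TextMessage, scanner.Bytes()); err != nil {
                return err
            }
        }
        return scanner.Err()
    }
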
identifier := pod.Name kubectlArgs = append(kubectlArgs, pod.Name) - if pod.Labels["workloadType"] == WorkloadTypeAPI && pod.Labels["userFacing"] == "true" { + if pod.Labels["workloadType"] == workloadTypeAPI && pod.Labels["userFacing"] == "true" { kubectlArgs = append(kubectlArgs, apiContainerName) kubectlArgs = append(kubectlArgs, fmt.Sprintf("--tail=%d", initLogTailLines)) identifier += " " + apiContainerName @@ -294,7 +287,7 @@ func podCheck(podCheckCancel chan struct{}, socket *websocket.Conn, initialPodLi delete(processMap, podName) } deleteProcesses(deleteMap) - timer.Reset(podCheckInterval) + timer.Reset(newPodCheckInterval) } } } @@ -340,7 +333,7 @@ func getCloudWatchLogs(prefix string, verbose bool, socket *websocket.Conn) { } func pumpStdin(socket *websocket.Conn, writer io.Writer) { - socket.SetReadLimit(maxMessageSize) + socket.SetReadLimit(socketMaxMessageSize) for { _, message, err := socket.ReadMessage() if err != nil { @@ -358,7 +351,7 @@ func pumpStdin(socket *websocket.Conn, writer io.Writer) { func pumpStdout(socket *websocket.Conn, socketWriterError chan error, reader io.Reader, verbose bool, checkForLastLog bool) { scanner := bufio.NewScanner(reader) for scanner.Scan() { - socket.SetWriteDeadline(time.Now().Add(writeWait)) + socket.SetWriteDeadline(time.Now().Add(socketWriteDeadlineWait)) logBytes := scanner.Bytes() isLastLog := false if !verbose { @@ -382,9 +375,9 @@ func pumpStdout(socket *websocket.Conn, socketWriterError chan error, reader io. default: } - socket.SetWriteDeadline(time.Now().Add(writeWait)) + socket.SetWriteDeadline(time.Now().Add(socketWriteDeadlineWait)) socket.WriteMessage(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")) - time.Sleep(closeGracePeriod) + time.Sleep(socketCloseGracePeriod) socket.Close() } diff --git a/pkg/operator/workloads/parsed_workflow.go b/pkg/operator/workloads/parsed_workflow.go deleted file mode 100644 index ac0ab2d934..0000000000 --- a/pkg/operator/workloads/parsed_workflow.go +++ /dev/null @@ -1,143 +0,0 @@ -/* -Copyright 2019 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package workloads - -import ( - "time" - - argowf "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1" - kcore "k8s.io/api/core/v1" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/cortexlabs/cortex/pkg/lib/argo" - "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/sets/strset" - "github.com/cortexlabs/cortex/pkg/operator/config" -) - -type WorkflowItem struct { - WorkloadID string - WorkloadType string - StartedAt *time.Time - FinishedAt *time.Time - ArgoPhase *argowf.NodePhase - DirectDependencies strset.Set - AllDependencies strset.Set -} - -type ParsedWorkflow struct { - Workloads map[string]*WorkflowItem // workloadID -> *WorkflowItem - Wf *argowf.Workflow -} - -func parseWorkflow(wf *argowf.Workflow) (*ParsedWorkflow, error) { - if wf == nil { - return nil, nil - } - - pWf := &ParsedWorkflow{ - Workloads: map[string]*WorkflowItem{}, - Wf: wf, - } - - for _, argoWfItem := range argo.ParseWorkflow(wf) { - workloadID := argoWfItem.Labels["workloadID"] - workloadType := argoWfItem.Labels["workloadType"] - if workloadID == "" || workloadType == "" { - continue - } - - pWf.Workloads[workloadID] = &WorkflowItem{ - WorkloadID: workloadID, - WorkloadType: workloadType, - StartedAt: argoWfItem.StartedAt(), - FinishedAt: argoWfItem.FinishedAt(), - ArgoPhase: argoWfItem.Phase(), - DirectDependencies: argoWfItem.Dependencies(), - } - } - - for workloadID, wfItem := range pWf.Workloads { - allDependencies, err := getAllDependencies(workloadID, pWf.Workloads) - if err != nil { - return nil, err - } - wfItem.AllDependencies = allDependencies - } - - return pWf, nil -} - -func getAllDependencies(workloadID string, workloads map[string]*WorkflowItem) (strset.Set, error) { - wfItem, ok := workloads[workloadID] - if !ok { - return nil, errors.Wrap(ErrorNotFound(), "workload", workloadID) - } - allDependencies := strset.New() - if len(wfItem.DirectDependencies) == 0 { - return allDependencies, nil - } - for dependency := range wfItem.DirectDependencies { - allDependencies.Add(dependency) - subDependencies, err := getAllDependencies(dependency, workloads) - if err != nil { - return nil, err - } - allDependencies.Merge(subDependencies) - } - return allDependencies, nil -} - -func getFailedArgoWorkloadIDs(appName string) (strset.Set, error) { - failedArgoPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{ - FieldSelector: "status.phase=Failed", - LabelSelector: k8s.LabelSelector(map[string]string{ - "appName": appName, - "argo": "true", - }), - }) - if err != nil { - return nil, err - } - - failedWorkloadIDs := strset.New() - for _, pod := range failedArgoPods { - failedWorkloadIDs.Add(pod.Labels["workloadID"]) - } - return failedWorkloadIDs, nil -} - -func getFailedArgoPodForWorkload(workloadID string, appName string) (*kcore.Pod, error) { - failedArgoPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{ - FieldSelector: "status.phase=Failed", - LabelSelector: k8s.LabelSelector(map[string]string{ - "appName": appName, - "workloadID": workloadID, - "argo": "true", - }), - }) - if err != nil { - return nil, err - } - - if len(failedArgoPods) == 0 { - return nil, nil - } - - return &failedArgoPods[0], nil -} diff --git a/pkg/operator/workloads/python_package_job.go b/pkg/operator/workloads/python_package_job.go deleted file mode 100644 index 6eed89740d..0000000000 --- a/pkg/operator/workloads/python_package_job.go +++ /dev/null @@ -1,109 +0,0 @@ -/* -Copyright 2019 Cortex Labs, Inc. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package workloads - -import ( - "strings" - - kbatch "k8s.io/api/batch/v1" - kcore "k8s.io/api/core/v1" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/argo" - "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/sets/strset" - "github.com/cortexlabs/cortex/pkg/operator/api/context" - "github.com/cortexlabs/cortex/pkg/operator/config" -) - -func pythonPackageJobSpec(ctx *context.Context, pythonPackages strset.Set, workloadID string) *kbatch.Job { - spec := k8s.Job(&k8s.JobSpec{ - Name: workloadID, - Labels: map[string]string{ - "appName": ctx.App.Name, - "workloadType": workloadTypePythonPackager, - "workloadID": workloadID, - }, - PodSpec: k8s.PodSpec{ - Labels: map[string]string{ - "appName": ctx.App.Name, - "workloadType": workloadTypePythonPackager, - "workloadID": workloadID, - "userFacing": "true", - }, - K8sPodSpec: kcore.PodSpec{ - RestartPolicy: "Never", - Containers: []kcore.Container{ - { - Name: "python-packager", - Image: config.Cortex.PythonPackagerImage, - ImagePullPolicy: "Always", - Args: []string{ - "--workload-id=" + workloadID, - "--context=" + config.AWS.S3Path(ctx.Key), - "--cache-dir=" + consts.ContextCacheDir, - "--python-packages=" + strings.Join(pythonPackages.Slice(), ","), - "--build", - }, - Env: k8s.AWSCredentials(), - VolumeMounts: k8s.DefaultVolumeMounts(), - }, - }, - Volumes: k8s.DefaultVolumes(), - ServiceAccountName: "default", - }, - }, - Namespace: config.Cortex.Namespace, - }) - argo.EnableGC(spec) - return spec -} - -func pythonPackageWorkloadSpecs(ctx *context.Context) ([]*WorkloadSpec, error) { - resourceIDs := strset.New() - - for _, pythonPackage := range ctx.PythonPackages { - isPythonPackageCached, err := checkResourceCached(pythonPackage, ctx) - if err != nil { - return nil, err - } - if isPythonPackageCached { - continue - } - resourceIDs.Add(pythonPackage.GetID()) - } - - if len(resourceIDs) == 0 { - return nil, nil - } - - workloadID := generateWorkloadID() - - spec := pythonPackageJobSpec(ctx, resourceIDs, workloadID) - workloadSpec := &WorkloadSpec{ - WorkloadID: workloadID, - ResourceIDs: resourceIDs, - K8sSpecs: []kmeta.Object{spec}, - K8sAction: "create", - SuccessCondition: k8s.JobSuccessCondition, - FailureCondition: k8s.JobFailureCondition, - WorkloadType: workloadTypePythonPackager, - } - - return []*WorkloadSpec{workloadSpec}, nil -} diff --git a/pkg/operator/workloads/python_package_workload.go b/pkg/operator/workloads/python_package_workload.go new file mode 100644 index 0000000000..1e594ba95e --- /dev/null +++ b/pkg/operator/workloads/python_package_workload.go @@ -0,0 +1,132 @@ +/* +Copyright 2019 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloads + +import ( + "strings" + + kcore "k8s.io/api/core/v1" + + "github.com/cortexlabs/cortex/pkg/consts" + "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/operator/api/context" + "github.com/cortexlabs/cortex/pkg/operator/config" +) + +type PythonPackagesWorkload struct { + BaseWorkload +} + +func populatePythonPackageWorkloadIDs(ctx *context.Context, latestResourceWorkloadIDs map[string]string) { + pythonPackagesWorkloadID := generateWorkloadID() + + for _, pythonPackage := range ctx.PythonPackages { + if pythonPackage.WorkloadID != "" { + continue + } + if workloadID := latestResourceWorkloadIDs[pythonPackage.ID]; workloadID != "" { + pythonPackage.WorkloadID = workloadID + continue + } + pythonPackage.WorkloadID = pythonPackagesWorkloadID + } +} + +func extractPythonPackageWorkloads(ctx *context.Context) []Workload { + workloadMap := make(map[string]*PythonPackagesWorkload) + for _, pythonPackage := range ctx.PythonPackages { + if _, ok := workloadMap[pythonPackage.WorkloadID]; !ok { + workloadMap[pythonPackage.WorkloadID] = &PythonPackagesWorkload{ + emptyBaseWorkload(ctx.App.Name, pythonPackage.WorkloadID, workloadTypePythonPackager), + } + } + workloadMap[pythonPackage.WorkloadID].AddResource(pythonPackage) + } + + workloads := make([]Workload, 0, len(workloadMap)) + for _, workload := range workloadMap { + workloads = append(workloads, workload) + } + return workloads +} + +func (pyw *PythonPackagesWorkload) Start(ctx *context.Context) error { + spec := &k8s.JobSpec{ + Name: pyw.WorkloadID, + Labels: map[string]string{ + "appName": ctx.App.Name, + "workloadType": workloadTypePythonPackager, + "workloadID": pyw.WorkloadID, + }, + PodSpec: k8s.PodSpec{ + Labels: map[string]string{ + "appName": ctx.App.Name, + "workloadType": workloadTypePythonPackager, + "workloadID": pyw.WorkloadID, + "userFacing": "true", + }, + K8sPodSpec: kcore.PodSpec{ + RestartPolicy: "Never", + Containers: []kcore.Container{ + { + Name: "python-packager", + Image: config.Cortex.PythonPackagerImage, + ImagePullPolicy: "Always", + Args: []string{ + "--workload-id=" + pyw.WorkloadID, + "--context=" + config.AWS.S3Path(ctx.Key), + "--cache-dir=" + consts.ContextCacheDir, + "--python-packages=" + strings.Join(pyw.GetResourceIDs().Slice(), ","), + "--build", + }, + Env: k8s.AWSCredentials(), + VolumeMounts: k8s.DefaultVolumeMounts(), + }, + }, + Volumes: k8s.DefaultVolumes(), + ServiceAccountName: "default", + }, + }, + Namespace: config.Cortex.Namespace, + } + + _, err := config.Kubernetes.CreateJob(k8s.Job(spec)) + if err != nil { + return err + } + return nil +} + +func (pyw *PythonPackagesWorkload) IsStarted(ctx *context.Context) (bool, error) { + return config.Kubernetes.JobExists(pyw.WorkloadID) +} + +func (pyw *PythonPackagesWorkload) IsRunning(ctx *context.Context) (bool, error) { + return config.Kubernetes.IsJobRunning(pyw.WorkloadID) +} + +func (pyw *PythonPackagesWorkload) CanRun(ctx *context.Context) (bool, error) { + return areAllDataDependenciesSucceeded(ctx, pyw.GetResourceIDs()) +} + +func (pyw *PythonPackagesWorkload) IsSucceeded(ctx *context.Context) 
(bool, error) { + return areAllDataResourcesSucceeded(ctx, pyw.GetResourceIDs()) +} + +func (pyw *PythonPackagesWorkload) IsFailed(ctx *context.Context) (bool, error) { + return areAnyDataResourcesFailed(ctx, pyw.GetResourceIDs()) +} diff --git a/pkg/operator/workloads/saved_base_workload.go b/pkg/operator/workloads/saved_base_workload.go new file mode 100644 index 0000000000..f87773455a --- /dev/null +++ b/pkg/operator/workloads/saved_base_workload.go @@ -0,0 +1,112 @@ +/* +Copyright 2019 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloads + +import ( + "github.com/cortexlabs/cortex/pkg/lib/aws" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/parallel" + "github.com/cortexlabs/cortex/pkg/operator/config" + ocontext "github.com/cortexlabs/cortex/pkg/operator/context" +) + +func uploadBaseWorkload(baseWorkload *BaseWorkload) error { + if isBaseWorkloadCached(baseWorkload) { + return nil + } + + key := ocontext.BaseWorkloadKey(baseWorkload.WorkloadID, baseWorkload.AppName) + err := config.AWS.UploadJSONToS3(baseWorkload, key) + if err != nil { + return errors.Wrap(err, "upload base workload", baseWorkload.AppName, baseWorkload.WorkloadID) + } + cacheBaseWorkload(baseWorkload) + return nil +} + +func uploadBaseWorkloads(baseWorkloads []*BaseWorkload) error { + fns := make([]func() error, len(baseWorkloads)) + for i, baseWorkload := range baseWorkloads { + fns[i] = uploadBaseWorkloadFunc(baseWorkload) + } + return parallel.RunFirstErr(fns...) +} + +func uploadBaseWorkloadsFromWorkloads(workloads []Workload) error { + fns := make([]func() error, len(workloads)) + for i, workload := range workloads { + fns[i] = uploadBaseWorkloadFunc(workload.GetBaseWorkloadPtr()) + } + return parallel.RunFirstErr(fns...) +} + +func uploadBaseWorkloadFunc(baseWorkload *BaseWorkload) func() error { + return func() error { + return uploadBaseWorkload(baseWorkload) + } +} + +func getSavedBaseWorkload(workloadID string, appName string) (*BaseWorkload, error) { + if cachedBaseWorkload, ok := getCachedBaseWorkload(workloadID, appName); ok { + return cachedBaseWorkload, nil + } + + key := ocontext.BaseWorkloadKey(workloadID, appName) + var baseWorkload BaseWorkload + err := config.AWS.ReadJSONFromS3(&baseWorkload, key) + if aws.IsNoSuchKeyErr(err) { + return nil, nil + } + if err != nil { + return nil, errors.Wrap(err, "download base workload", appName, workloadID) + } + cacheBaseWorkload(&baseWorkload) + return &baseWorkload, nil +} + +func getSavedBaseWorkloads(workloadIDs []string, appName string) (map[string]*BaseWorkload, error) { + baseWorkloads := make([]*BaseWorkload, len(workloadIDs)) + fns := make([]func() error, len(workloadIDs)) + i := 0 + for _, workloadID := range workloadIDs { + fns[i] = getSavedBaseWorkloadFunc(workloadID, appName, baseWorkloads, i) + i++ + } + err := parallel.RunFirstErr(fns...) 
+ if err != nil { + return nil, err + } + + baseWorkloadMap := make(map[string]*BaseWorkload) + for _, baseWorkload := range baseWorkloads { + if baseWorkload != nil { + baseWorkloadMap[baseWorkload.WorkloadID] = baseWorkload + } + } + return baseWorkloadMap, err +} + +func getSavedBaseWorkloadFunc(workloadID string, appName string, baseWorkloads []*BaseWorkload, i int) func() error { + return func() error { + baseWorkload, err := getSavedBaseWorkload(workloadID, appName) + if err != nil { + return err + } + baseWorkloads[i] = baseWorkload + return nil + } +} diff --git a/pkg/operator/workloads/saved_base_workload_cache.go b/pkg/operator/workloads/saved_base_workload_cache.go new file mode 100644 index 0000000000..e93617054f --- /dev/null +++ b/pkg/operator/workloads/saved_base_workload_cache.go @@ -0,0 +1,73 @@ +/* +Copyright 2019 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloads + +import ( + "sync" + + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" +) + +// appName -> map(workloadID -> *BaseWorkload) +var baseWorkloadCache = struct { + m map[string]map[string]*BaseWorkload + sync.RWMutex +}{m: make(map[string]map[string]*BaseWorkload)} + +func getCachedBaseWorkload(workloadID string, appName string) (*BaseWorkload, bool) { + baseWorkloadCache.RLock() + defer baseWorkloadCache.RUnlock() + if _, ok := baseWorkloadCache.m[appName]; ok { + if baseWorkload, ok := baseWorkloadCache.m[appName][workloadID]; ok { + if baseWorkload != nil { + return baseWorkload.Copy(), true + } + } + } + return nil, false +} + +func isBaseWorkloadCached(baseWorkload *BaseWorkload) bool { + cachedBaseWorkload, _ := getCachedBaseWorkload(baseWorkload.WorkloadID, baseWorkload.AppName) + return BaseWorkloadPtrsEqual(baseWorkload, cachedBaseWorkload) +} + +func cacheBaseWorkload(baseWorkload *BaseWorkload) { + baseWorkloadCache.Lock() + defer baseWorkloadCache.Unlock() + if _, ok := baseWorkloadCache.m[baseWorkload.AppName]; !ok { + baseWorkloadCache.m[baseWorkload.AppName] = make(map[string]*BaseWorkload) + } + baseWorkloadCache.m[baseWorkload.AppName][baseWorkload.WorkloadID] = baseWorkload.Copy() +} + +// app name -> workload IDs +func uncacheBaseWorkloads(currentWorkloadIDs map[string]strset.Set) { + baseWorkloadCache.Lock() + defer baseWorkloadCache.Unlock() + for appName := range baseWorkloadCache.m { + if _, ok := currentWorkloadIDs[appName]; !ok { + delete(baseWorkloadCache.m, appName) + } else { + for workloadID := range baseWorkloadCache.m[appName] { + if !currentWorkloadIDs[appName].Has(workloadID) { + delete(baseWorkloadCache.m[appName], workloadID) + } + } + } + } +} diff --git a/pkg/operator/workloads/shared.go b/pkg/operator/workloads/shared.go index 560075b898..91e788483e 100644 --- a/pkg/operator/workloads/shared.go +++ b/pkg/operator/workloads/shared.go @@ -18,27 +18,62 @@ package workloads import ( "github.com/cortexlabs/cortex/pkg/lib/random" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/api/context" 
"github.com/cortexlabs/cortex/pkg/operator/api/resource" ) +// k8s needs all characters to be lower case, and the first to be a letter func generateWorkloadID() string { - // k8s needs all characters to be lower case, and the first to be a letter return random.LowercaseLetters(1) + random.LowercaseString(19) } -func checkResourceCached(res context.ComputedResource, ctx *context.Context) (bool, error) { - workloadID := res.GetWorkloadID() - if workloadID == "" { - return false, nil - } +// Check if all resourceIDs have succeeded (only data resource types) +func areAllDataResourcesSucceeded(ctx *context.Context, resourceIDs strset.Set) (bool, error) { + resourceWorkloadIDs := ctx.DataResourceWorkloadIDs() + for resourceID := range resourceIDs { + workloadID := resourceWorkloadIDs[resourceID] + if workloadID == "" { + continue + } + + savedStatus, err := getDataSavedStatus(resourceID, workloadID, ctx.App.Name) + if err != nil { + return false, err + } - savedStatus, err := getDataSavedStatus(res.GetID(), workloadID, ctx.App.Name) - if err != nil { - return false, err + if savedStatus == nil || savedStatus.ExitCode != resource.ExitCodeDataSucceeded { + return false, nil + } } - if savedStatus != nil && savedStatus.ExitCode == resource.ExitCodeDataSucceeded { - return true, nil + + return true, nil +} + +// Check if any resourceIDs have succeeded (only data resource types) +func areAnyDataResourcesFailed(ctx *context.Context, resourceIDs strset.Set) (bool, error) { + resourceWorkloadIDs := ctx.DataResourceWorkloadIDs() + for resourceID := range resourceIDs { + workloadID := resourceWorkloadIDs[resourceID] + if workloadID == "" { + continue + } + + savedStatus, err := getDataSavedStatus(resourceID, workloadID, ctx.App.Name) + if err != nil { + return false, err + } + + if savedStatus != nil && savedStatus.ExitCode != resource.ExitCodeDataSucceeded && savedStatus.ExitCode != resource.ExitCodeDataUnknown { + return true, nil + } } + return false, nil } + +// Check if all dependencies of targetResourceIDs have succeeded (only data resource types) +func areAllDataDependenciesSucceeded(ctx *context.Context, targetResourceIDs strset.Set) (bool, error) { + dependencies := ctx.DirectComputedResourceDependencies(targetResourceIDs.Slice()...) 
+ return areAllDataResourcesSucceeded(ctx, dependencies) +} diff --git a/pkg/operator/workloads/data_job.go b/pkg/operator/workloads/spark_workload.go similarity index 55% rename from pkg/operator/workloads/data_job.go rename to pkg/operator/workloads/spark_workload.go index 0e119ed122..39245b0d63 100644 --- a/pkg/operator/workloads/data_job.go +++ b/pkg/operator/workloads/spark_workload.go @@ -21,11 +21,8 @@ import ( "strings" sparkop "github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/apis/sparkoperator.k8s.io/v1alpha1" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/argo" - "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" @@ -36,16 +33,112 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/config" ) -func dataJobSpec( - ctx *context.Context, - shouldIngest bool, - rawColumns strset.Set, - aggregates strset.Set, - transformedColumns strset.Set, - trainingDatasets strset.Set, - workloadID string, - sparkCompute *userconfig.SparkCompute, -) *sparkop.SparkApplication { +type SparkWorkload struct { + BaseWorkload +} + +func sparkResources(ctx *context.Context) []context.ComputedResource { + var sparkResources []context.ComputedResource + for _, rawColumn := range ctx.RawColumns { + sparkResources = append(sparkResources, rawColumn) + } + for _, aggregate := range ctx.Aggregates { + sparkResources = append(sparkResources, aggregate) + } + for _, transformedColumn := range ctx.TransformedColumns { + sparkResources = append(sparkResources, transformedColumn) + } + for _, model := range ctx.Models { + sparkResources = append(sparkResources, model.Dataset) + } + return sparkResources +} + +func populateSparkWorkloadIDs(ctx *context.Context, latestResourceWorkloadIDs map[string]string) { + sparkWorkloadID := generateWorkloadID() + for _, res := range sparkResources(ctx) { + if res.GetWorkloadID() != "" { + continue + } + if workloadID := latestResourceWorkloadIDs[res.GetID()]; workloadID != "" { + res.SetWorkloadID(workloadID) + continue + } + res.SetWorkloadID(sparkWorkloadID) + } +} + +func extractSparkWorkloads(ctx *context.Context) []Workload { + workloadMap := make(map[string]*SparkWorkload) + + for _, res := range sparkResources(ctx) { + if _, ok := workloadMap[res.GetWorkloadID()]; !ok { + workloadMap[res.GetWorkloadID()] = &SparkWorkload{ + emptyBaseWorkload(ctx.App.Name, res.GetWorkloadID(), workloadTypeSpark), + } + } + workloadMap[res.GetWorkloadID()].AddResource(res) + } + + workloads := make([]Workload, 0, len(workloadMap)) + for _, workload := range workloadMap { + workloads = append(workloads, workload) + } + return workloads +} + +func (sw *SparkWorkload) Start(ctx *context.Context) error { + rawDatasetExists, err := config.AWS.IsS3File(filepath.Join(ctx.RawDataset.Key, "_SUCCESS")) + if err != nil { + return errors.Wrap(err, ctx.App.Name, "raw dataset") + } + shouldIngest := !rawDatasetExists + + rawColumns := strset.New() + aggregates := strset.New() + transformedColumns := strset.New() + trainingDatasets := strset.New() + + var sparkCompute *userconfig.SparkCompute + + if shouldIngest { + for _, rawColumn := range ctx.RawColumns { + sparkCompute = userconfig.MaxSparkCompute(sparkCompute, rawColumn.GetCompute()) + } + } + + for _, rawColumn := range ctx.RawColumns { + if sw.CreatesResource(rawColumn.GetID()) { + rawColumns.Add(rawColumn.GetID()) + 
sparkCompute = userconfig.MaxSparkCompute(sparkCompute, rawColumn.GetCompute()) + } + } + for _, aggregate := range ctx.Aggregates { + if sw.CreatesResource(aggregate.ID) { + aggregates.Add(aggregate.ID) + sparkCompute = userconfig.MaxSparkCompute(sparkCompute, aggregate.Compute) + } + } + for _, transformedColumn := range ctx.TransformedColumns { + if sw.CreatesResource(transformedColumn.ID) { + transformedColumns.Add(transformedColumn.ID) + sparkCompute = userconfig.MaxSparkCompute(sparkCompute, transformedColumn.Compute) + } + } + for _, model := range ctx.Models { + dataset := model.Dataset + if sw.CreatesResource(dataset.ID) { + trainingDatasets.Add(dataset.ID) + sparkCompute = userconfig.MaxSparkCompute(sparkCompute, model.DatasetCompute) + + dependencyIDs := ctx.AllComputedResourceDependencies(dataset.ID) + for _, transformedColumn := range ctx.TransformedColumns { + if _, ok := dependencyIDs[transformedColumn.ID]; ok { + sparkCompute = userconfig.MaxSparkCompute(sparkCompute, transformedColumn.Compute) + } + } + } + } args := []string{ "--raw-columns=" + strings.Join(rawColumns.Slice(), ","), @@ -56,12 +149,24 @@ func dataJobSpec( if shouldIngest { args = append(args, "--ingest") } - spec := sparkSpec(workloadID, ctx, workloadTypeData, sparkCompute, args...) - argo.EnableGC(spec) - return spec + + spec := sparkSpec(sw.WorkloadID, ctx, workloadTypeSpark, sparkCompute, args...) + _, err = config.Spark.Create(spec) + if err != nil { + return err + } + + return nil } -func sparkSpec(workloadID string, ctx *context.Context, workloadType string, sparkCompute *userconfig.SparkCompute, args ...string) *sparkop.SparkApplication { +func sparkSpec( + workloadID string, + ctx *context.Context, + workloadType string, + sparkCompute *userconfig.SparkCompute, + args ...string, +) *sparkop.SparkApplication { + var driverMemOverhead *string if sparkCompute.DriverMemOverhead != nil { driverMemOverhead = pointer.String(s.Int64(sparkCompute.DriverMemOverhead.ToKi()) + "k") @@ -75,19 +180,13 @@ func sparkSpec(workloadID string, ctx *context.Context, workloadType string, spa memOverheadFactor = pointer.String(s.Float64(*sparkCompute.MemOverheadFactor)) } - return &sparkop.SparkApplication{ - TypeMeta: kmeta.TypeMeta{ - APIVersion: "sparkoperator.k8s.io/v1alpha1", - Kind: "SparkApplication", - }, - ObjectMeta: kmeta.ObjectMeta{ - Name: workloadID, - Namespace: config.Cortex.Namespace, - Labels: map[string]string{ - "workloadID": workloadID, - "workloadType": workloadType, - "appName": ctx.App.Name, - }, + return spark.App(&spark.Spec{ + Name: workloadID, + Namespace: config.Cortex.Namespace, + Labels: map[string]string{ + "workloadID": workloadID, + "workloadType": workloadType, + "appName": ctx.App.Name, }, Spec: sparkop.SparkApplicationSpec{ Type: sparkop.PythonApplicationType, @@ -169,115 +268,25 @@ func sparkSpec(workloadID string, ctx *context.Context, workloadType string, spa Instances: &sparkCompute.Executors, }, }, - } + }) } -func dataWorkloadSpecs(ctx *context.Context) ([]*WorkloadSpec, error) { - workloadID := generateWorkloadID() - - rawFileExists, err := config.AWS.IsS3File(filepath.Join(ctx.RawDataset.Key, "_SUCCESS")) - if err != nil { - return nil, errors.Wrap(err, ctx.App.Name, "raw dataset") - } - - var allComputes []*userconfig.SparkCompute - - shouldIngest := !rawFileExists - if shouldIngest { - externalPath := ctx.Environment.Data.GetPath() - externalDataExists, err := aws.IsS3aPathPrefixExternal(externalPath) - if !externalDataExists || err != nil { - return nil, 
errors.Wrap(userconfig.ErrorExternalNotFound(externalPath), ctx.App.Name, userconfig.Identify(ctx.Environment), userconfig.DataKey, userconfig.PathKey) - } - for _, rawColumn := range ctx.RawColumns { - allComputes = append(allComputes, rawColumn.GetCompute()) - } - } - - rawColumnIDs := strset.New() - var rawColumns []string - for rawColumnName, rawColumn := range ctx.RawColumns { - isCached, err := checkResourceCached(rawColumn, ctx) - if err != nil { - return nil, err - } - if isCached { - continue - } - rawColumns = append(rawColumns, rawColumnName) - rawColumnIDs.Add(rawColumn.GetID()) - allComputes = append(allComputes, rawColumn.GetCompute()) - } - - aggregateIDs := strset.New() - var aggregates []string - for aggregateName, aggregate := range ctx.Aggregates { - isCached, err := checkResourceCached(aggregate, ctx) - if err != nil { - return nil, err - } - if isCached { - continue - } - aggregates = append(aggregates, aggregateName) - aggregateIDs.Add(aggregate.GetID()) - allComputes = append(allComputes, aggregate.Compute) - } - - transformedColumnIDs := strset.New() - var transformedColumns []string - for transformedColumnName, transformedColumn := range ctx.TransformedColumns { - isCached, err := checkResourceCached(transformedColumn, ctx) - if err != nil { - return nil, err - } - if isCached { - continue - } - transformedColumns = append(transformedColumns, transformedColumnName) - transformedColumnIDs.Add(transformedColumn.GetID()) - allComputes = append(allComputes, transformedColumn.Compute) - } - - trainingDatasetIDs := strset.New() - var trainingDatasets []string - for modelName, model := range ctx.Models { - dataset := model.Dataset - isCached, err := checkResourceCached(dataset, ctx) - if err != nil { - return nil, err - } - if isCached { - continue - } - trainingDatasets = append(trainingDatasets, modelName) - trainingDatasetIDs.Add(dataset.GetID()) - dependencyIDs := ctx.AllComputedResourceDependencies(dataset.GetID()) - for _, transformedColumn := range ctx.TransformedColumns { - if _, ok := dependencyIDs[transformedColumn.ID]; ok { - allComputes = append(allComputes, transformedColumn.Compute) - } - } - allComputes = append(allComputes, model.DatasetCompute) - } +func (sw *SparkWorkload) IsStarted(ctx *context.Context) (bool, error) { + return config.Spark.Exists(sw.WorkloadID) +} - resourceIDSet := strset.Union(rawColumnIDs, aggregateIDs, transformedColumnIDs, trainingDatasetIDs) +func (sw *SparkWorkload) IsRunning(ctx *context.Context) (bool, error) { + return config.Spark.IsRunning(sw.WorkloadID) +} - if !shouldIngest && len(resourceIDSet) == 0 { - return nil, nil - } +func (sw *SparkWorkload) CanRun(ctx *context.Context) (bool, error) { + return areAllDataDependenciesSucceeded(ctx, sw.GetResourceIDs()) +} - sparkCompute := userconfig.MaxSparkCompute(allComputes...) 
- spec := dataJobSpec(ctx, shouldIngest, rawColumnIDs, aggregateIDs, transformedColumnIDs, trainingDatasetIDs, workloadID, sparkCompute) +func (sw *SparkWorkload) IsSucceeded(ctx *context.Context) (bool, error) { + return areAllDataResourcesSucceeded(ctx, sw.GetResourceIDs()) +} - workloadSpec := &WorkloadSpec{ - WorkloadID: workloadID, - ResourceIDs: resourceIDSet, - K8sSpecs: []kmeta.Object{spec}, - K8sAction: "create", - SuccessCondition: spark.SuccessCondition, - FailureCondition: spark.FailureCondition, - WorkloadType: workloadTypeData, - } - return []*WorkloadSpec{workloadSpec}, nil +func (sw *SparkWorkload) IsFailed(ctx *context.Context) (bool, error) { + return areAnyDataResourcesFailed(ctx, sw.GetResourceIDs()) } diff --git a/pkg/operator/workloads/training_job.go b/pkg/operator/workloads/training_workload.go similarity index 53% rename from pkg/operator/workloads/training_job.go rename to pkg/operator/workloads/training_workload.go index 0ab4140741..e039971660 100644 --- a/pkg/operator/workloads/training_job.go +++ b/pkg/operator/workloads/training_workload.go @@ -17,13 +17,10 @@ limitations under the License. package workloads import ( - kbatch "k8s.io/api/batch/v1" kcore "k8s.io/api/core/v1" kresource "k8s.io/apimachinery/pkg/api/resource" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/argo" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/api/context" @@ -31,12 +28,54 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/config" ) -func trainingJobSpec( - ctx *context.Context, - modelID string, - workloadID string, - tfCompute *userconfig.TFCompute, -) *kbatch.Job { +type TrainingWorkload struct { + BaseWorkload +} + +func populateTrainingWorkloadIDs(ctx *context.Context, latestResourceWorkloadIDs map[string]string) { + trainingWorkloadIDs := make(map[string]string) + + for _, model := range ctx.Models { + if model.WorkloadID != "" { + continue + } + if workloadID := latestResourceWorkloadIDs[model.ID]; workloadID != "" { + model.WorkloadID = workloadID + continue + } + if workloadID, ok := trainingWorkloadIDs[model.ID]; ok { + // This is a duplicate model ID (different name) + model.WorkloadID = workloadID + continue + } + model.WorkloadID = generateWorkloadID() + trainingWorkloadIDs[model.ID] = model.WorkloadID + } +} + +func extractTrainingWorkloads(ctx *context.Context) []Workload { + workloads := make([]Workload, 0, len(ctx.Models)) + modelIDs := strset.New() + + for _, model := range ctx.Models { + if !modelIDs.Has(model.ID) { + workloads = append(workloads, &TrainingWorkload{ + singleBaseWorkload(model, ctx.App.Name, workloadTypeTrain), + }) + modelIDs.Add(model.ID) + } + } + + return workloads +} + +func (tw *TrainingWorkload) Start(ctx *context.Context) error { + var tfCompute *userconfig.TFCompute + for _, model := range ctx.Models { + if tw.CreatesResource(model.ID) { + tfCompute = userconfig.MaxTFCompute(tfCompute, model.Compute) + } + } resourceList := kcore.ResourceList{} limitsList := kcore.ResourceList{} @@ -52,18 +91,18 @@ func trainingJobSpec( limitsList["nvidia.com/gpu"] = *kresource.NewQuantity(tfCompute.GPU, kresource.DecimalSI) } - spec := k8s.Job(&k8s.JobSpec{ - Name: workloadID, + spec := &k8s.JobSpec{ + Name: tw.WorkloadID, Labels: map[string]string{ "appName": ctx.App.Name, "workloadType": workloadTypeTrain, - "workloadID": workloadID, + "workloadID": tw.WorkloadID, }, PodSpec: 
k8s.PodSpec{ Labels: map[string]string{ "appName": ctx.App.Name, "workloadType": workloadTypeTrain, - "workloadID": workloadID, + "workloadID": tw.WorkloadID, "userFacing": "true", }, K8sPodSpec: kcore.PodSpec{ @@ -74,10 +113,10 @@ func trainingJobSpec( Image: trainImage, ImagePullPolicy: "Always", Args: []string{ - "--workload-id=" + workloadID, + "--workload-id=" + tw.WorkloadID, "--context=" + config.AWS.S3Path(ctx.Key), "--cache-dir=" + consts.ContextCacheDir, - "--model=" + modelID, + "--model=" + tw.GetSingleResourceID(), }, Env: k8s.AWSCredentials(), VolumeMounts: k8s.DefaultVolumeMounts(), @@ -92,42 +131,31 @@ func trainingJobSpec( }, }, Namespace: config.Cortex.Namespace, - }) - argo.EnableGC(spec) - return spec + } + + _, err := config.Kubernetes.CreateJob(k8s.Job(spec)) + if err != nil { + return err + } + return nil } -func trainingWorkloadSpecs(ctx *context.Context) ([]*WorkloadSpec, error) { - modelsToTrain := make(map[string]*userconfig.TFCompute) - for _, model := range ctx.Models { - modelCached, err := checkResourceCached(model, ctx) - if err != nil { - return nil, err - } - if modelCached { - continue - } +func (tw *TrainingWorkload) IsStarted(ctx *context.Context) (bool, error) { + return config.Kubernetes.JobExists(tw.WorkloadID) +} - if tfCompute, ok := modelsToTrain[model.ID]; ok { - modelsToTrain[model.ID] = userconfig.MaxTFCompute(tfCompute, model.Compute) - } else { - modelsToTrain[model.ID] = model.Compute - } - } +func (tw *TrainingWorkload) IsRunning(ctx *context.Context) (bool, error) { + return config.Kubernetes.IsJobRunning(tw.WorkloadID) +} - var workloadSpecs []*WorkloadSpec - for modelID, tfCompute := range modelsToTrain { - workloadID := generateWorkloadID() - workloadSpecs = append(workloadSpecs, &WorkloadSpec{ - WorkloadID: workloadID, - ResourceIDs: strset.New(modelID), - K8sSpecs: []kmeta.Object{trainingJobSpec(ctx, modelID, workloadID, tfCompute)}, - K8sAction: "create", - SuccessCondition: k8s.JobSuccessCondition, - FailureCondition: k8s.JobFailureCondition, - WorkloadType: workloadTypeTrain, - }) - } +func (tw *TrainingWorkload) CanRun(ctx *context.Context) (bool, error) { + return areAllDataDependenciesSucceeded(ctx, tw.GetResourceIDs()) +} + +func (tw *TrainingWorkload) IsSucceeded(ctx *context.Context) (bool, error) { + return areAllDataResourcesSucceeded(ctx, tw.GetResourceIDs()) +} - return workloadSpecs, nil +func (tw *TrainingWorkload) IsFailed(ctx *context.Context) (bool, error) { + return areAnyDataResourcesFailed(ctx, tw.GetResourceIDs()) } diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go index 526d71ee4f..80d722ef4c 100644 --- a/pkg/operator/workloads/workflow.go +++ b/pkg/operator/workloads/workflow.go @@ -17,200 +17,91 @@ limitations under the License. 
package workloads import ( - "fmt" "path/filepath" - "strings" - - awfv1 "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1" - ghodssyaml "github.com/ghodss/yaml" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/argo" + "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/json" - "github.com/cortexlabs/cortex/pkg/lib/slices" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/api/context" + "github.com/cortexlabs/cortex/pkg/operator/api/userconfig" "github.com/cortexlabs/cortex/pkg/operator/config" - ocontext "github.com/cortexlabs/cortex/pkg/operator/context" ) func Init() error { - workflows, err := config.Argo.List(nil) + err := reloadCurrentContexts() if err != nil { - return errors.Wrap(err, "init", "argo", "list") + return errors.Wrap(err, "init") } - for _, wf := range workflows { - ctx, err := ocontext.DownloadContext(wf.Labels["ctxID"], wf.Labels["appName"]) - if err != nil { - fmt.Println("Deleting stale workflow:", wf.Name) - config.Argo.Delete(wf.Name) - } else { - setCurrentContext(ctx) - } - } + go cronRunner() return nil } -func Create(ctx *context.Context) (*awfv1.Workflow, error) { - err := populateLatestWorkloadIDs(ctx) +func PopulateWorkloadIDs(ctx *context.Context) error { + resourceIDs := ctx.ComputedResourceIDs() + latestResourceWorkloadIDs, err := getSavedLatestWorkloadIDs(resourceIDs, ctx.App.Name) if err != nil { - return nil, err - } - - labels := map[string]string{ - "appName": ctx.App.Name, - "ctxID": ctx.ID, + return err } - wf := config.Argo.NewWorkflow(ctx.App.Name, labels) - var allSpecs []*WorkloadSpec + populatePythonPackageWorkloadIDs(ctx, latestResourceWorkloadIDs) + populateSparkWorkloadIDs(ctx, latestResourceWorkloadIDs) + populateTrainingWorkloadIDs(ctx, latestResourceWorkloadIDs) + populateAPIWorkloadIDs(ctx, latestResourceWorkloadIDs) - pythonPackageJobSpecs, err := pythonPackageWorkloadSpecs(ctx) - if err != nil { - return nil, err + if err := ctx.CheckAllWorkloadIDsPopulated(); err != nil { + return err } - allSpecs = append(allSpecs, pythonPackageJobSpecs...) + return nil +} - if ctx.Environment != nil { - dataJobSpecs, err := dataWorkloadSpecs(ctx) - if err != nil { - return nil, err - } - allSpecs = append(allSpecs, dataJobSpecs...) +func extractWorkloads(ctx *context.Context) []Workload { + var workloads []Workload + workloads = append(workloads, extractPythonPackageWorkloads(ctx)...) + workloads = append(workloads, extractSparkWorkloads(ctx)...) + workloads = append(workloads, extractTrainingWorkloads(ctx)...) + workloads = append(workloads, extractAPIWorkloads(ctx)...) + return workloads +} - trainingJobSpecs, err := trainingWorkloadSpecs(ctx) +func ValidateDeploy(ctx *context.Context) error { + if ctx.Environment != nil { + rawDatasetExists, err := config.AWS.IsS3File(filepath.Join(ctx.RawDataset.Key, "_SUCCESS")) if err != nil { - return nil, err - } - allSpecs = append(allSpecs, trainingJobSpecs...) - } - - apiSpecs, err := apiWorkloadSpecs(ctx) - if err != nil { - return nil, err - } - allSpecs = append(allSpecs, apiSpecs...) 
- - resourceWorkloadIDs := make(map[string]string) - for _, spec := range allSpecs { - for resourceID := range spec.ResourceIDs { - resourceWorkloadIDs[resourceID] = spec.WorkloadID + return errors.Wrap(err, ctx.App.Name, "raw dataset") } - } - ctx.PopulateWorkloadIDs(resourceWorkloadIDs) - - for _, spec := range allSpecs { - var dependencyWorkloadIDs []string - for resourceID := range spec.ResourceIDs { - for dependencyResourceID := range ctx.AllComputedResourceDependencies(resourceID) { - workloadID := resourceWorkloadIDs[dependencyResourceID] - if workloadID != "" && workloadID != spec.WorkloadID { - dependencyWorkloadIDs = append(dependencyWorkloadIDs, workloadID) - } - } - } - - var combinedManifest string - - switch len(spec.K8sSpecs) { - case 0: - return nil, errors.New("a kubernetes manifest must be specified") // unexpected internal error - case 1: - manifestBytes, err := json.Marshal(spec.K8sSpecs[0]) - if err != nil { - return nil, errors.Wrap(err, ctx.App.Name, "workloads", spec.WorkloadID) - } - combinedManifest = string(manifestBytes) - default: // >1 - if spec.SuccessCondition != "" || spec.FailureCondition != "" { - return nil, errors.New("success and failure conditions are not permitted with multiple manifests") // unexpected internal error + if !rawDatasetExists { + externalPath := ctx.Environment.Data.GetPath() + externalDataExists, err := aws.IsS3aPathPrefixExternal(externalPath) + if !externalDataExists || err != nil { + return errors.Wrap(userconfig.ErrorExternalNotFound(externalPath), ctx.App.Name, userconfig.Identify(ctx.Environment), userconfig.DataKey, userconfig.PathKey) } - manifests := make([]string, len(spec.K8sSpecs)) - for i, k8sSpec := range spec.K8sSpecs { - manifestJSON, err := json.Marshal(k8sSpec) - if err != nil { - return nil, errors.Wrap(err, ctx.App.Name, "workloads", spec.WorkloadID) - } - manifestYAML, err := ghodssyaml.JSONToYAML(manifestJSON) - if err != nil { - return nil, errors.Wrap(err, ctx.App.Name, "workloads", spec.WorkloadID) - } - manifests[i] = string(manifestYAML) - } - combinedManifest = strings.Join(manifests, "\n\n---\n\n") - } - - argo.AddTask(wf, &argo.WorkflowTask{ - Name: spec.WorkloadID, - Action: spec.K8sAction, - Manifest: combinedManifest, - SuccessCondition: spec.SuccessCondition, - FailureCondition: spec.FailureCondition, - Dependencies: slices.UniqueStrings(dependencyWorkloadIDs), - Labels: map[string]string{ - "appName": ctx.App.Name, - "workloadType": spec.WorkloadType, - "workloadID": spec.WorkloadID, - }, - }) - - err = uploadWorkloadSpec(spec, ctx) - if err != nil { - return nil, err } } - return wf, nil + return nil } -func populateLatestWorkloadIDs(ctx *context.Context) error { - resourceIDs := ctx.ComputedResourceIDs() - resourceWorkloadIDs, err := getSavedLatestWorkloadIDs(resourceIDs, ctx.App.Name) - if err != nil { +func Run(ctx *context.Context) error { + if err := ctx.CheckAllWorkloadIDsPopulated(); err != nil { return err - } - ctx.PopulateWorkloadIDs(resourceWorkloadIDs) - return nil -} -func Run(wf *awfv1.Workflow, ctx *context.Context, existingWf *awfv1.Workflow) error { - err := ctx.CheckAllWorkloadIDsPopulated() + prevCtx := CurrentContext(ctx.App.Name) + err := deleteOldDataJobs(prevCtx) if err != nil { return err } - if existingWf != nil { - existingCtx := CurrentContext(ctx.App.Name) - if wf.Labels["appName"] != existingWf.Labels["appName"] { - return ErrorWorkflowAppMismatch() - } - if existingCtx != nil && ctx.App.Name != existingCtx.App.Name { - return ErrorContextAppMismatch() - } - - err := 
Stop(existingWf, existingCtx) - if err != nil { - return err - } - } - - err = config.Argo.Run(wf) - if err != nil { - return errors.Wrap(err, ctx.App.Name) - } + deleteOldAPIs(ctx) - err = createServicesAndIngresses(ctx) + err = setCurrentContext(ctx) if err != nil { return err } - deleteOldAPIs(ctx) - - setCurrentContext(ctx) - resourceWorkloadIDs := ctx.ComputedResourceResourceWorkloadIDs() err = uploadLatestWorkloadIDs(resourceWorkloadIDs, ctx.App.Name) if err != nil { @@ -220,20 +111,26 @@ func Run(wf *awfv1.Workflow, ctx *context.Context, existingWf *awfv1.Workflow) e uncacheDataSavedStatuses(resourceWorkloadIDs, ctx.App.Name) uncacheLatestWorkloadIDs(ctx.ComputedResourceIDs(), ctx.App.Name) + runCronNow() + return nil } -func Stop(wf *awfv1.Workflow, ctx *context.Context) error { - if wf == nil { +func deleteOldDataJobs(ctx *context.Context) error { + if ctx == nil { return nil } - _, err := config.Argo.Delete(wf.Name) - if err != nil { - return errors.Wrap(err, ctx.App.Name) + jobs, _ := config.Kubernetes.ListJobsByLabel("appName", ctx.App.Name) + for _, job := range jobs { + config.Kubernetes.DeleteJob(job.Name) + } + sparkApps, _ := config.Spark.ListByLabel("appName", ctx.App.Name) + for _, sparkApp := range sparkApps { + config.Spark.Delete(sparkApp.Name) } - err = updateKilledDataSavedStatuses(ctx) + err := updateKilledDataSavedStatuses(ctx) if err != nil { return err } @@ -242,15 +139,6 @@ func Stop(wf *awfv1.Workflow, ctx *context.Context) error { } func DeleteApp(appName string, keepCache bool) bool { - ctx := CurrentContext(appName) - wasDeployed := false - - if ctx != nil { - wf, _ := GetWorkflow(appName) - Stop(wf, ctx) - wasDeployed = true - } - deployments, _ := config.Kubernetes.ListDeploymentsByLabel("appName", appName) for _, deployment := range deployments { config.Kubernetes.DeleteDeployment(deployment.Name) @@ -271,11 +159,21 @@ func DeleteApp(appName string, keepCache bool) bool { for _, job := range jobs { config.Kubernetes.DeleteJob(job.Name) } + sparkApps, _ := config.Spark.ListByLabel("appName", appName) + for _, sparkApp := range sparkApps { + config.Spark.Delete(sparkApp.Name) + } pods, _ := config.Kubernetes.ListPodsByLabel("appName", appName) for _, pod := range pods { config.Kubernetes.DeletePod(pod.Name) } + wasDeployed := false + if ctx := CurrentContext(appName); ctx != nil { + updateKilledDataSavedStatuses(ctx) + wasDeployed = true + } + deleteCurrentContext(appName) uncacheDataSavedStatuses(nil, appName) uncacheLatestWorkloadIDs(nil, appName) @@ -287,18 +185,138 @@ func DeleteApp(appName string, keepCache bool) bool { return wasDeployed } -func GetWorkflow(appName string) (*awfv1.Workflow, error) { - wfs, err := config.Argo.ListByLabel("appName", appName) +func UpdateWorkflows() error { + currentWorkloadIDs := make(map[string]strset.Set) + + for _, ctx := range CurrentContexts() { + err := updateWorkflow(ctx) + if err != nil { + return err + } + + currentWorkloadIDs[ctx.App.Name] = ctx.ComputedResourceWorkloadIDs() + } + + uncacheBaseWorkloads(currentWorkloadIDs) + + return nil +} + +func updateWorkflow(ctx *context.Context) error { + workloads := extractWorkloads(ctx) + + err := uploadBaseWorkloadsFromWorkloads(workloads) if err != nil { - return nil, errors.Wrap(err, appName) + return err + } + + for _, workload := range workloads { + isSucceeded, err := workload.IsSucceeded(ctx) + if err != nil { + return err + } + if isSucceeded { + continue + } + + isFailed, err := workload.IsFailed(ctx) + if err != nil { + return err + } + if isFailed { + 
continue + } + + isStarted, err := workload.IsStarted(ctx) + if err != nil { + return err + } + if isStarted { + continue + } + + canRun, err := workload.CanRun(ctx) + if err != nil { + return err + } + if !canRun { + continue + } + + err = workload.Start(ctx) + if err != nil { + return err + } } - if len(wfs) > 1 { - return nil, errors.Wrap(ErrorMoreThanOneWorkflow(), appName) + + return nil +} + +func IsWorkloadEnded(appName string, workloadID string) (bool, error) { + ctx := CurrentContext(appName) + if ctx == nil { + return false, nil + } + + for _, workload := range extractWorkloads(ctx) { + if workload.GetWorkloadID() == workloadID { + isSucceeded, err := workload.IsSucceeded(ctx) + if err != nil { + return false, err + } + if isSucceeded { + return true, nil + } + + isFailed, err := workload.IsFailed(ctx) + if err != nil { + return false, err + } + if isFailed { + return true, nil + } + + return false, nil + } } - if len(wfs) == 0 { - return nil, nil + return false, errors.New("workload not found in the current context") +} + +func IsDeploymentUpdating(appName string) (bool, error) { + ctx := CurrentContext(appName) + if ctx == nil { + return false, nil + } + + for _, workload := range extractWorkloads(ctx) { + isSucceeded, err := workload.IsSucceeded(ctx) + if err != nil { + return false, err + } + if isSucceeded { + continue + } + + isFailed, err := workload.IsFailed(ctx) + if err != nil { + return false, err + } + if isFailed { + continue + } + + canRun, err := workload.CanRun(ctx) + if err != nil { + return false, err + } + if !canRun { + continue + } + + // It's either running or can run + return true, nil } - return &wfs[0], nil + return false, nil } diff --git a/pkg/operator/workloads/workload.go b/pkg/operator/workloads/workload.go new file mode 100644 index 0000000000..63e53d5584 --- /dev/null +++ b/pkg/operator/workloads/workload.go @@ -0,0 +1,185 @@ +/* +Copyright 2019 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloads + +import ( + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" + "github.com/cortexlabs/cortex/pkg/operator/api/context" +) + +const ( + workloadTypeAPI = "api" + workloadTypeSpark = "spark-job" + workloadTypeTrain = "training-job" + workloadTypePythonPackager = "python-packager" +) + +type Workload interface { + BaseWorkloadInterface + CanRun(*context.Context) (bool, error) // All of the dependencies are satisfied and the workload can be started + Start(*context.Context) error // Start the workload + IsStarted(*context.Context) (bool, error) // The workload was started on the most recent deploy (might be running, succeeded, or failed). 
It's ok if this doesn't remain accurate across cx deploys + IsRunning(*context.Context) (bool, error) // The workload is currently running + IsSucceeded(*context.Context) (bool, error) // The workload succeeded + IsFailed(*context.Context) (bool, error) // The workload failed +} + +type BaseWorkload struct { + AppName string + WorkloadID string + WorkloadType string + Resources map[string]context.ResourceFields +} + +type BaseWorkloadInterface interface { + GetAppName() string + GetWorkloadID() string + GetWorkloadType() string + GetResources() map[string]context.ResourceFields + CreatesResource(resourceID string) bool + AddResource(res context.ComputedResource) + GetResourceIDs() strset.Set + GetSingleResourceID() string + GetBaseWorkloadPtr() *BaseWorkload +} + +func (bw *BaseWorkload) GetBaseWorkloadPtr() *BaseWorkload { + return bw +} + +func (bw *BaseWorkload) GetAppName() string { + return bw.AppName +} + +func (bw *BaseWorkload) GetWorkloadID() string { + return bw.WorkloadID +} + +func (bw *BaseWorkload) GetWorkloadType() string { + return bw.WorkloadType +} + +func (bw *BaseWorkload) GetResources() map[string]context.ResourceFields { + if bw.Resources == nil { + bw.Resources = make(map[string]context.ResourceFields) + } + return bw.Resources +} + +func (bw *BaseWorkload) GetResourceIDs() strset.Set { + resourceIDs := strset.NewWithSize(len(bw.Resources)) + for resourceID := range bw.Resources { + resourceIDs.Add(resourceID) + } + return resourceIDs +} + +func (bw *BaseWorkload) GetSingleResourceID() string { + for resourceID := range bw.Resources { + return resourceID + } + return "" +} + +func (bw *BaseWorkload) CreatesResource(resourceID string) bool { + if bw.Resources == nil { + bw.Resources = make(map[string]context.ResourceFields) + } + _, ok := bw.Resources[resourceID] + return ok +} + +func (bw *BaseWorkload) AddResource(res context.ComputedResource) { + if bw.Resources == nil { + bw.Resources = make(map[string]context.ResourceFields) + } + bw.Resources[res.GetID()] = context.ResourceFields{ + ID: res.GetID(), + ResourceType: res.GetResourceType(), + } +} + +func (bw *BaseWorkload) Copy() *BaseWorkload { + if bw == nil { + return nil + } + + copiedResources := make(map[string]context.ResourceFields, len(bw.Resources)) + for resID, res := range bw.Resources { + copiedResources[resID] = res + } + + return &BaseWorkload{ + AppName: bw.AppName, + WorkloadID: bw.WorkloadID, + WorkloadType: bw.WorkloadType, + Resources: copiedResources, + } +} + +func BaseWorkloadPtrsEqual(bw1 *BaseWorkload, bw2 *BaseWorkload) bool { + if bw1 == nil && bw2 == nil { + return true + } + if bw1 == nil || bw2 == nil { + return false + } + return bw1.Equal(*bw2) +} + +func (bw *BaseWorkload) Equal(bw2 BaseWorkload) bool { + if bw.AppName != bw2.AppName { + return false + } + if bw.WorkloadID != bw2.WorkloadID { + return false + } + if bw.WorkloadType != bw2.WorkloadType { + return false + } + if len(bw.Resources) != len(bw2.Resources) { + return false + } + for resID, res := range bw.Resources { + res2, ok := bw2.Resources[resID] + if !ok { + return false + } + if res.ID != res2.ID { + return false + } + if res.ResourceType != res2.ResourceType { + return false + } + } + return true +} + +func emptyBaseWorkload(appName string, workloadID string, workloadType string) BaseWorkload { + return BaseWorkload{ + AppName: appName, + WorkloadID: workloadID, + WorkloadType: workloadType, + Resources: make(map[string]context.ResourceFields), + } +} + +func singleBaseWorkload(res context.ComputedResource, 
appName string, workloadType string) BaseWorkload { + bw := emptyBaseWorkload(appName, res.GetWorkloadID(), workloadType) + bw.AddResource(res) + return bw +} diff --git a/pkg/operator/workloads/workload_spec.go b/pkg/operator/workloads/workload_spec.go deleted file mode 100644 index de605a512b..0000000000 --- a/pkg/operator/workloads/workload_spec.go +++ /dev/null @@ -1,176 +0,0 @@ -/* -Copyright 2019 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package workloads - -import ( - "time" - - kcore "k8s.io/api/core/v1" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/cortexlabs/cortex/pkg/lib/aws" - "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/pointer" - "github.com/cortexlabs/cortex/pkg/lib/sets/strset" - "github.com/cortexlabs/cortex/pkg/operator/api/context" - "github.com/cortexlabs/cortex/pkg/operator/api/resource" - "github.com/cortexlabs/cortex/pkg/operator/config" - ocontext "github.com/cortexlabs/cortex/pkg/operator/context" -) - -type WorkloadSpec struct { - WorkloadID string - ResourceIDs strset.Set - K8sSpecs []kmeta.Object - K8sAction string - SuccessCondition string - FailureCondition string - WorkloadType string -} - -type SavedWorkloadSpec struct { - AppName string - WorkloadID string - WorkloadType string - Resources map[string]*context.ResourceFields -} - -func uploadWorkloadSpec(workloadSpec *WorkloadSpec, ctx *context.Context) error { - if workloadSpec == nil { - return nil - } - - resources := make(map[string]*context.ResourceFields) - for resourceID := range workloadSpec.ResourceIDs { - resource := ctx.OneResourceByID(resourceID) - resources[resourceID] = &context.ResourceFields{ - ID: resource.GetID(), - ResourceType: resource.GetResourceType(), - } - } - - savedWorkloadSpec := SavedWorkloadSpec{ - AppName: ctx.App.Name, - WorkloadID: workloadSpec.WorkloadID, - WorkloadType: workloadSpec.WorkloadType, - Resources: resources, - } - - key := ocontext.WorkloadSpecKey(savedWorkloadSpec.WorkloadID, ctx.App.Name) - err := config.AWS.UploadJSONToS3(savedWorkloadSpec, key) - if err != nil { - return errors.Wrap(err, "upload workload spec", ctx.App.Name, savedWorkloadSpec.WorkloadID) - } - return nil -} - -func getSavedWorkloadSpec(workloadID string, appName string) (*SavedWorkloadSpec, error) { - key := ocontext.WorkloadSpecKey(workloadID, appName) - var savedWorkloadSpec SavedWorkloadSpec - err := config.AWS.ReadJSONFromS3(&savedWorkloadSpec, key) - if aws.IsNoSuchKeyErr(err) { - return nil, nil - } - if err != nil { - return nil, errors.Wrap(err, "download workload spec", appName, workloadID) - } - return &savedWorkloadSpec, nil -} - -func UpdateDataWorkflowErrors(failedPods []kcore.Pod) error { - checkedWorkloadIDs := strset.New() - nowTime := pointer.Time(time.Now()) - - for _, pod := range failedPods { - appName, ok := pod.Labels["appName"] - if !ok { - continue - } - workloadID, ok := pod.Labels["workloadID"] - if !ok { - continue - } - - if pod.Labels["workloadType"] == 
WorkloadTypeAPI { - continue - } - - if checkedWorkloadIDs.Has(workloadID) { - continue - } - checkedWorkloadIDs.Add(workloadID) - - savedWorkloadSpec, err := getSavedWorkloadSpec(workloadID, appName) - if err != nil { - return err - } - if savedWorkloadSpec == nil { - continue - } - - resourceWorkloadIDs := make(map[string]string, len(savedWorkloadSpec.Resources)) - for _, resource := range savedWorkloadSpec.Resources { - resourceWorkloadIDs[resource.ID] = workloadID - } - - savedStatuses, err := getDataSavedStatuses(resourceWorkloadIDs, appName) - if err != nil { - return err - } - - var savedStatusesToUpload []*resource.DataSavedStatus - for resourceID, res := range savedWorkloadSpec.Resources { - savedStatus := savedStatuses[resourceID] - - if savedStatus == nil { - savedStatus = &resource.DataSavedStatus{ - BaseSavedStatus: resource.BaseSavedStatus{ - ResourceID: resourceID, - ResourceType: res.ResourceType, - WorkloadID: workloadID, - AppName: appName, - }, - } - } - - if savedStatus.End == nil { - savedStatus.End = nowTime - if savedStatus.Start == nil { - savedStatus.Start = nowTime - } - - switch k8s.GetPodStatus(&pod) { - case k8s.PodStatusKilled: - savedStatus.ExitCode = resource.ExitCodeDataKilled - case k8s.PodStatusKilledOOM: - savedStatus.ExitCode = resource.ExitCodeDataOOM - default: - savedStatus.ExitCode = resource.ExitCodeDataFailed - } - - savedStatusesToUpload = append(savedStatusesToUpload, savedStatus) - } - } - - err = uploadDataSavedStatuses(savedStatusesToUpload) - if err != nil { - return err - } - } - return nil -}
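
The new updateWorkflow() in workflow.go applies the same check-then-start pass to every workload on each cron tick: anything that has already succeeded, failed, or been started is skipped, and the rest are started only once their data dependencies have succeeded. The sketch below (not part of the diff) is a minimal illustration of that control flow; the trimmed-down Workload interface here (no *context.Context parameters, no error returns on the status checks) and the toyWorkload type are hypothetical stand-ins, not the real pkg/operator/workloads types.

package main

import "fmt"

// Workload mirrors the lifecycle methods added in workload.go, simplified for
// this sketch (no *context.Context argument, no error returns on checks).
type Workload interface {
	ID() string
	IsSucceeded() bool
	IsFailed() bool
	IsStarted() bool
	CanRun() bool
	Start() error
}

// reconcile applies the same skip/start logic as updateWorkflow: terminal or
// already-started workloads are left alone, and the rest are started only when
// their dependencies allow it.
func reconcile(workloads []Workload) error {
	for _, w := range workloads {
		if w.IsSucceeded() || w.IsFailed() || w.IsStarted() {
			continue // nothing to do for this workload
		}
		if !w.CanRun() {
			continue // dependencies not satisfied yet; retry on the next tick
		}
		if err := w.Start(); err != nil {
			return err
		}
	}
	return nil
}

// toyWorkload is a hypothetical stand-in used only to exercise reconcile.
type toyWorkload struct {
	id                         string
	succeeded, failed, started bool
	depsSucceeded              bool
}

func (t *toyWorkload) ID() string        { return t.id }
func (t *toyWorkload) IsSucceeded() bool { return t.succeeded }
func (t *toyWorkload) IsFailed() bool    { return t.failed }
func (t *toyWorkload) IsStarted() bool   { return t.started }
func (t *toyWorkload) CanRun() bool      { return t.depsSucceeded }

func (t *toyWorkload) Start() error {
	t.started = true
	fmt.Println("started", t.id)
	return nil
}

func main() {
	workloads := []Workload{
		&toyWorkload{id: "python-packager", depsSucceeded: true},
		&toyWorkload{id: "spark-job"},                     // waits for the packager
		&toyWorkload{id: "training-job", succeeded: true}, // cached from a previous deploy
	}
	_ = reconcile(workloads) // only "python-packager" starts on this tick
}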
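
The new saved_base_workload_cache.go guards its appName -> workloadID -> *BaseWorkload map with an embedded sync.RWMutex and exchanges copies at the boundary, so callers can never mutate a cached entry in place. Below is a stripped-down sketch of that pattern, assuming a hypothetical entry type in place of BaseWorkload.

package main

import "sync"

// entry is a hypothetical stand-in for BaseWorkload in this sketch.
type entry struct {
	AppName    string
	WorkloadID string
}

func (e *entry) copy() *entry {
	if e == nil {
		return nil
	}
	c := *e
	return &c
}

// cache mirrors the shape of baseWorkloadCache: a nested map guarded by an
// embedded read/write mutex, with copies exchanged at the boundary.
var cache = struct {
	m map[string]map[string]*entry // appName -> workloadID -> *entry
	sync.RWMutex
}{m: make(map[string]map[string]*entry)}

func get(appName string, workloadID string) (*entry, bool) {
	cache.RLock()
	defer cache.RUnlock()
	if byID, ok := cache.m[appName]; ok {
		if e, ok := byID[workloadID]; ok && e != nil {
			return e.copy(), true // hand out a copy so callers can't mutate the cache
		}
	}
	return nil, false
}

func put(e *entry) {
	cache.Lock()
	defer cache.Unlock()
	if _, ok := cache.m[e.AppName]; !ok {
		cache.m[e.AppName] = make(map[string]*entry)
	}
	cache.m[e.AppName][e.WorkloadID] = e.copy() // store a copy for the same reason
}

func main() {
	put(&entry{AppName: "iris", WorkloadID: "wabc123"})
	if e, ok := get("iris", "wabc123"); ok {
		e.WorkloadID = "mutated" // only the returned copy changes; the cached entry is untouched
	}
}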
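
getSavedBaseWorkloads in saved_base_workload.go fans the per-workload S3 reads out through parallel.RunFirstErr, with each closure writing into its own preallocated slice slot so no locking is needed on the results. The sketch below approximates that fan-out using only the standard library; fetch and fetchAll are hypothetical stand-ins, not the cortex parallel package's implementation.

package main

import (
	"fmt"
	"sync"
)

// fetch is a hypothetical stand-in for getSavedBaseWorkload (an S3 read in the real code).
func fetch(workloadID string) (string, error) {
	return "base workload for " + workloadID, nil
}

// fetchAll mirrors the shape of getSavedBaseWorkloads: one goroutine per ID,
// each writing to its own slice index, with the first error retained.
func fetchAll(workloadIDs []string) ([]string, error) {
	results := make([]string, len(workloadIDs))
	var wg sync.WaitGroup
	var once sync.Once
	var firstErr error

	for i, id := range workloadIDs {
		wg.Add(1)
		go func(i int, id string) {
			defer wg.Done()
			res, err := fetch(id)
			if err != nil {
				once.Do(func() { firstErr = err })
				return
			}
			results[i] = res
		}(i, id)
	}

	wg.Wait()
	if firstErr != nil {
		return nil, firstErr
	}
	return results, nil
}

func main() {
	results, _ := fetchAll([]string{"wabc123", "wdef456"})
	fmt.Println(results)
}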