Skip to content

Updates to the controller logic to better handle failures in etc updates #424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions config/crd/bases/mcad.ibm.com_appwrappers.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@

---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
Expand Down Expand Up @@ -778,6 +776,10 @@ spec:
QueueJob (by Informer)
format: date-time
type: string
controllerfirstdispatchtimestamp:
description: Microsecond level timestamp when controller first dispatches appwrapper
format: date-time
type: string
failed:
description: The number of resources which reached phase Failed.
format: int32
Expand All @@ -790,8 +792,7 @@ spec:
description: Is Dispatched?
type: boolean
local:
description: Indicate if message is a duplicate (for Informer to recognize
duplicate messages)
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
type: boolean
message:
type: string
Expand All @@ -812,15 +813,13 @@ spec:
format: int32
type: integer
queuejobstate:
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
...
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
type: string
running:
format: int32
type: integer
sender:
description: Indicate sender of this message (extremely useful for
debugging)
description: Indicate sender of this message (extremely useful for debugging)
type: string
state:
description: State - Pending, Running, Failed, Deleted
Expand All @@ -834,9 +833,22 @@ spec:
(is this different from the MinAvailable from JobStatus)
format: int32
type: integer
numberOfRequeueings:
description: Field to keep track of how many times a requeuing event has been triggered
format: int32
type: integer
default: 0
requeueingTimeInSeconds:
description: Field to keep track of total number of seconds spent in requeueing
format: int32
type: integer
default: 0
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}

28 changes: 20 additions & 8 deletions deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@

---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
Expand Down Expand Up @@ -778,6 +776,10 @@ spec:
QueueJob (by Informer)
format: date-time
type: string
controllerfirstdispatchtimestamp:
description: Microsecond level timestamp when controller first dispatches appwrapper
format: date-time
type: string
failed:
description: The number of resources which reached phase Failed.
format: int32
Expand All @@ -790,8 +792,7 @@ spec:
description: Is Dispatched?
type: boolean
local:
description: Indicate if message is a duplicate (for Informer to recognize
duplicate messages)
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
type: boolean
message:
type: string
Expand All @@ -812,15 +813,13 @@ spec:
format: int32
type: integer
queuejobstate:
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
...
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
type: string
running:
format: int32
type: integer
sender:
description: Indicate sender of this message (extremely useful for
debugging)
description: Indicate sender of this message (extremely useful for debugging)
type: string
state:
description: State - Pending, Running, Failed, Deleted
Expand All @@ -834,9 +833,22 @@ spec:
(is this different from the MinAvailable from JobStatus)
format: int32
type: integer
numberOfRequeueings:
description: Field to keep track of how many times a requeuing event has been triggered
format: int32
type: integer
default: 0
requeueingTimeInSeconds:
description: Field to keep track of total number of seconds spent in requeueing
format: int32
type: integer
default: 0
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}

2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.18
require (
github.com/eapache/go-resiliency v1.3.0
github.com/emicklei/go-restful v2.16.0+incompatible
github.com/gogo/protobuf v1.3.1
github.com/golang/protobuf v1.4.3
github.com/hashicorp/go-multierror v1.1.1
github.com/kubernetes-sigs/custom-metrics-apiserver v0.0.0-20210311094424-0ca2b1909cdc
Expand Down Expand Up @@ -45,7 +46,6 @@ require (
github.com/go-openapi/jsonreference v0.19.5 // indirect
github.com/go-openapi/spec v0.20.0 // indirect
github.com/go-openapi/swag v0.19.12 // indirect
github.com/gogo/protobuf v1.3.1 // indirect
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
github.com/google/go-cmp v0.5.5 // indirect
github.com/google/gofuzz v1.1.0 // indirect
Expand Down
26 changes: 9 additions & 17 deletions hack/run-e2e-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ export CLUSTER_CONTEXT="--name test"
export IMAGE_ECHOSERVER="kicbase/echo-server:1.0"
export IMAGE_UBUNTU_LATEST="ubuntu:latest"
export IMAGE_UBI_LATEST="registry.access.redhat.com/ubi8/ubi:latest"
export IMAGE_BUSY_BOX_LATEST="k8s.gcr.io/busybox:latest"
export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/e2e-kind-config.yaml"}
export KA_BIN=_output/bin
export WAIT_TIME="20s"
Expand Down Expand Up @@ -207,27 +208,20 @@ function kind-up-cluster {
exit 1
fi

docker pull ${IMAGE_ECHOSERVER}
if [ $? -ne 0 ]
then
echo "Failed to pull ${IMAGE_ECHOSERVER}"
exit 1
fi

docker pull ${IMAGE_UBUNTU_LATEST}
docker pull ${IMAGE_UBI_LATEST}
if [ $? -ne 0 ]
then
echo "Failed to pull ${IMAGE_UBUNTU_LATEST}"
echo "Failed to pull ${IMAGE_UBI_LATEST}"
exit 1
fi

docker pull ${IMAGE_UBI_LATEST}
docker pull ${IMAGE_BUSY_BOX_LATEST}
if [ $? -ne 0 ]
then
echo "Failed to pull ${IMAGE_UBI_LATEST}"
echo "Failed to pull ${IMAGE_BUSY_BOX_LATEST}"
exit 1
fi

if [[ "$MCAD_IMAGE_PULL_POLICY" = "Always" ]]
then
docker pull ${IMAGE_MCAD}
Expand All @@ -244,7 +238,7 @@ function kind-up-cluster {
fi
docker images

for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST}
for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST} ${IMAGE_BUSY_BOX_LATEST}
do
kind load docker-image ${image} ${CLUSTER_CONTEXT}
if [ $? -ne 0 ]
Expand Down Expand Up @@ -330,8 +324,6 @@ function mcad-quota-management-down {
echo "Failed to undeploy controller"
exit 1
fi
echo "Waiting for the test namespace to be cleaned up.."
sleep 60
}

function mcad-up {
Expand Down Expand Up @@ -402,4 +394,4 @@ setup-mcad-env
kuttl-tests
mcad-quota-management-down
mcad-up
go test ./test/e2e -v -timeout 120m -count=1
go test ./test/e2e -v -timeout 130m -count=1
17 changes: 12 additions & 5 deletions pkg/apis/controller/v1beta1/appwrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ type AppWrapperService struct {
}

// AppWrapperResource is App Wrapper aggregation resource
//todo: To be depricated
// todo: To be depricated
type AppWrapperResource struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata"`
Expand Down Expand Up @@ -246,7 +246,7 @@ type AppWrapperStatus struct {
// Microsecond level timestamp when controller first sees QueueJob (by Informer)
ControllerFirstTimestamp metav1.MicroTime `json:"controllerfirsttimestamp,omitempty"`

// Microsecond level timestamp when controller first sets appwrapper in state Running
// Microsecond level timestamp when controller first dispatches appwrapper
ControllerFirstDispatchTimestamp metav1.MicroTime `json:"controllerfirstdispatchtimestamp,omitempty"`

// Tell Informer to ignore this update message (do not generate a controller event)
Expand All @@ -264,18 +264,25 @@ type AppWrapperStatus struct {
// Represents the latest available observations of pods under appwrapper
PendingPodConditions []PendingPodSpec `json:"pendingpodconditions"`

//Resources consumed

// Represents the number of cpu consumed by all pods belonging to an appwrapper.
TotalCPU float64 `json:"totalcpu,omitempty"`

// Represents the amount of memory consumed by all pods belonging to an appwrapper.
TotalMemory float64 `json:"totalmemory,omitempty"`

// Represents the total number of GPUs consumed by all pods belonging to an appwrapper.
TotalGPU int64 `json:"totalgpu,omitempty"`

// Field to keep track of total number of seconds spent in requeueing
RequeueingTimeInSeconds int `json:"requeueingTimeInSeconds,omitempty"`

// Field to keep track of how many times a requeuing event has been triggered
NumberOfRequeueings int `json:"numberOfRequeueings,omitempty"`
}

type AppWrapperState string

//enqueued, active, deleting, succeeded, failed
// enqueued, active, deleting, succeeded, failed
const (
AppWrapperStateEnqueued AppWrapperState = "Pending"
AppWrapperStateActive AppWrapperState = "Running"
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/clusterstate/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ func (sc *ClusterStateCache) updateState() error {
}

func (sc *ClusterStateCache) deleteJob(job *api.JobInfo) {
klog.V(4).Infof("[deleteJob] Attempting to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
klog.V(10).Infof("[deleteJob] Attempting to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)

time.AfterFunc(5*time.Second, func() {
sc.deletedJobs.AddIfNotPresent(job)
Expand Down
20 changes: 10 additions & 10 deletions pkg/controller/metrics/adapter/adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,39 +32,40 @@ package adapter

import (
"flag"
"net/http"
"os"

"github.com/project-codeflare/multi-cluster-app-dispatcher/cmd/kar-controllers/app/options"
openapinamer "k8s.io/apiserver/pkg/endpoints/openapi"
genericapiserver "k8s.io/apiserver/pkg/server"
"net/http"
"os"

"github.com/emicklei/go-restful"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"
"k8s.io/klog/v2"

adapterprov "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/metrics/adapter/provider"
"github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/apiserver"
basecmd "github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/cmd"
"github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/provider"
generatedopenapi "github.com/kubernetes-sigs/custom-metrics-apiserver/test-adapter/generated/openapi"
adapterprov "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/metrics/adapter/provider"

clusterstatecache "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/cache"
)

// New returns a Cache implementation.
func New(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdpater {
func New(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdapter {
return newMetricsAdpater(serverOptions, config, clusterStateCache)
}

type MetricsAdpater struct {
type MetricsAdapter struct {
basecmd.AdapterBase

// Message is printed on succesful startup
Message string
}

func (a *MetricsAdpater) makeProviderOrDie(clusterStateCache clusterstatecache.Cache) (provider.MetricsProvider, *restful.WebService) {
func (a *MetricsAdapter) makeProviderOrDie(clusterStateCache clusterstatecache.Cache) (provider.MetricsProvider, *restful.WebService) {
klog.Infof("[makeProviderOrDie] Entered makeProviderOrDie()")
client, err := a.DynamicClient()
if err != nil {
Expand All @@ -79,7 +80,7 @@ func (a *MetricsAdpater) makeProviderOrDie(clusterStateCache clusterstatecache.C
return adapterprov.NewFakeProvider(client, mapper, clusterStateCache)
}

func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOption) []string{
func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOption) []string {
var portedArgs = make([]string, 0)
if serverOptions == nil {
return portedArgs
Expand All @@ -91,11 +92,10 @@ func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOpti
}
return portedArgs
}
func newMetricsAdpater(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdpater {
func newMetricsAdpater(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdapter {
klog.V(10).Infof("[newMetricsAdpater] Entered newMetricsAdpater()")

cmd := &MetricsAdpater{
}
cmd := &MetricsAdapter{}

cmd.OpenAPIConfig = genericapiserver.DefaultOpenAPIConfig(generatedopenapi.GetOpenAPIDefinitions, openapinamer.NewDefinitionNamer(apiserver.Scheme))
cmd.OpenAPIConfig.Info.Title = "MetricsAdpater"
Expand Down
54 changes: 0 additions & 54 deletions pkg/controller/queuejob/active_appwrapper.go

This file was deleted.

Loading