Skip to content

Updates to the controller logic to better handle failures in etcd updates #424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions config/crd/bases/mcad.ibm.com_appwrappers.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@

---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
Expand Down Expand Up @@ -778,6 +776,10 @@ spec:
QueueJob (by Informer)
format: date-time
type: string
controllerfirstdispatchtimestamp:
description: Microsecond level timestamp when controller first dispatches appwrapper
format: date-time
type: string
failed:
description: The number of resources which reached phase Failed.
format: int32
Expand All @@ -790,8 +792,7 @@ spec:
description: Is Dispatched?
type: boolean
local:
description: Indicate if message is a duplicate (for Informer to recognize
duplicate messages)
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
type: boolean
message:
type: string
Expand All @@ -812,15 +813,13 @@ spec:
format: int32
type: integer
queuejobstate:
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
...
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
type: string
running:
format: int32
type: integer
sender:
description: Indicate sender of this message (extremely useful for
debugging)
description: Indicate sender of this message (extremely useful for debugging)
type: string
state:
description: State - Pending, Running, Failed, Deleted
Expand All @@ -834,9 +833,22 @@ spec:
(is this different from the MinAvailable from JobStatus)
format: int32
type: integer
number-of-requeueings:
description: Field to keep track of how many times a requeuing event has been triggered
format: int32
type: integer
default: 0
requeueing-time-seconds:
description: Field to keep track of total number of seconds spent in requeueing
format: int32
type: integer
default: 0
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}

28 changes: 20 additions & 8 deletions deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@

---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
Expand Down Expand Up @@ -778,6 +776,10 @@ spec:
QueueJob (by Informer)
format: date-time
type: string
controllerfirstdispatchtimestamp:
description: Microsecond level timestamp when controller first dispatches appwrapper
format: date-time
type: string
failed:
description: The number of resources which reached phase Failed.
format: int32
Expand All @@ -790,8 +792,7 @@ spec:
description: Is Dispatched?
type: boolean
local:
description: Indicate if message is a duplicate (for Informer to recognize
duplicate messages)
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
type: boolean
message:
type: string
Expand All @@ -812,15 +813,13 @@ spec:
format: int32
type: integer
queuejobstate:
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
...
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
type: string
running:
format: int32
type: integer
sender:
description: Indicate sender of this message (extremely useful for
debugging)
description: Indicate sender of this message (extremely useful for debugging)
type: string
state:
description: State - Pending, Running, Failed, Deleted
Expand All @@ -834,9 +833,22 @@ spec:
(is this different from the MinAvailable from JobStatus)
format: int32
type: integer
number-of-requeueings:
description: Field to keep track of how many times a requeuing event has been triggered
format: int32
type: integer
default: 0
requeueing-time-seconds:
description: Field to keep track of total number of seconds spent in requeueing
format: int32
type: integer
default: 0
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}

26 changes: 9 additions & 17 deletions hack/run-e2e-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ export CLUSTER_CONTEXT="--name test"
export IMAGE_ECHOSERVER="kicbase/echo-server:1.0"
export IMAGE_UBUNTU_LATEST="ubuntu:latest"
export IMAGE_UBI_LATEST="registry.access.redhat.com/ubi8/ubi:latest"
export IMAGE_BUSY_BOX_LATEST="k8s.gcr.io/busybox:latest"
export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/e2e-kind-config.yaml"}
export KA_BIN=_output/bin
export WAIT_TIME="20s"
Expand Down Expand Up @@ -207,27 +208,20 @@ function kind-up-cluster {
exit 1
fi

docker pull ${IMAGE_ECHOSERVER}
if [ $? -ne 0 ]
then
echo "Failed to pull ${IMAGE_ECHOSERVER}"
exit 1
fi

docker pull ${IMAGE_UBUNTU_LATEST}
docker pull ${IMAGE_UBI_LATEST}
if [ $? -ne 0 ]
then
echo "Failed to pull ${IMAGE_UBUNTU_LATEST}"
echo "Failed to pull ${IMAGE_UBI_LATEST}"
exit 1
fi

docker pull ${IMAGE_UBI_LATEST}
docker pull ${IMAGE_BUSY_BOX_LATEST}
if [ $? -ne 0 ]
then
echo "Failed to pull ${IMAGE_UBI_LATEST}"
echo "Failed to pull ${IMAGE_BUSY_BOX_LATEST}"
exit 1
fi

if [[ "$MCAD_IMAGE_PULL_POLICY" = "Always" ]]
then
docker pull ${IMAGE_MCAD}
Expand All @@ -244,7 +238,7 @@ function kind-up-cluster {
fi
docker images

for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST}
for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST} ${IMAGE_BUSY_BOX_LATEST}
do
kind load docker-image ${image} ${CLUSTER_CONTEXT}
if [ $? -ne 0 ]
Expand Down Expand Up @@ -330,8 +324,6 @@ function mcad-quota-management-down {
echo "Failed to undeploy controller"
exit 1
fi
echo "Waiting for the test namespace to be cleaned up.."
sleep 60
}

function mcad-up {
Expand Down Expand Up @@ -402,4 +394,4 @@ setup-mcad-env
kuttl-tests
mcad-quota-management-down
mcad-up
go test ./test/e2e -v -timeout 120m -count=1
go test ./test/e2e -v -timeout 120m -count=1
8 changes: 6 additions & 2 deletions pkg/apis/controller/v1beta1/appwrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ type AppWrapperService struct {
}

// AppWrapperResource is App Wrapper aggregation resource
//todo: To be depricated
// todo: To be deprecated
type AppWrapperResource struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata"`
Expand Down Expand Up @@ -271,11 +271,15 @@ type AppWrapperStatus struct {
TotalMemory float64 `json:"totalmemory,omitempty"`

TotalGPU int64 `json:"totalgpu,omitempty"`

// Re-queueing state fields
RequeueingTimeInSeconds int `json:"requeueing-time-seconds,omitempty"`
NumberOfRequeueings int `json:"number-of-requeueings,omitempty"`
}

// AppWrapperState is a string-typed name for the state of an AppWrapper.
// Its values are the AppWrapperState* constants declared below
// (for example "Pending" and "Running").
type AppWrapperState string

//enqueued, active, deleting, succeeded, failed
// enqueued, active, deleting, succeeded, failed
const (
AppWrapperStateEnqueued AppWrapperState = "Pending"
AppWrapperStateActive AppWrapperState = "Running"
Expand Down
54 changes: 0 additions & 54 deletions pkg/controller/queuejob/active_appwrapper.go

This file was deleted.

43 changes: 0 additions & 43 deletions pkg/controller/queuejob/active_appwrapper_test.go

This file was deleted.

Loading