Skip to content

Commit 089cf9f

Browse files
z103cbasm582
andauthored
Updates to the controller logic to better handle failures in etc updates (#424)
* Updates to the controller logic (wip) * Added retry logic for etc errors. * Fix small race condition * Preemption code updates. * Small locking issue fixed. * Fixed failed test. * Removed focused test. * Bug fixes and CRD documentation updates. * Fixed misspelling * Fix bad merge * E2E test improvements * Removed test focus * Fixed a few bugs with preemption. Updated test messages. Minor cleanups in the e2e tests. * Testcase updates * Added more messages to the e2e tests. * Error handling and messages updates. Fixed warning in go.mod file * Addressed PR review comments Fixed failed test Log message improvements * Fixed failing test. * Fixed bugs in the manage queque job e2e test updates. * Fixed Merges issues Fixed Failing test * fix test * remove Fit * bump build time --------- Co-authored-by: Abhishek Malvankar <[email protected]>
1 parent 6d58335 commit 089cf9f

File tree

20 files changed

+1470
-955
lines changed

20 files changed

+1470
-955
lines changed

config/crd/bases/mcad.ibm.com_appwrappers.yaml

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
2-
---
31
apiVersion: apiextensions.k8s.io/v1
42
kind: CustomResourceDefinition
53
metadata:
@@ -778,6 +776,10 @@ spec:
778776
QueueJob (by Informer)
779777
format: date-time
780778
type: string
779+
controllerfirstdispatchtimestamp:
780+
description: Microsecond level timestamp when controller first dispatches appwrapper
781+
format: date-time
782+
type: string
781783
failed:
782784
description: The number of resources which reached phase Failed.
783785
format: int32
@@ -790,8 +792,7 @@ spec:
790792
description: Is Dispatched?
791793
type: boolean
792794
local:
793-
description: Indicate if message is a duplicate (for Informer to recognize
794-
duplicate messages)
795+
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
795796
type: boolean
796797
message:
797798
type: string
@@ -812,15 +813,13 @@ spec:
812813
format: int32
813814
type: integer
814815
queuejobstate:
815-
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
816-
...
816+
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
817817
type: string
818818
running:
819819
format: int32
820820
type: integer
821821
sender:
822-
description: Indicate sender of this message (extremely useful for
823-
debugging)
822+
description: Indicate sender of this message (extremely useful for debugging)
824823
type: string
825824
state:
826825
description: State - Pending, Running, Failed, Deleted
@@ -834,9 +833,22 @@ spec:
834833
(is this different from the MinAvailable from JobStatus)
835834
format: int32
836835
type: integer
836+
numberOfRequeueings:
837+
description: Field to keep track of how many times a requeuing event has been triggered
838+
format: int32
839+
type: integer
840+
default: 0
841+
requeueingTimeInSeconds:
842+
description: Field to keep track of total number of seconds spent in requeueing
843+
format: int32
844+
type: integer
845+
default: 0
837846
type: object
838847
required:
839848
- spec
840849
type: object
841850
served: true
842851
storage: true
852+
subresources:
853+
status: {}
854+

deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
2-
---
31
apiVersion: apiextensions.k8s.io/v1
42
kind: CustomResourceDefinition
53
metadata:
@@ -778,6 +776,10 @@ spec:
778776
QueueJob (by Informer)
779777
format: date-time
780778
type: string
779+
controllerfirstdispatchtimestamp:
780+
description: Microsecond level timestamp when controller first dispatches appwrapper
781+
format: date-time
782+
type: string
781783
failed:
782784
description: The number of resources which reached phase Failed.
783785
format: int32
@@ -790,8 +792,7 @@ spec:
790792
description: Is Dispatched?
791793
type: boolean
792794
local:
793-
description: Indicate if message is a duplicate (for Informer to recognize
794-
duplicate messages)
795+
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
795796
type: boolean
796797
message:
797798
type: string
@@ -812,15 +813,13 @@ spec:
812813
format: int32
813814
type: integer
814815
queuejobstate:
815-
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
816-
...
816+
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
817817
type: string
818818
running:
819819
format: int32
820820
type: integer
821821
sender:
822-
description: Indicate sender of this message (extremely useful for
823-
debugging)
822+
description: Indicate sender of this message (extremely useful for debugging)
824823
type: string
825824
state:
826825
description: State - Pending, Running, Failed, Deleted
@@ -834,9 +833,22 @@ spec:
834833
(is this different from the MinAvailable from JobStatus)
835834
format: int32
836835
type: integer
836+
numberOfRequeueings:
837+
description: Field to keep track of how many times a requeuing event has been triggered
838+
format: int32
839+
type: integer
840+
default: 0
841+
requeueingTimeInSeconds:
842+
description: Field to keep track of total number of seconds spent in requeueing
843+
format: int32
844+
type: integer
845+
default: 0
837846
type: object
838847
required:
839848
- spec
840849
type: object
841850
served: true
842851
storage: true
852+
subresources:
853+
status: {}
854+

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ go 1.18
55
require (
66
github.com/eapache/go-resiliency v1.3.0
77
github.com/emicklei/go-restful v2.16.0+incompatible
8+
github.com/gogo/protobuf v1.3.1
89
github.com/golang/protobuf v1.4.3
910
github.com/hashicorp/go-multierror v1.1.1
1011
github.com/kubernetes-sigs/custom-metrics-apiserver v0.0.0-20210311094424-0ca2b1909cdc
@@ -45,7 +46,6 @@ require (
4546
github.com/go-openapi/jsonreference v0.19.5 // indirect
4647
github.com/go-openapi/spec v0.20.0 // indirect
4748
github.com/go-openapi/swag v0.19.12 // indirect
48-
github.com/gogo/protobuf v1.3.1 // indirect
4949
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
5050
github.com/google/go-cmp v0.5.5 // indirect
5151
github.com/google/gofuzz v1.1.0 // indirect

hack/run-e2e-kind.sh

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ export CLUSTER_CONTEXT="--name test"
3434
export IMAGE_ECHOSERVER="kicbase/echo-server:1.0"
3535
export IMAGE_UBUNTU_LATEST="ubuntu:latest"
3636
export IMAGE_UBI_LATEST="registry.access.redhat.com/ubi8/ubi:latest"
37+
export IMAGE_BUSY_BOX_LATEST="k8s.gcr.io/busybox:latest"
3738
export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/e2e-kind-config.yaml"}
3839
export KA_BIN=_output/bin
3940
export WAIT_TIME="20s"
@@ -207,27 +208,20 @@ function kind-up-cluster {
207208
exit 1
208209
fi
209210

210-
docker pull ${IMAGE_ECHOSERVER}
211-
if [ $? -ne 0 ]
212-
then
213-
echo "Failed to pull ${IMAGE_ECHOSERVER}"
214-
exit 1
215-
fi
216-
217-
docker pull ${IMAGE_UBUNTU_LATEST}
211+
docker pull ${IMAGE_UBI_LATEST}
218212
if [ $? -ne 0 ]
219213
then
220-
echo "Failed to pull ${IMAGE_UBUNTU_LATEST}"
214+
echo "Failed to pull ${IMAGE_UBI_LATEST}"
221215
exit 1
222216
fi
223-
224-
docker pull ${IMAGE_UBI_LATEST}
217+
218+
docker pull ${IMAGE_BUSY_BOX_LATEST}
225219
if [ $? -ne 0 ]
226220
then
227-
echo "Failed to pull ${IMAGE_UBI_LATEST}"
221+
echo "Failed to pull ${IMAGE_BUSY_BOX_LATEST}"
228222
exit 1
229223
fi
230-
224+
231225
if [[ "$MCAD_IMAGE_PULL_POLICY" = "Always" ]]
232226
then
233227
docker pull ${IMAGE_MCAD}
@@ -244,7 +238,7 @@ function kind-up-cluster {
244238
fi
245239
docker images
246240

247-
for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST}
241+
for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST} ${IMAGE_BUSY_BOX_LATEST}
248242
do
249243
kind load docker-image ${image} ${CLUSTER_CONTEXT}
250244
if [ $? -ne 0 ]
@@ -330,8 +324,6 @@ function mcad-quota-management-down {
330324
echo "Failed to undeploy controller"
331325
exit 1
332326
fi
333-
echo "Waiting for the test namespace to be cleaned up.."
334-
sleep 60
335327
}
336328

337329
function mcad-up {
@@ -402,4 +394,4 @@ setup-mcad-env
402394
kuttl-tests
403395
mcad-quota-management-down
404396
mcad-up
405-
go test ./test/e2e -v -timeout 120m -count=1
397+
go test ./test/e2e -v -timeout 130m -count=1

pkg/apis/controller/v1beta1/appwrapper.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ type AppWrapperService struct {
101101
}
102102

103103
// AppWrapperResource is App Wrapper aggregation resource
104-
//todo: To be depricated
104+
// todo: To be depricated
105105
type AppWrapperResource struct {
106106
metav1.TypeMeta `json:",inline"`
107107
metav1.ObjectMeta `json:"metadata"`
@@ -246,7 +246,7 @@ type AppWrapperStatus struct {
246246
// Microsecond level timestamp when controller first sees QueueJob (by Informer)
247247
ControllerFirstTimestamp metav1.MicroTime `json:"controllerfirsttimestamp,omitempty"`
248248

249-
// Microsecond level timestamp when controller first sets appwrapper in state Running
249+
// Microsecond level timestamp when controller first dispatches appwrapper
250250
ControllerFirstDispatchTimestamp metav1.MicroTime `json:"controllerfirstdispatchtimestamp,omitempty"`
251251

252252
// Tell Informer to ignore this update message (do not generate a controller event)
@@ -264,18 +264,25 @@ type AppWrapperStatus struct {
264264
// Represents the latest available observations of pods under appwrapper
265265
PendingPodConditions []PendingPodSpec `json:"pendingpodconditions"`
266266

267-
//Resources consumed
268-
267+
// Represents the number of cpu consumed by all pods belonging to an appwrapper.
269268
TotalCPU float64 `json:"totalcpu,omitempty"`
270269

270+
// Represents the amount of memory consumed by all pods belonging to an appwrapper.
271271
TotalMemory float64 `json:"totalmemory,omitempty"`
272272

273+
// Represents the total number of GPUs consumed by all pods belonging to an appwrapper.
273274
TotalGPU int64 `json:"totalgpu,omitempty"`
275+
276+
// Field to keep track of total number of seconds spent in requeueing
277+
RequeueingTimeInSeconds int `json:"requeueingTimeInSeconds,omitempty"`
278+
279+
// Field to keep track of how many times a requeuing event has been triggered
280+
NumberOfRequeueings int `json:"numberOfRequeueings,omitempty"`
274281
}
275282

276283
type AppWrapperState string
277284

278-
//enqueued, active, deleting, succeeded, failed
285+
// enqueued, active, deleting, succeeded, failed
279286
const (
280287
AppWrapperStateEnqueued AppWrapperState = "Pending"
281288
AppWrapperStateActive AppWrapperState = "Running"

pkg/controller/clusterstate/cache/cache.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ func (sc *ClusterStateCache) updateState() error {
334334
}
335335

336336
func (sc *ClusterStateCache) deleteJob(job *api.JobInfo) {
337-
klog.V(4).Infof("[deleteJob] Attempting to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
337+
klog.V(10).Infof("[deleteJob] Attempting to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
338338

339339
time.AfterFunc(5*time.Second, func() {
340340
sc.deletedJobs.AddIfNotPresent(job)

pkg/controller/metrics/adapter/adapter.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,39 +32,40 @@ package adapter
3232

3333
import (
3434
"flag"
35+
"net/http"
36+
"os"
37+
3538
"github.com/project-codeflare/multi-cluster-app-dispatcher/cmd/kar-controllers/app/options"
3639
openapinamer "k8s.io/apiserver/pkg/endpoints/openapi"
3740
genericapiserver "k8s.io/apiserver/pkg/server"
38-
"net/http"
39-
"os"
4041

4142
"github.com/emicklei/go-restful"
4243
"k8s.io/apimachinery/pkg/util/wait"
4344
"k8s.io/client-go/rest"
4445
"k8s.io/klog/v2"
4546

46-
adapterprov "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/metrics/adapter/provider"
4747
"github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/apiserver"
4848
basecmd "github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/cmd"
4949
"github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/provider"
5050
generatedopenapi "github.com/kubernetes-sigs/custom-metrics-apiserver/test-adapter/generated/openapi"
51+
adapterprov "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/metrics/adapter/provider"
5152

5253
clusterstatecache "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/cache"
5354
)
5455

5556
// New returns a Cache implementation.
56-
func New(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdpater {
57+
func New(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdapter {
5758
return newMetricsAdpater(serverOptions, config, clusterStateCache)
5859
}
5960

60-
type MetricsAdpater struct {
61+
type MetricsAdapter struct {
6162
basecmd.AdapterBase
6263

6364
// Message is printed on succesful startup
6465
Message string
6566
}
6667

67-
func (a *MetricsAdpater) makeProviderOrDie(clusterStateCache clusterstatecache.Cache) (provider.MetricsProvider, *restful.WebService) {
68+
func (a *MetricsAdapter) makeProviderOrDie(clusterStateCache clusterstatecache.Cache) (provider.MetricsProvider, *restful.WebService) {
6869
klog.Infof("[makeProviderOrDie] Entered makeProviderOrDie()")
6970
client, err := a.DynamicClient()
7071
if err != nil {
@@ -79,7 +80,7 @@ func (a *MetricsAdpater) makeProviderOrDie(clusterStateCache clusterstatecache.C
7980
return adapterprov.NewFakeProvider(client, mapper, clusterStateCache)
8081
}
8182

82-
func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOption) []string{
83+
func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOption) []string {
8384
var portedArgs = make([]string, 0)
8485
if serverOptions == nil {
8586
return portedArgs
@@ -91,11 +92,10 @@ func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOpti
9192
}
9293
return portedArgs
9394
}
94-
func newMetricsAdpater(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdpater {
95+
func newMetricsAdpater(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdapter {
9596
klog.V(10).Infof("[newMetricsAdpater] Entered newMetricsAdpater()")
9697

97-
cmd := &MetricsAdpater{
98-
}
98+
cmd := &MetricsAdapter{}
9999

100100
cmd.OpenAPIConfig = genericapiserver.DefaultOpenAPIConfig(generatedopenapi.GetOpenAPIDefinitions, openapinamer.NewDefinitionNamer(apiserver.Scheme))
101101
cmd.OpenAPIConfig.Info.Title = "MetricsAdpater"

pkg/controller/queuejob/active_appwrapper.go

Lines changed: 0 additions & 54 deletions
This file was deleted.

0 commit comments

Comments
 (0)