From af33c53f0b6380f000bfa8300fa2405c1fd49b30 Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Thu, 1 Oct 2020 13:32:09 -0400 Subject: [PATCH] pkg/cvo/status: Raise Operator leveling grace-period to 40 minutes Similar to #422, further tune things up so that we can ensure that our 90th percentile of clusters do not trip over momentary cluster upgrade failures whenever operators take longer than 20 minutes to roll out. --- docs/user/status.md | 2 +- pkg/cvo/status.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user/status.md b/docs/user/status.md index 4b87b1603..72412cdec 100644 --- a/docs/user/status.md +++ b/docs/user/status.md @@ -22,7 +22,7 @@ If this happens it is a CVO coding error, because clearing [`desiredUpdate`][api `ClusterOperatorNotAvailable` (or the consolidated `ClusterOperatorsNotAvailable`) is set when the CVO fails to retrieve the ClusterOperator from the cluster or when the retrieved ClusterOperator does not satisfy [the reconciliation conditions](reconciliation.md#clusteroperator). Unlike most manifest-reconciliation failures, this error does not immediately result in `Failing=True`. -Under some conditions during installs and updates, the CVO will treat this condition as a `Progressing=True` condition and give the operator up to twenty minutes to level before reporting `Failing=True`. +Under some conditions during installs and updates, the CVO will treat this condition as a `Progressing=True` condition and give the operator up to forty minutes to level before reporting `Failing=True`. ## RetrievedUpdates diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index c15259a5d..039f63b02 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -344,13 +344,13 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1 // convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as // still making internal progress. 
The general error we try to suppress is an operator or operators still being -// unavailable AND the general payload task making progress towards its goal. An operator is given 20 minutes since +// unavailable AND the general payload task making progress towards its goal. An operator is given 40 minutes since // its last update to go ready, or an hour has elapsed since the update began, before the condition is ignored. func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, status *SyncWorkerStatus) (reason string, message string, ok bool) { if len(history) == 0 || status.Failure == nil || status.Reconciling || status.LastProgress.IsZero() { return "", "", false } - if now.Sub(status.LastProgress) > 20*time.Minute || now.Sub(history[0].StartedTime.Time) > time.Hour { + if now.Sub(status.LastProgress) > 40*time.Minute || now.Sub(history[0].StartedTime.Time) > time.Hour { return "", "", false } uErr, ok := status.Failure.(*payload.UpdateError)