Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit 959722c

Browse files
authored
Treat invalid Pod caused by network error as PodCreationUnknownError (#61)
1 parent a220bd3 commit 959722c

File tree

6 files changed

+125
-50
lines changed

6 files changed

+125
-50
lines changed

pkg/apis/frameworkcontroller/v1/completion.go

Lines changed: 82 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,11 @@ import (
2626
"fmt"
2727
"github.com/microsoft/frameworkcontroller/pkg/common"
2828
core "k8s.io/api/core/v1"
29+
apiErrors "k8s.io/apimachinery/pkg/api/errors"
30+
"k8s.io/apimachinery/pkg/util/net"
2931
"reflect"
3032
"regexp"
33+
"strings"
3134
"time"
3235
)
3336

@@ -63,17 +66,19 @@ const (
6366

6467
// [-999, -1]: Predefined Framework Error
6568
// -1XX: Transient Error
66-
CompletionCodeConfigMapExternalDeleted CompletionCode = -100
67-
CompletionCodePodExternalDeleted CompletionCode = -101
68-
CompletionCodeConfigMapCreationTimeout CompletionCode = -110
69-
CompletionCodePodCreationTimeout CompletionCode = -111
69+
CompletionCodeConfigMapExternalDeleted CompletionCode = -100
70+
CompletionCodePodExternalDeleted CompletionCode = -101
71+
CompletionCodeConfigMapLocalCacheCreationTimeout CompletionCode = -110
72+
CompletionCodePodLocalCacheCreationTimeout CompletionCode = -111
73+
CompletionCodePodCreationTransientError CompletionCode = -120
7074
// -2XX: Permanent Error
71-
CompletionCodePodSpecPermanentError CompletionCode = -200
75+
CompletionCodePodCreationPermanentError CompletionCode = -200
7276
CompletionCodeStopFrameworkRequested CompletionCode = -210
7377
CompletionCodeFrameworkAttemptCompletion CompletionCode = -220
7478
CompletionCodeDeleteTaskRequested CompletionCode = -230
7579
// -3XX: Unknown Error
7680
CompletionCodePodFailedWithoutFailedContainer CompletionCode = -300
81+
CompletionCodePodCreationUnknownError CompletionCode = -310
7782
)
7883

7984
var completionCodeInfoList = []*CompletionCodeInfo{}
@@ -152,20 +157,28 @@ func initCompletionCodeInfos() {
152157
[]CompletionTypeAttribute{CompletionTypeAttributeTransient}},
153158
},
154159
{
155-
Code: CompletionCodeConfigMapCreationTimeout.Ptr(),
156-
Phrase: "ConfigMapCreationTimeout",
160+
Code: CompletionCodeConfigMapLocalCacheCreationTimeout.Ptr(),
161+
Phrase: "ConfigMapLocalCacheCreationTimeout",
157162
Type: CompletionType{CompletionTypeNameFailed,
158163
[]CompletionTypeAttribute{CompletionTypeAttributeTransient}},
159164
},
160165
{
161-
Code: CompletionCodePodCreationTimeout.Ptr(),
162-
Phrase: "PodCreationTimeout",
166+
Code: CompletionCodePodLocalCacheCreationTimeout.Ptr(),
167+
Phrase: "PodLocalCacheCreationTimeout",
163168
Type: CompletionType{CompletionTypeNameFailed,
164169
[]CompletionTypeAttribute{CompletionTypeAttributeTransient}},
165170
},
166171
{
167-
Code: CompletionCodePodSpecPermanentError.Ptr(),
168-
Phrase: "PodSpecPermanentError",
172+
// Only used to distinguish with others, and will never be used to complete
173+
// a TaskAttempt.
174+
Code: CompletionCodePodCreationTransientError.Ptr(),
175+
Phrase: "PodCreationTransientError",
176+
Type: CompletionType{CompletionTypeNameFailed,
177+
[]CompletionTypeAttribute{CompletionTypeAttributeTransient}},
178+
},
179+
{
180+
Code: CompletionCodePodCreationPermanentError.Ptr(),
181+
Phrase: "PodCreationPermanentError",
169182
Type: CompletionType{CompletionTypeNameFailed,
170183
[]CompletionTypeAttribute{CompletionTypeAttributePermanent}},
171184
},
@@ -193,6 +206,12 @@ func initCompletionCodeInfos() {
193206
Type: CompletionType{CompletionTypeNameFailed,
194207
[]CompletionTypeAttribute{}},
195208
},
209+
{
210+
Code: CompletionCodePodCreationUnknownError.Ptr(),
211+
Phrase: "PodCreationUnknownError",
212+
Type: CompletionType{CompletionTypeNameFailed,
213+
[]CompletionTypeAttribute{}},
214+
},
196215
})
197216
}
198217

@@ -238,6 +257,9 @@ type MatchedContainer struct {
238257
}
239258

240259
// Match ANY CompletionCodeInfo
260+
// The returned CompletionCode may not within CompletionCodeInfos, such as for
261+
// the ContainerUnrecognizedFailed, so it should not be used to
262+
// NewTaskAttemptCompletionStatus or NewFrameworkAttemptCompletionStatus later.
241263
func MatchCompletionCodeInfos(pod *core.Pod) PodMatchResult {
242264
for _, codeInfo := range completionCodeInfoList {
243265
for _, podPattern := range codeInfo.PodPatterns {
@@ -404,6 +426,55 @@ func generatePodUnmatchedResult(pod *core.Pod) PodMatchResult {
404426
}
405427
}
406428

429+
// The returned CompletionCode must be within CompletionCodeInfos.
430+
func ClassifyPodCreationError(apiErr error) PodMatchResult {
431+
diag := fmt.Sprintf("Failed to create Pod: %v", common.ToJson(apiErr))
432+
433+
// Treat Platform Error as Transient Error, such as Pod decoding error.
434+
if strings.Contains(apiErr.Error(), "object provided is unrecognized") ||
435+
strings.Contains(apiErr.Error(), "exceeded quota") {
436+
return PodMatchResult{
437+
CodeInfo: completionCodeInfoMap[CompletionCodePodCreationTransientError],
438+
Diagnostics: diag,
439+
}
440+
}
441+
442+
// Treat General Framework Error as Unknown Error for safety.
443+
if apiErrors.IsBadRequest(apiErr) ||
444+
apiErrors.IsForbidden(apiErr) {
445+
return PodMatchResult{
446+
CodeInfo: completionCodeInfoMap[CompletionCodePodCreationUnknownError],
447+
Diagnostics: diag,
448+
}
449+
}
450+
451+
// Treat Permanent Framework Error as Permanent Error only if it must be
452+
// Permanent Error.
453+
if apiErrors.IsInvalid(apiErr) ||
454+
apiErrors.IsRequestEntityTooLargeError(apiErr) {
455+
// TODO: Also check net.IsConnectionRefused
456+
if net.IsConnectionReset(apiErr) || net.IsProbableEOF(apiErr) {
457+
// The ApiServer Permanent Error may be caused by Network Transient Error,
458+
// so treat it as Unknown Error for safety.
459+
return PodMatchResult{
460+
CodeInfo: completionCodeInfoMap[CompletionCodePodCreationUnknownError],
461+
Diagnostics: diag,
462+
}
463+
} else {
464+
return PodMatchResult{
465+
CodeInfo: completionCodeInfoMap[CompletionCodePodCreationPermanentError],
466+
Diagnostics: diag,
467+
}
468+
}
469+
}
470+
471+
// Treat all other errors as Transient Error, including all non-APIStatus errors.
472+
return PodMatchResult{
473+
CodeInfo: completionCodeInfoMap[CompletionCodePodCreationTransientError],
474+
Diagnostics: diag,
475+
}
476+
}
477+
407478
///////////////////////////////////////////////////////////////////////////////////////
408479
// Completion Utils
409480
///////////////////////////////////////////////////////////////////////////////////////

pkg/apis/frameworkcontroller/v1/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ type Config struct {
6060
// Generally, it should be proportional to the cluster Framework workload, and within the ApiServer
6161
// serving capacity/limit such as the --max-mutating-requests-inflight.
6262
KubeClientQps *float32 `yaml:"kubeClientQps"`
63-
KubeClientBurst *int32 `yaml:"kubeClientBurst"`
63+
KubeClientBurst *int32 `yaml:"kubeClientBurst"`
6464

6565
// Number of concurrent workers to process each different Frameworks.
6666
// Generally, it should be proportional to the above rate limits of requests.

pkg/apis/frameworkcontroller/v1/funcs.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -195,15 +195,17 @@ func NewCompletedTaskTriggeredCompletionStatus(
195195
"conditions in FrameworkAttemptCompletionPolicy have ever been triggered",
196196
completedTaskCount, totalTaskCount)
197197
if triggerTaskStatus == nil {
198-
return CompletionCodeSucceeded.NewFrameworkAttemptCompletionStatus(diag, nil)
198+
return CompletionCodeSucceeded.
199+
NewFrameworkAttemptCompletionStatus(diag, nil)
199200
} else {
200-
return CompletionCodeSucceeded.NewFrameworkAttemptCompletionStatus(diag,
201-
&CompletionPolicyTriggerStatus{
202-
Message: diag,
203-
TaskRoleName: triggerTaskRoleName,
204-
TaskIndex: triggerTaskStatus.Index,
205-
},
206-
)
201+
return CompletionCodeSucceeded.
202+
NewFrameworkAttemptCompletionStatus(diag,
203+
&CompletionPolicyTriggerStatus{
204+
Message: diag,
205+
TaskRoleName: triggerTaskRoleName,
206+
TaskIndex: triggerTaskStatus.Index,
207+
},
208+
)
207209
}
208210
}
209211

pkg/common/utils.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"encoding/json"
2929
"flag"
3030
"fmt"
31+
errorWrap "github.com/pkg/errors"
3132
"gopkg.in/yaml.v2"
3233
"io/ioutil"
3334
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -287,3 +288,12 @@ func Decompress(compressedBytes []byte) (string, error) {
287288
}
288289
}
289290
}
291+
292+
func GetErrorCause(err error) error {
293+
causeErr := errorWrap.Cause(err)
294+
if causeErr == nil {
295+
return err
296+
} else {
297+
return causeErr
298+
}
299+
}

pkg/controller/controller.go

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,7 +1053,7 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) {
10531053
"ConfigMap does not appear in the local cache within timeout %v, "+
10541054
"so consider it was deleted and explicitly delete it",
10551055
common.SecToDuration(c.cConfig.ObjectLocalCacheCreationTimeoutSec))
1056-
code = ci.CompletionCodeConfigMapCreationTimeout
1056+
code = ci.CompletionCodeConfigMapLocalCacheCreationTimeout
10571057
klog.Warning(logPfx + diag)
10581058
}
10591059

@@ -1670,7 +1670,7 @@ func (c *FrameworkController) syncTaskState(
16701670
"Pod does not appear in the local cache within timeout %v, "+
16711671
"so consider it was deleted and explicitly delete it",
16721672
common.SecToDuration(c.cConfig.ObjectLocalCacheCreationTimeoutSec))
1673-
code = ci.CompletionCodePodCreationTimeout
1673+
code = ci.CompletionCodePodLocalCacheCreationTimeout
16741674
klog.Warning(logPfx + diag)
16751675
}
16761676

@@ -1752,8 +1752,9 @@ func (c *FrameworkController) syncTaskState(
17521752
diag := fmt.Sprintf("Pod succeeded")
17531753
klog.Info(logPfx + diag)
17541754
c.completeTaskAttempt(f, taskRoleName, taskIndex, false,
1755-
ci.CompletionCodeSucceeded.NewTaskAttemptCompletionStatus(
1756-
diag, ci.ExtractPodCompletionStatus(pod)))
1755+
ci.CompletionCodeSucceeded.
1756+
NewTaskAttemptCompletionStatus(
1757+
diag, ci.ExtractPodCompletionStatus(pod)))
17571758
return nil
17581759
} else if pod.Status.Phase == core.PodFailed {
17591760
result := ci.MatchCompletionCodeInfos(pod)
@@ -1910,26 +1911,26 @@ func (c *FrameworkController) syncTaskState(
19101911
// createTaskAttempt
19111912
pod, err = c.createPod(f, cm, taskRoleName, taskIndex)
19121913
if err != nil {
1913-
apiErr := errorWrap.Cause(err)
1914-
if internal.IsPodSpecPermanentError(apiErr) {
1915-
// Should be Framework Error instead of Platform Transient Error.
1916-
diag := fmt.Sprintf("Failed to create Pod: %v", common.ToJson(apiErr))
1917-
klog.Info(logPfx + diag)
1918-
1919-
// Ensure pod is deleted in remote to avoid managed pod leak after
1920-
// TaskAttemptCompleted.
1921-
_, err = c.getOrCleanupPod(f, cm, taskRoleName, taskIndex, true)
1922-
if err != nil {
1923-
return err
1924-
}
1914+
apiErr := common.GetErrorCause(err)
1915+
result := ci.ClassifyPodCreationError(apiErr)
1916+
if *result.CodeInfo.Code == ci.CompletionCodePodCreationTransientError {
1917+
// Do not complete the TaskAttempt, as generally, user does not need to
1918+
// aware the Transient Error during Pod creation.
1919+
return err
1920+
}
19251921

1926-
c.completeTaskAttempt(f, taskRoleName, taskIndex, true,
1927-
ci.CompletionCodePodSpecPermanentError.
1928-
NewTaskAttemptCompletionStatus(diag, nil))
1929-
return nil
1930-
} else {
1922+
klog.Info(logPfx + result.Diagnostics)
1923+
// Ensure pod is deleted in remote to avoid managed pod leak after
1924+
// TaskAttemptCompleted.
1925+
_, err = c.getOrCleanupPod(f, cm, taskRoleName, taskIndex, true)
1926+
if err != nil {
19311927
return err
19321928
}
1929+
1930+
c.completeTaskAttempt(f, taskRoleName, taskIndex, true,
1931+
result.CodeInfo.Code.
1932+
NewTaskAttemptCompletionStatus(result.Diagnostics, nil))
1933+
return nil
19331934
}
19341935

19351936
taskStatus.AttemptStatus.PodUID = &pod.UID

pkg/internal/utils.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ import (
3838
"k8s.io/client-go/tools/cache"
3939
"k8s.io/klog"
4040
"reflect"
41-
"strings"
4241
"time"
4342
)
4443

@@ -222,11 +221,3 @@ func GetPodDeletionStartTime(pod *core.Pod) *meta.Time {
222221
}
223222
return common.PtrTime(meta.NewTime(pod.DeletionTimestamp.Add(-gracePeriod)))
224223
}
225-
226-
func IsPodSpecPermanentError(apiErr error) bool {
227-
return apiErrors.IsBadRequest(apiErr) ||
228-
apiErrors.IsInvalid(apiErr) ||
229-
apiErrors.IsRequestEntityTooLargeError(apiErr) ||
230-
(apiErrors.IsForbidden(apiErr) &&
231-
!strings.Contains(apiErr.Error(), "exceeded quota"))
232-
}

0 commit comments

Comments
 (0)