@@ -26,8 +26,11 @@ import (
26
26
"fmt"
27
27
"github.com/microsoft/frameworkcontroller/pkg/common"
28
28
core "k8s.io/api/core/v1"
29
+ apiErrors "k8s.io/apimachinery/pkg/api/errors"
30
+ "k8s.io/apimachinery/pkg/util/net"
29
31
"reflect"
30
32
"regexp"
33
+ "strings"
31
34
"time"
32
35
)
33
36
@@ -63,17 +66,19 @@ const (
63
66
64
67
// [-999, -1]: Predefined Framework Error
65
68
// -1XX: Transient Error
66
- CompletionCodeConfigMapExternalDeleted CompletionCode = - 100
67
- CompletionCodePodExternalDeleted CompletionCode = - 101
68
- CompletionCodeConfigMapCreationTimeout CompletionCode = - 110
69
- CompletionCodePodCreationTimeout CompletionCode = - 111
69
+ CompletionCodeConfigMapExternalDeleted CompletionCode = - 100
70
+ CompletionCodePodExternalDeleted CompletionCode = - 101
71
+ CompletionCodeConfigMapLocalCacheCreationTimeout CompletionCode = - 110
72
+ CompletionCodePodLocalCacheCreationTimeout CompletionCode = - 111
73
+ CompletionCodePodCreationTransientError CompletionCode = - 120
70
74
// -2XX: Permanent Error
71
- CompletionCodePodSpecPermanentError CompletionCode = - 200
75
+ CompletionCodePodCreationPermanentError CompletionCode = - 200
72
76
CompletionCodeStopFrameworkRequested CompletionCode = - 210
73
77
CompletionCodeFrameworkAttemptCompletion CompletionCode = - 220
74
78
CompletionCodeDeleteTaskRequested CompletionCode = - 230
75
79
// -3XX: Unknown Error
76
80
CompletionCodePodFailedWithoutFailedContainer CompletionCode = - 300
81
+ CompletionCodePodCreationUnknownError CompletionCode = - 310
77
82
)
78
83
79
84
var completionCodeInfoList = []* CompletionCodeInfo {}
@@ -152,20 +157,28 @@ func initCompletionCodeInfos() {
152
157
[]CompletionTypeAttribute {CompletionTypeAttributeTransient }},
153
158
},
154
159
{
155
- Code : CompletionCodeConfigMapCreationTimeout .Ptr (),
156
- Phrase : "ConfigMapCreationTimeout " ,
160
+ Code : CompletionCodeConfigMapLocalCacheCreationTimeout .Ptr (),
161
+ Phrase : "ConfigMapLocalCacheCreationTimeout " ,
157
162
Type : CompletionType {CompletionTypeNameFailed ,
158
163
[]CompletionTypeAttribute {CompletionTypeAttributeTransient }},
159
164
},
160
165
{
161
- Code : CompletionCodePodCreationTimeout .Ptr (),
162
- Phrase : "PodCreationTimeout " ,
166
+ Code : CompletionCodePodLocalCacheCreationTimeout .Ptr (),
167
+ Phrase : "PodLocalCacheCreationTimeout " ,
163
168
Type : CompletionType {CompletionTypeNameFailed ,
164
169
[]CompletionTypeAttribute {CompletionTypeAttributeTransient }},
165
170
},
166
171
{
167
- Code : CompletionCodePodSpecPermanentError .Ptr (),
168
- Phrase : "PodSpecPermanentError" ,
172
+ // Only used to distinguish it from others; it will never be used to complete
173
+ // a TaskAttempt.
174
+ Code : CompletionCodePodCreationTransientError .Ptr (),
175
+ Phrase : "PodCreationTransientError" ,
176
+ Type : CompletionType {CompletionTypeNameFailed ,
177
+ []CompletionTypeAttribute {CompletionTypeAttributeTransient }},
178
+ },
179
+ {
180
+ Code : CompletionCodePodCreationPermanentError .Ptr (),
181
+ Phrase : "PodCreationPermanentError" ,
169
182
Type : CompletionType {CompletionTypeNameFailed ,
170
183
[]CompletionTypeAttribute {CompletionTypeAttributePermanent }},
171
184
},
@@ -193,6 +206,12 @@ func initCompletionCodeInfos() {
193
206
Type : CompletionType {CompletionTypeNameFailed ,
194
207
[]CompletionTypeAttribute {}},
195
208
},
209
+ {
210
+ Code : CompletionCodePodCreationUnknownError .Ptr (),
211
+ Phrase : "PodCreationUnknownError" ,
212
+ Type : CompletionType {CompletionTypeNameFailed ,
213
+ []CompletionTypeAttribute {}},
214
+ },
196
215
})
197
216
}
198
217
@@ -238,6 +257,9 @@ type MatchedContainer struct {
238
257
}
239
258
240
259
// Match ANY CompletionCodeInfo
260
+ // The returned CompletionCode may not be within CompletionCodeInfos, such as for
261
+ // ContainerUnrecognizedFailed, so it should not be passed to
262
+ // NewTaskAttemptCompletionStatus or NewFrameworkAttemptCompletionStatus later.
241
263
func MatchCompletionCodeInfos (pod * core.Pod ) PodMatchResult {
242
264
for _ , codeInfo := range completionCodeInfoList {
243
265
for _ , podPattern := range codeInfo .PodPatterns {
@@ -404,6 +426,55 @@ func generatePodUnmatchedResult(pod *core.Pod) PodMatchResult {
404
426
}
405
427
}
406
428
429
+ // The returned CompletionCode must be within CompletionCodeInfos.
430
+ func ClassifyPodCreationError (apiErr error ) PodMatchResult {
431
+ diag := fmt .Sprintf ("Failed to create Pod: %v" , common .ToJson (apiErr ))
432
+
433
+ // Treat Platform Error as Transient Error, such as Pod decoding error.
434
+ if strings .Contains (apiErr .Error (), "object provided is unrecognized" ) ||
435
+ strings .Contains (apiErr .Error (), "exceeded quota" ) {
436
+ return PodMatchResult {
437
+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationTransientError ],
438
+ Diagnostics : diag ,
439
+ }
440
+ }
441
+
442
+ // Treat General Framework Error as Unknown Error for safety.
443
+ if apiErrors .IsBadRequest (apiErr ) ||
444
+ apiErrors .IsForbidden (apiErr ) {
445
+ return PodMatchResult {
446
+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationUnknownError ],
447
+ Diagnostics : diag ,
448
+ }
449
+ }
450
+
451
+ // Treat Permanent Framework Error as Permanent Error only if it must be
452
+ // Permanent Error.
453
+ if apiErrors .IsInvalid (apiErr ) ||
454
+ apiErrors .IsRequestEntityTooLargeError (apiErr ) {
455
+ // TODO: Also check net.IsConnectionRefused
456
+ if net .IsConnectionReset (apiErr ) || net .IsProbableEOF (apiErr ) {
457
+ // The ApiServer Permanent Error may be caused by Network Transient Error,
458
+ // so treat it as Unknown Error for safety.
459
+ return PodMatchResult {
460
+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationUnknownError ],
461
+ Diagnostics : diag ,
462
+ }
463
+ } else {
464
+ return PodMatchResult {
465
+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationPermanentError ],
466
+ Diagnostics : diag ,
467
+ }
468
+ }
469
+ }
470
+
471
+ // Treat all other errors as Transient Error, including all non-APIStatus errors.
472
+ return PodMatchResult {
473
+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationTransientError ],
474
+ Diagnostics : diag ,
475
+ }
476
+ }
477
+
407
478
///////////////////////////////////////////////////////////////////////////////////////
408
479
// Completion Utils
409
480
///////////////////////////////////////////////////////////////////////////////////////
0 commit comments