Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit 1aa6e61

Browse files
authored
Support LogObjectSnapshot: Expose Framework and Pod History (#31)
1 parent 48f601b commit 1aa6e61

File tree

3 files changed

+106
-20
lines changed

3 files changed

+106
-20
lines changed

pkg/apis/frameworkcontroller/v1/config.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,35 @@ type Config struct {
7676
// all-or-nothing fashion in order to perform any useful work.
7777
FrameworkMinRetryDelaySecForTransientConflictFailed *int64 `yaml:"frameworkMinRetryDelaySecForTransientConflictFailed"`
7878
FrameworkMaxRetryDelaySecForTransientConflictFailed *int64 `yaml:"frameworkMaxRetryDelaySecForTransientConflictFailed"`
79+
80+
// Specify when to log the snapshot of which managed object.
81+
// This enables external systems to collect and process the history snapshots,
82+
// such as persistence, metrics conversion, visualization, alerting, acting,
83+
// analysis, etc.
84+
// Notes:
85+
// 1. The snapshot is logged to stderr.
86+
// 2. Check GetFrameworkSnapshotLogTail and GetPodSnapshotLogTail to see how
87+
// to extract the snapshot from stderr.
88+
// 3. The same snapshot may be logged more than once in some rare cases, so
89+
// external systems may need to deduplicate them by object.ResourceVersion.
90+
// 4. The snapshot triggered by deletion may be missed to log during the
91+
// FrameworkController downtime.
92+
LogObjectSnapshot LogObjectSnapshot `yaml:"logObjectSnapshot"`
93+
}
94+
95+
type LogObjectSnapshot struct {
96+
Framework LogFrameworkSnapshot `yaml:"framework"`
97+
Pod LogPodSnapshot `yaml:"pod"`
98+
}
99+
100+
type LogFrameworkSnapshot struct {
101+
OnTaskRetry *bool `yaml:"onTaskRetry"`
102+
OnFrameworkRetry *bool `yaml:"onFrameworkRetry"`
103+
OnFrameworkDeletion *bool `yaml:"onFrameworkDeletion"`
104+
}
105+
106+
type LogPodSnapshot struct {
107+
OnPodDeletion *bool `yaml:"onPodDeletion"`
79108
}
80109

81110
func NewConfig() *Config {
@@ -107,6 +136,18 @@ func NewConfig() *Config {
107136
if c.FrameworkMaxRetryDelaySecForTransientConflictFailed == nil {
108137
c.FrameworkMaxRetryDelaySecForTransientConflictFailed = common.PtrInt64(15 * 60)
109138
}
139+
if c.LogObjectSnapshot.Framework.OnTaskRetry == nil {
140+
c.LogObjectSnapshot.Framework.OnTaskRetry = common.PtrBool(true)
141+
}
142+
if c.LogObjectSnapshot.Framework.OnFrameworkRetry == nil {
143+
c.LogObjectSnapshot.Framework.OnFrameworkRetry = common.PtrBool(true)
144+
}
145+
if c.LogObjectSnapshot.Framework.OnFrameworkDeletion == nil {
146+
c.LogObjectSnapshot.Framework.OnFrameworkDeletion = common.PtrBool(true)
147+
}
148+
if c.LogObjectSnapshot.Pod.OnPodDeletion == nil {
149+
c.LogObjectSnapshot.Pod.OnPodDeletion = common.PtrBool(true)
150+
}
110151

111152
// Validation
112153
errPrefix := "Config Validation Failed: "

pkg/apis/frameworkcontroller/v1/funcs.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ func SplitTaskAttemptInstanceUID(taskAttemptInstanceUID *types.UID) (
9090
return int32(i), common.PtrUIDStr(parts[1])
9191
}
9292

93+
func GetFrameworkSnapshotLogTail(podPtr interface{}) string {
94+
return "FrameworkSnapshot: " + common.ToJson(podPtr)
95+
}
96+
97+
func GetPodSnapshotLogTail(frameworkPtr interface{}) string {
98+
return "PodSnapshot: " + common.ToJson(frameworkPtr)
99+
}
100+
93101
///////////////////////////////////////////////////////////////////////////////////////
94102
// Interfaces
95103
///////////////////////////////////////////////////////////////////////////////////////

pkg/controller/controller.go

Lines changed: 57 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -245,50 +245,63 @@ func NewFrameworkController() *FrameworkController {
245245

246246
fInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
247247
AddFunc: func(obj interface{}) {
248-
c.enqueueFrameworkObj(obj, "Framework Added")
248+
c.enqueueFrameworkObj(obj, "Framework Added", nil)
249249
},
250250
UpdateFunc: func(oldObj, newObj interface{}) {
251251
// FrameworkController only cares about Framework.Spec update
252252
oldF := oldObj.(*ci.Framework)
253253
newF := newObj.(*ci.Framework)
254254
if !reflect.DeepEqual(oldF.Spec, newF.Spec) {
255-
c.enqueueFrameworkObj(newObj, "Framework.Spec Updated")
255+
c.enqueueFrameworkObj(newObj, "Framework.Spec Updated", nil)
256256
}
257257
},
258258
DeleteFunc: func(obj interface{}) {
259-
c.enqueueFrameworkObj(obj, "Framework Deleted")
259+
c.enqueueFrameworkObj(obj, "Framework Deleted", func() string {
260+
if *c.cConfig.LogObjectSnapshot.Framework.OnFrameworkDeletion {
261+
return ": " + ci.GetFrameworkSnapshotLogTail(obj)
262+
} else {
263+
return ""
264+
}
265+
})
260266
},
261267
})
262268

263269
cmInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
264270
AddFunc: func(obj interface{}) {
265-
c.enqueueFrameworkConfigMapObj(obj, "Framework ConfigMap Added")
271+
c.enqueueFrameworkConfigMapObj(obj, "Framework ConfigMap Added", nil)
266272
},
267273
UpdateFunc: func(oldObj, newObj interface{}) {
268-
c.enqueueFrameworkConfigMapObj(newObj, "Framework ConfigMap Updated")
274+
c.enqueueFrameworkConfigMapObj(newObj, "Framework ConfigMap Updated", nil)
269275
},
270276
DeleteFunc: func(obj interface{}) {
271-
c.enqueueFrameworkConfigMapObj(obj, "Framework ConfigMap Deleted")
277+
c.enqueueFrameworkConfigMapObj(obj, "Framework ConfigMap Deleted", nil)
272278
},
273279
})
274280

275281
podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
276282
AddFunc: func(obj interface{}) {
277-
c.enqueueFrameworkPodObj(obj, "Framework Pod Added")
283+
c.enqueueFrameworkPodObj(obj, "Framework Pod Added", nil)
278284
},
279285
UpdateFunc: func(oldObj, newObj interface{}) {
280-
c.enqueueFrameworkPodObj(newObj, "Framework Pod Updated")
286+
c.enqueueFrameworkPodObj(newObj, "Framework Pod Updated", nil)
281287
},
282288
DeleteFunc: func(obj interface{}) {
283-
c.enqueueFrameworkPodObj(obj, "Framework Pod Deleted")
289+
c.enqueueFrameworkPodObj(obj, "Framework Pod Deleted", func() string {
290+
if *c.cConfig.LogObjectSnapshot.Pod.OnPodDeletion {
291+
return ": " + ci.GetPodSnapshotLogTail(obj)
292+
} else {
293+
return ""
294+
}
295+
})
284296
},
285297
})
286298

287299
return c
288300
}
289301

290302
// obj could be *ci.Framework or cache.DeletedFinalStateUnknown.
291-
func (c *FrameworkController) enqueueFrameworkObj(obj interface{}, msg string) {
303+
func (c *FrameworkController) enqueueFrameworkObj(
304+
obj interface{}, logSfx string, logTailFunc func() string) {
292305
key, err := internal.GetKey(obj)
293306
if err != nil {
294307
klog.Errorf("Failed to get key for obj %#v, skip to enqueue: %v", obj, err)
@@ -302,23 +315,29 @@ func (c *FrameworkController) enqueueFrameworkObj(obj interface{}, msg string) {
302315
}
303316

304317
c.fQueue.Add(key)
305-
klog.Infof("[%v]: enqueueFrameworkObj: %v", key, msg)
318+
319+
if logTailFunc != nil {
320+
logSfx += logTailFunc()
321+
}
322+
klog.Infof("[%v]: enqueueFrameworkObj: %v", key, logSfx)
306323
}
307324

308325
// obj could be *core.ConfigMap or cache.DeletedFinalStateUnknown.
309-
func (c *FrameworkController) enqueueFrameworkConfigMapObj(obj interface{}, msg string) {
326+
func (c *FrameworkController) enqueueFrameworkConfigMapObj(
327+
obj interface{}, logSfx string, logTailFunc func() string) {
310328
if cm := internal.ToConfigMap(obj); cm != nil {
311329
if f := c.getConfigMapOwner(cm); f != nil {
312-
c.enqueueFrameworkObj(f, msg+": "+cm.Name)
330+
c.enqueueFrameworkObj(f, logSfx+": "+cm.Name, logTailFunc)
313331
}
314332
}
315333
}
316334

317335
// obj could be *core.Pod or cache.DeletedFinalStateUnknown.
318-
func (c *FrameworkController) enqueueFrameworkPodObj(obj interface{}, msg string) {
336+
func (c *FrameworkController) enqueueFrameworkPodObj(
337+
obj interface{}, logSfx string, logTailFunc func() string) {
319338
if pod := internal.ToPod(obj); pod != nil {
320339
if cm := c.getPodOwner(pod); cm != nil {
321-
c.enqueueFrameworkConfigMapObj(cm, msg+": "+pod.Name)
340+
c.enqueueFrameworkConfigMapObj(cm, logSfx+": "+pod.Name, logTailFunc)
322341
}
323342
}
324343
}
@@ -393,6 +412,8 @@ func (c *FrameworkController) Run(stopCh <-chan struct{}) {
393412
c.cConfig.CRDEstablishedCheckIntervalSec,
394413
c.cConfig.CRDEstablishedCheckTimeoutSec)
395414

415+
// The recovery order is not important, since all Frameworks will be enqueued
416+
// to sync in any case.
396417
go c.fInformer.Run(stopCh)
397418
go c.cmInformer.Run(stopCh)
398419
go c.podInformer.Run(stopCh)
@@ -669,9 +690,9 @@ func (c *FrameworkController) enqueueTaskRetryDelayTimeoutCheck(
669690
return true
670691
}
671692

672-
func (c *FrameworkController) enqueueFramework(f *ci.Framework, msg string) {
693+
func (c *FrameworkController) enqueueFramework(f *ci.Framework, logSfx string) {
673694
c.fQueue.Add(f.Key())
674-
klog.Infof("[%v]: enqueueFramework: %v", f.Key(), msg)
695+
klog.Infof("[%v]: enqueueFramework: %v", f.Key(), logSfx)
675696
}
676697

677698
func (c *FrameworkController) syncFrameworkStatus(f *ci.Framework) error {
@@ -847,6 +868,14 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) {
847868

848869
// retryFramework
849870
klog.Infof(logPfx + "Retry Framework")
871+
872+
// The completed FrameworkAttempt has been persisted, so it is safe to also
873+
// expose it as one history snapshot.
874+
if *c.cConfig.LogObjectSnapshot.Framework.OnFrameworkRetry {
875+
klog.Infof(logPfx+
876+
"Framework will be retried: %v", ci.GetFrameworkSnapshotLogTail(f))
877+
}
878+
850879
f.Status.RetryPolicyStatus.TotalRetriedCount++
851880
if retryDecision.IsAccountable {
852881
f.Status.RetryPolicyStatus.AccountableRetriedCount++
@@ -1232,9 +1261,9 @@ func (c *FrameworkController) syncTaskState(
12321261
terminated := containerStatus.State.Terminated
12331262
if terminated != nil && terminated.ExitCode != 0 {
12341263
allContainerDiags = append(allContainerDiags, fmt.Sprintf(
1235-
"[Container %v, ExitCode: %v, Reason: %v, Message: %v]",
1236-
containerStatus.Name, terminated.ExitCode, terminated.Reason,
1237-
terminated.Message))
1264+
"[Container: %v, ExitCode: %v, Signal: %v, Reason: %v, Message: %v]",
1265+
containerStatus.Name, terminated.ExitCode, terminated.Signal,
1266+
terminated.Reason, common.ToJson(terminated.Message)))
12381267

12391268
if lastContainerExitCode == nil ||
12401269
lastContainerCompletionTime.Before(terminated.FinishedAt.Time) {
@@ -1323,6 +1352,14 @@ func (c *FrameworkController) syncTaskState(
13231352

13241353
// retryTask
13251354
klog.Infof(logPfx + "Retry Task")
1355+
1356+
// The completed TaskAttempt has been persisted, so it is safe to also
1357+
// expose it as one history snapshot.
1358+
if *c.cConfig.LogObjectSnapshot.Framework.OnTaskRetry {
1359+
klog.Infof(logPfx+
1360+
"Task will be retried: %v", ci.GetFrameworkSnapshotLogTail(f))
1361+
}
1362+
13261363
taskStatus.RetryPolicyStatus.TotalRetriedCount++
13271364
if retryDecision.IsAccountable {
13281365
taskStatus.RetryPolicyStatus.AccountableRetriedCount++

0 commit comments

Comments
 (0)