From e77d632db2c56eed84c5f261e3cb71825f8fa0dd Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Mon, 21 Apr 2025 14:24:47 -0700 Subject: [PATCH 1/7] upgrade api-go to 1.48.0 --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index b8a529a03..573c81bb3 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/nexus-rpc/sdk-go v0.3.0 github.com/robfig/cron v1.2.0 github.com/stretchr/testify v1.10.0 - go.temporal.io/api v1.46.0 + go.temporal.io/api v1.48.0 golang.org/x/sync v0.11.0 golang.org/x/sys v0.30.0 golang.org/x/time v0.3.0 diff --git a/go.sum b/go.sum index 0fc8b7de1..5f87472ef 100644 --- a/go.sum +++ b/go.sum @@ -71,8 +71,8 @@ github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= From f011e5457ddfce48f638c5c8005a569f2dc76e32 Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Mon, 21 Apr 2025 17:46:01 -0700 Subject: [PATCH 2/7] add support for benign application error category --- internal/error.go | 55 ++++++++++++++++++++++++++++++ internal/error_test.go | 21 ++++++++++-- internal/failure_converter.go | 2 ++ internal/internal_task_handlers.go | 6 +++- internal/internal_task_pollers.go | 31 +++++++++++------ 5 files changed, 101 insertions(+), 14 deletions(-) diff --git a/internal/error.go b/internal/error.go index 9282fe6c7..ad1c234d0 100644 --- a/internal/error.go +++ b/internal/error.go @@ -119,6 +119,9 @@ Workflow consumers will get an instance of *WorkflowExecutionError. This error w */ type ( + // Category of the error. Maps to logging/metrics behaviours. + ApplicationErrorCategory string + // ApplicationErrorOptions represents a combination of error attributes and additional requests. // All fields are optional, providing flexibility in error customization. // @@ -137,6 +140,7 @@ type ( // // NOTE: This option is supported by Temporal Server >= v1.24.2 older version will ignore this value. NextRetryDelay time.Duration + Category ApplicationErrorCategory } // ApplicationError returned from activity implementations with message and optional details. @@ -150,6 +154,7 @@ type ( cause error details converter.EncodedValues nextRetryDelay time.Duration + category ApplicationErrorCategory } // TimeoutError returned when activity or child workflow timed out. @@ -380,6 +385,11 @@ var ( ErrMissingWorkflowID = errors.New("workflow ID is unset for Nexus operation") ) +const ( + // ErrorCategoryBenign indicates an error that is expected under normal operation and should not trigger alerts. + ErrorCategoryBenign ApplicationErrorCategory = "benign" +) + // NewApplicationError create new instance of *ApplicationError with message, type, and optional details. func NewApplicationError(msg string, errType string, nonRetryable bool, cause error, details ...interface{}) error { return NewApplicationErrorWithOptions( @@ -397,6 +407,7 @@ func NewApplicationErrorWithOptions(msg string, errType string, options Applicat cause: options.Cause, nonRetryable: options.NonRetryable, nextRetryDelay: options.NextRetryDelay, + category: options.Category, } // When return error to user, use EncodedValues as details and data is ready to be decoded by calling Get details := options.Details @@ -661,6 +672,11 @@ func (e *ApplicationError) Unwrap() error { // a zero value means to use the activities retry policy. func (e *ApplicationError) NextRetryDelay() time.Duration { return e.nextRetryDelay } +// Category returns the ApplicationErrorCategory of the error. +func (e *ApplicationError) Category() ApplicationErrorCategory { + return e.category +} + // Error from error interface func (e *TimeoutError) Error() string { msg := fmt.Sprintf("%s (type: %s)", e.message(), e.timeoutType) @@ -1029,3 +1045,42 @@ func getErrType(err error) string { return t.Name() } + +func applicationErrorCategoryToProto(category ApplicationErrorCategory) enumspb.ApplicationErrorCategory { + switch category { + case ErrorCategoryBenign: + return enumspb.APPLICATION_ERROR_CATEGORY_BENIGN + case "": + // Zero value maps to unspecified + return enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED + default: + // Fallback to unspecified if unknown case + return enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED + } +} + +func applicationErrorCategoryFromProto(category enumspb.ApplicationErrorCategory) ApplicationErrorCategory { + switch category { + case enumspb.APPLICATION_ERROR_CATEGORY_BENIGN: + return ErrorCategoryBenign + case enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED: + // Unspecified maps to zero value + return "" + default: + // Fallback to zero value if unknown case + return "" + } +} + +func IsBenignApplicationError(err error) bool { + var appError *ApplicationError + return errors.As(err, &appError) && appError.Category() == ErrorCategoryBenign +} + +func isBenignProtoApplicationFailure(failure *failurepb.Failure) bool { + if failure == nil { + return false + } + appFailureInfo := failure.GetApplicationFailureInfo() + return appFailureInfo != nil && appFailureInfo.GetCategory() == enumspb.APPLICATION_ERROR_CATEGORY_BENIGN +} diff --git a/internal/error_test.go b/internal/error_test.go index 8a69ea4e0..c5880b27e 100644 --- a/internal/error_test.go +++ b/internal/error_test.go @@ -723,6 +723,7 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T) NonRetryable: true, Cause: errors.New("cause error"), Details: []interface{}{"details", 2208}, + Category: ErrorCategoryBenign, }, ) f := fc.ErrorToFailure(err) @@ -734,15 +735,27 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T) require.Equal("cause error", f.GetCause().GetMessage()) require.Equal("", f.GetCause().GetApplicationFailureInfo().GetType()) require.Nil(f.GetCause().GetCause()) + require.Equal(enumspb.APPLICATION_ERROR_CATEGORY_BENIGN, f.GetApplicationFailureInfo().GetCategory()) err2 := fc.FailureToError(f) var applicationErr *ApplicationError require.True(errors.As(err2, &applicationErr)) require.Equal("message (type: customType, retryable: false): cause error", applicationErr.Error()) + require.Equal(ErrorCategoryBenign, applicationErr.Category()) err2 = errors.Unwrap(err2) require.True(errors.As(err2, &applicationErr)) require.Equal("cause error", applicationErr.Error()) + + err := NewApplicationErrorWithOptions( + "another message", + "another customType", + ApplicationErrorOptions{ + Category: "", + }, + ) + f := fc.ErrorToFailure(err) + require.Equal(enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED, f.GetApplicationFailureInfo().GetCategory()) } func Test_convertErrorToFailure_EncodeMessage(t *testing.T) { @@ -1104,6 +1117,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) { Type: "MyCoolType", NonRetryable: true, Details: details, + Category: enumspb.APPLICATION_ERROR_CATEGORY_BENIGN, }}, Cause: &failurepb.Failure{ Message: "cause message", @@ -1120,6 +1134,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) { require.Equal("message (type: MyCoolType, retryable: false): cause message (type: UnknownType, retryable: true)", applicationErr.Error()) require.Equal("MyCoolType", applicationErr.Type()) require.Equal(true, applicationErr.NonRetryable()) + require.Equal(ErrorCategoryBenign, applicationErr.Category()) var str string var n int require.NoError(applicationErr.Details(&str, &n)) @@ -1149,8 +1164,9 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) { f = &failurepb.Failure{ Message: "message", FailureInfo: &failurepb.Failure_ApplicationFailureInfo{ApplicationFailureInfo: &failurepb.ApplicationFailureInfo{ - Type: "CoolError", - Details: details, + Type: "CoolError", + Details: details, + Category: enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED, }}, } @@ -1160,6 +1176,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) { require.Equal("message (type: CoolError, retryable: true)", coolErr.Error()) require.Equal("CoolError", coolErr.Type()) require.Equal(false, coolErr.NonRetryable()) + require.Equal("", coolErr.Category()) } func Test_convertFailureToError_CanceledFailure(t *testing.T) { diff --git a/internal/failure_converter.go b/internal/failure_converter.go index fcc2f687b..dfd6773d1 100644 --- a/internal/failure_converter.go +++ b/internal/failure_converter.go @@ -115,6 +115,7 @@ func (dfc *DefaultFailureConverter) ErrorToFailure(err error) *failurepb.Failure NonRetryable: err.NonRetryable(), Details: convertErrDetailsToPayloads(err.details, dfc.dataConverter), NextRetryDelay: delay, + Category: applicationErrorCategoryToProto(err.Category()), } failure.FailureInfo = &failurepb.Failure_ApplicationFailureInfo{ApplicationFailureInfo: failureInfo} case *CanceledError: @@ -250,6 +251,7 @@ func (dfc *DefaultFailureConverter) FailureToError(failure *failurepb.Failure) e Cause: dfc.FailureToError(failure.GetCause()), Details: []interface{}{details}, NextRetryDelay: nextRetryDelay, + Category: applicationErrorCategoryFromProto(applicationFailureInfo.GetCategory()), }, ) } diff --git a/internal/internal_task_handlers.go b/internal/internal_task_handlers.go index 41f60c95e..11f9f87ff 100644 --- a/internal/internal_task_handlers.go +++ b/internal/internal_task_handlers.go @@ -2311,7 +2311,11 @@ func (ath *activityTaskHandlerImpl) Execute(taskQueue string, t *workflowservice return nil, ctx.Err() } if err != nil && err != ErrActivityResultPending { - ath.logger.Error("Activity error.", + logFunc := ath.logger.Error // Default to Error + if IsBenignApplicationError(err) { + logFunc = ath.logger.Debug // Downgrade to Debug for benign application errors + } + logFunc("Activity error.", tagWorkflowID, t.WorkflowExecution.GetWorkflowId(), tagRunID, t.WorkflowExecution.GetRunId(), tagActivityType, activityType, diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index b554561fc..1acd85950 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -474,19 +474,26 @@ func (wtp *workflowTaskPoller) RespondTaskCompletedWithMetrics( ) (response *workflowservice.RespondWorkflowTaskCompletedResponse, err error) { metricsHandler := wtp.metricsHandler.WithTags(metrics.WorkflowTags(task.WorkflowType.GetName())) if taskErr != nil { - wtp.logger.Warn("Failed to process workflow task.", - tagWorkflowType, task.WorkflowType.GetName(), - tagWorkflowID, task.WorkflowExecution.GetWorkflowId(), - tagRunID, task.WorkflowExecution.GetRunId(), - tagAttempt, task.Attempt, - tagError, taskErr) failWorkflowTask := wtp.errorToFailWorkflowTask(task.TaskToken, taskErr) + completedRequest = failWorkflowTask + failureReason := "WorkflowError" if failWorkflowTask.Cause == enumspb.WORKFLOW_TASK_FAILED_CAUSE_NON_DETERMINISTIC_ERROR { failureReason = "NonDeterminismError" } - incrementWorkflowTaskFailureCounter(metricsHandler, failureReason) - completedRequest = failWorkflowTask + + logFunc := wtp.logger.Warn // Default to Warn + if IsBenignApplicationError(taskErr) { + logFunc = wtp.logger.Debug // Downgrade to Debug for benign application errors + } else { + incrementWorkflowTaskFailureCounter(metricsHandler, failureReason) + } + logFunc("Failed to process workflow task.", + tagWorkflowType, task.WorkflowType.GetName(), + tagWorkflowID, task.WorkflowExecution.GetWorkflowId(), + tagRunID, task.WorkflowExecution.GetRunId(), + tagAttempt, task.Attempt, + tagError, taskErr) } metricsHandler.Timer(metrics.WorkflowTaskExecutionLatency).Record(time.Since(startTime)) @@ -705,7 +712,7 @@ func (lath *localActivityTaskHandler) executeLocalActivityTask(task *localActivi metricsHandler.Counter(metrics.LocalActivityErrorCounter).Inc(1) err = newPanicError(p, st) } - if err != nil { + if err != nil && !IsBenignApplicationError(err) { metricsHandler.Counter(metrics.LocalActivityFailedCounter).Inc(1) metricsHandler.Counter(metrics.LocalActivityExecutionFailedCounter).Inc(1) } @@ -1104,8 +1111,10 @@ func (atp *activityTaskPoller) ProcessTask(task interface{}) error { return err } // in case if activity execution failed, request should be of type RespondActivityTaskFailedRequest - if _, ok := request.(*workflowservice.RespondActivityTaskFailedRequest); ok { - activityMetricsHandler.Counter(metrics.ActivityExecutionFailedCounter).Inc(1) + if req, ok := request.(*workflowservice.RespondActivityTaskFailedRequest); ok { + if !isBenignProtoApplicationFailure(req.Failure) { + activityMetricsHandler.Counter(metrics.ActivityExecutionFailedCounter).Inc(1) + } } activityMetricsHandler.Timer(metrics.ActivityExecutionLatency).Record(time.Since(executionStartTime)) From 388c9baab17cf975e9194ef9a91fb2334aa64bd3 Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Mon, 21 Apr 2025 17:59:35 -0700 Subject: [PATCH 3/7] go mod tidy all go.mod files, linting/formatting fixes, ci fixes --- contrib/datadog/go.mod | 2 +- contrib/datadog/go.sum | 4 ++-- contrib/envconfig/go.mod | 2 +- contrib/envconfig/go.sum | 4 ++-- contrib/opentelemetry/go.mod | 2 +- contrib/opentelemetry/go.sum | 4 ++-- contrib/opentracing/go.mod | 2 +- contrib/opentracing/go.sum | 4 ++-- contrib/resourcetuner/go.mod | 2 +- contrib/resourcetuner/go.sum | 4 ++-- contrib/tally/go.mod | 2 +- contrib/tally/go.sum | 4 ++-- internal/cmd/build/go.mod | 2 +- internal/cmd/build/go.sum | 4 ++-- internal/error.go | 2 +- internal/error_test.go | 6 +++--- test/go.mod | 2 +- test/go.sum | 4 ++-- 18 files changed, 28 insertions(+), 28 deletions(-) diff --git a/contrib/datadog/go.mod b/contrib/datadog/go.mod index d6115b803..c62d5eb4c 100644 --- a/contrib/datadog/go.mod +++ b/contrib/datadog/go.mod @@ -39,7 +39,7 @@ require ( github.com/secure-systems-lab/go-securesystemslib v0.7.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/tinylib/msgp v1.1.8 // indirect - go.temporal.io/api v1.46.0 // indirect + go.temporal.io/api v1.48.0 // indirect go.uber.org/atomic v1.11.0 // indirect go4.org/intern v0.0.0-20230525184215-6c62f75575cb // indirect go4.org/unsafe/assume-no-moving-gc v0.0.0-20230525183740-e7c30c78aeb2 // indirect diff --git a/contrib/datadog/go.sum b/contrib/datadog/go.sum index 9b8ad87ff..7707c5e83 100644 --- a/contrib/datadog/go.sum +++ b/contrib/datadog/go.sum @@ -134,8 +134,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= diff --git a/contrib/envconfig/go.mod b/contrib/envconfig/go.mod index 3a7552de8..e179c5f76 100644 --- a/contrib/envconfig/go.mod +++ b/contrib/envconfig/go.mod @@ -22,7 +22,7 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/robfig/cron v1.2.0 // indirect github.com/stretchr/objx v0.5.2 // indirect - go.temporal.io/api v1.46.0 // indirect + go.temporal.io/api v1.48.0 // indirect golang.org/x/net v0.36.0 // indirect golang.org/x/sync v0.11.0 // indirect golang.org/x/sys v0.30.0 // indirect diff --git a/contrib/envconfig/go.sum b/contrib/envconfig/go.sum index 7fc32e07d..e16cd419f 100644 --- a/contrib/envconfig/go.sum +++ b/contrib/envconfig/go.sum @@ -73,8 +73,8 @@ github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= diff --git a/contrib/opentelemetry/go.mod b/contrib/opentelemetry/go.mod index 4e18950eb..471de5bf7 100644 --- a/contrib/opentelemetry/go.mod +++ b/contrib/opentelemetry/go.mod @@ -32,7 +32,7 @@ require ( github.com/stretchr/objx v0.5.2 // indirect go.opentelemetry.io/otel/metric v1.27.0 go.opentelemetry.io/otel/sdk/metric v1.27.0 - go.temporal.io/api v1.46.0 // indirect + go.temporal.io/api v1.48.0 // indirect golang.org/x/net v0.36.0 // indirect golang.org/x/sys v0.30.0 // indirect golang.org/x/text v0.22.0 // indirect diff --git a/contrib/opentelemetry/go.sum b/contrib/opentelemetry/go.sum index 58c0ddda6..0fb072cc8 100644 --- a/contrib/opentelemetry/go.sum +++ b/contrib/opentelemetry/go.sum @@ -86,8 +86,8 @@ go.opentelemetry.io/otel/sdk/metric v1.27.0 h1:5uGNOlpXi+Hbo/DRoI31BSb1v+OGcpv2N go.opentelemetry.io/otel/sdk/metric v1.27.0/go.mod h1:we7jJVrYN2kh3mVBlswtPU22K0SA+769l93J6bsyvqw= go.opentelemetry.io/otel/trace v1.27.0 h1:IqYb813p7cmbHk0a5y6pD5JPakbVfftRXABGt5/Rscw= go.opentelemetry.io/otel/trace v1.27.0/go.mod h1:6RiD1hkAprV4/q+yd2ln1HG9GoPx39SuvvstaLBl+l4= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= diff --git a/contrib/opentracing/go.mod b/contrib/opentracing/go.mod index fdcb6989f..0a647208a 100644 --- a/contrib/opentracing/go.mod +++ b/contrib/opentracing/go.mod @@ -22,7 +22,7 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/robfig/cron v1.2.0 // indirect github.com/stretchr/objx v0.5.2 // indirect - go.temporal.io/api v1.46.0 // indirect + go.temporal.io/api v1.48.0 // indirect golang.org/x/net v0.36.0 // indirect golang.org/x/sync v0.11.0 // indirect golang.org/x/sys v0.30.0 // indirect diff --git a/contrib/opentracing/go.sum b/contrib/opentracing/go.sum index f7e437a72..393a47e17 100644 --- a/contrib/opentracing/go.sum +++ b/contrib/opentracing/go.sum @@ -73,8 +73,8 @@ github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= diff --git a/contrib/resourcetuner/go.mod b/contrib/resourcetuner/go.mod index 489392f83..946771607 100644 --- a/contrib/resourcetuner/go.mod +++ b/contrib/resourcetuner/go.mod @@ -36,7 +36,7 @@ require ( github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect - go.temporal.io/api v1.46.0 // indirect + go.temporal.io/api v1.48.0 // indirect golang.org/x/exp v0.0.0-20231127185646-65229373498e // indirect golang.org/x/net v0.36.0 // indirect golang.org/x/sync v0.11.0 // indirect diff --git a/contrib/resourcetuner/go.sum b/contrib/resourcetuner/go.sum index 1db988427..db08ef36e 100644 --- a/contrib/resourcetuner/go.sum +++ b/contrib/resourcetuner/go.sum @@ -106,8 +106,8 @@ github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.einride.tech/pid v0.1.3 h1:yWAKSmD2Z10jxd4gYFhOjbBNqXeIQwAtnCO/XKCT7sQ= go.einride.tech/pid v0.1.3/go.mod h1:33JSUbKrH/4v8DZf/0K8IC8Enjd92wB2birp+bCYQso= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= diff --git a/contrib/tally/go.mod b/contrib/tally/go.mod index f6d677680..83da008cf 100644 --- a/contrib/tally/go.mod +++ b/contrib/tally/go.mod @@ -23,7 +23,7 @@ require ( github.com/robfig/cron v1.2.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/twmb/murmur3 v1.1.5 // indirect - go.temporal.io/api v1.46.0 // indirect + go.temporal.io/api v1.48.0 // indirect go.uber.org/atomic v1.9.0 // indirect golang.org/x/net v0.36.0 // indirect golang.org/x/sync v0.11.0 // indirect diff --git a/contrib/tally/go.sum b/contrib/tally/go.sum index c17e414d4..36988b47f 100644 --- a/contrib/tally/go.sum +++ b/contrib/tally/go.sum @@ -138,8 +138,8 @@ github.com/uber-go/tally/v4 v4.1.1/go.mod h1:aXeSTDMl4tNosyf6rdU8jlgScHyjEGGtfJ/ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= diff --git a/internal/cmd/build/go.mod b/internal/cmd/build/go.mod index 1bcdab7e1..4fd80ecec 100644 --- a/internal/cmd/build/go.mod +++ b/internal/cmd/build/go.mod @@ -24,7 +24,7 @@ require ( github.com/robfig/cron v1.2.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/stretchr/testify v1.10.0 // indirect - go.temporal.io/api v1.46.0 // indirect + go.temporal.io/api v1.48.0 // indirect golang.org/x/exp/typeparams v0.0.0-20250210185358-939b2ce775ac // indirect golang.org/x/mod v0.23.0 // indirect golang.org/x/net v0.36.0 // indirect diff --git a/internal/cmd/build/go.sum b/internal/cmd/build/go.sum index 6581a7339..5ca7a1904 100644 --- a/internal/cmd/build/go.sum +++ b/internal/cmd/build/go.sum @@ -89,8 +89,8 @@ go.opentelemetry.io/otel/sdk/metric v1.32.0 h1:rZvFnvmvawYb0alrYkjraqJq0Z4ZUJAiy go.opentelemetry.io/otel/sdk/metric v1.32.0/go.mod h1:PWeZlq0zt9YkYAp3gjKZ0eicRYvOh1Gd+X99x6GHpCQ= go.opentelemetry.io/otel/trace v1.32.0 h1:WIC9mYrXf8TmY/EXuULKc8hR17vE+Hjv2cssQDe03fM= go.opentelemetry.io/otel/trace v1.32.0/go.mod h1:+i4rkvCraA+tG6AzwloGaCtkx53Fa+L+V8e9a7YvhT8= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= diff --git a/internal/error.go b/internal/error.go index ad1c234d0..870fabb9a 100644 --- a/internal/error.go +++ b/internal/error.go @@ -387,7 +387,7 @@ var ( const ( // ErrorCategoryBenign indicates an error that is expected under normal operation and should not trigger alerts. - ErrorCategoryBenign ApplicationErrorCategory = "benign" + ErrorCategoryBenign ApplicationErrorCategory = "BENIGN" ) // NewApplicationError create new instance of *ApplicationError with message, type, and optional details. diff --git a/internal/error_test.go b/internal/error_test.go index c5880b27e..51e1c9f0d 100644 --- a/internal/error_test.go +++ b/internal/error_test.go @@ -747,14 +747,14 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T) require.True(errors.As(err2, &applicationErr)) require.Equal("cause error", applicationErr.Error()) - err := NewApplicationErrorWithOptions( + err = NewApplicationErrorWithOptions( "another message", "another customType", ApplicationErrorOptions{ Category: "", }, ) - f := fc.ErrorToFailure(err) + f = fc.ErrorToFailure(err) require.Equal(enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED, f.GetApplicationFailureInfo().GetCategory()) } @@ -1176,7 +1176,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) { require.Equal("message (type: CoolError, retryable: true)", coolErr.Error()) require.Equal("CoolError", coolErr.Type()) require.Equal(false, coolErr.NonRetryable()) - require.Equal("", coolErr.Category()) + require.Equal(ApplicationErrorCategory(""), coolErr.Category()) } func Test_convertFailureToError_CanceledFailure(t *testing.T) { diff --git a/test/go.mod b/test/go.mod index f73880888..308d7de33 100644 --- a/test/go.mod +++ b/test/go.mod @@ -14,7 +14,7 @@ require ( go.opentelemetry.io/otel v1.28.0 go.opentelemetry.io/otel/sdk v1.28.0 go.opentelemetry.io/otel/trace v1.28.0 - go.temporal.io/api v1.46.0 + go.temporal.io/api v1.48.0 go.temporal.io/sdk v1.29.1 go.temporal.io/sdk/contrib/opentelemetry v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/opentracing v0.0.0-00010101000000-000000000000 diff --git a/test/go.sum b/test/go.sum index d8db3e3ba..e05159777 100644 --- a/test/go.sum +++ b/test/go.sum @@ -190,8 +190,8 @@ go.opentelemetry.io/otel/sdk/metric v1.27.0 h1:5uGNOlpXi+Hbo/DRoI31BSb1v+OGcpv2N go.opentelemetry.io/otel/sdk/metric v1.27.0/go.mod h1:we7jJVrYN2kh3mVBlswtPU22K0SA+769l93J6bsyvqw= go.opentelemetry.io/otel/trace v1.28.0 h1:GhQ9cUuQGmNDd5BTCP2dAvv75RdMxEfTmYejp+lkx9g= go.opentelemetry.io/otel/trace v1.28.0/go.mod h1:jPyXzNPg6da9+38HEwElrQiHlVMTnVfM3/yv2OlIHaI= -go.temporal.io/api v1.46.0 h1:O1efPDB6O2B8uIeCDIa+3VZC7tZMvYsMZYQapSbHvCg= -go.temporal.io/api v1.46.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/api v1.48.0 h1:qk8c2QkB4RRjBw2yB82hUulbhHxRIYnem+u2mvlbUf8= +go.temporal.io/api v1.48.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= From d96a9923a9d471677ea2121dd74a67e6a7727fa3 Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Tue, 22 Apr 2025 02:37:14 -0700 Subject: [PATCH 4/7] fixes, added tests --- internal/internal_task_handlers.go | 4 +- internal/internal_task_pollers.go | 9 +- test/integration_test.go | 196 +++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+), 8 deletions(-) diff --git a/internal/internal_task_handlers.go b/internal/internal_task_handlers.go index 11f9f87ff..d06bede67 100644 --- a/internal/internal_task_handlers.go +++ b/internal/internal_task_handlers.go @@ -1894,7 +1894,9 @@ func (wth *workflowTaskHandlerImpl) completeWorkflow( }} } else if workflowContext.err != nil { // Workflow failures - metricsHandler.Counter(metrics.WorkflowFailedCounter).Inc(1) + if !IsBenignApplicationError(workflowContext.err) { + metricsHandler.Counter(metrics.WorkflowFailedCounter).Inc(1) + } closeCommand = createNewCommand(enumspb.COMMAND_TYPE_FAIL_WORKFLOW_EXECUTION) failure := wth.failureConverter.ErrorToFailure(workflowContext.err) closeCommand.Attributes = &commandpb.Command_FailWorkflowExecutionCommandAttributes{FailWorkflowExecutionCommandAttributes: &commandpb.FailWorkflowExecutionCommandAttributes{ diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index 1acd85950..09d9d2cea 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -482,13 +482,8 @@ func (wtp *workflowTaskPoller) RespondTaskCompletedWithMetrics( failureReason = "NonDeterminismError" } - logFunc := wtp.logger.Warn // Default to Warn - if IsBenignApplicationError(taskErr) { - logFunc = wtp.logger.Debug // Downgrade to Debug for benign application errors - } else { - incrementWorkflowTaskFailureCounter(metricsHandler, failureReason) - } - logFunc("Failed to process workflow task.", + incrementWorkflowTaskFailureCounter(metricsHandler, failureReason) + wtp.logger.Warn("Failed to process workflow task.", tagWorkflowType, task.WorkflowType.GetName(), tagWorkflowID, task.WorkflowExecution.GetWorkflowId(), tagRunID, task.WorkflowExecution.GetRunId(), diff --git a/test/integration_test.go b/test/integration_test.go index 9c680e5f2..81cd65df2 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -32,6 +32,7 @@ import ( "math" "math/rand" "os" + "slices" "strconv" "strings" "sync" @@ -7522,3 +7523,198 @@ func (ts *IntegrationTestSuite) TestRawValueQueryMetadata() { ts.Equal("Basic", metadata.Definition.Type) ts.Equal(3, len(metadata.Definition.QueryDefinitions)) } + +func (ts *IntegrationTestSuite) TestWorkflowTaskFailureMetric_BenignHandling() { + wfWithApplicationErr := func(ctx workflow.Context, isBenign bool) error { + if !isBenign { + return temporal.NewApplicationError("Non-benign failure", "", false, nil) + } + return temporal.NewApplicationErrorWithOptions( + "Benign failure", + "", + temporal.ApplicationErrorOptions{ + Category: internal.ErrorCategoryBenign, + }, + ) + } + + ts.worker.RegisterWorkflow(wfWithApplicationErr) + currCount := ts.metricCount(metrics.WorkflowFailedCounter) + + runNonBenign, err := ts.client.ExecuteWorkflow( + context.Background(), + ts.startWorkflowOptions("test-non-benign-failure-metric"), + wfWithApplicationErr, + false, + ) + ts.NoError(err) + // Wait for completion + err = runNonBenign.Get(context.Background(), nil) + // Expect a non-benign application error. + ts.Error(err) + var appErr *temporal.ApplicationError + ts.True(errors.As(err, &appErr)) + ts.False(internal.IsBenignApplicationError(err)) + ts.Equal("Non-benign failure", appErr.Error()) + + // Expect initial count to have incremented because the workflow failed with non-benign err. + currCount++ + ts.assertMetricCount(metrics.WorkflowFailedCounter, currCount) + + runBenign, err := ts.client.ExecuteWorkflow( + context.Background(), + ts.startWorkflowOptions("test-benign-failure-metric"), + wfWithApplicationErr, + true, + ) + ts.NoError(err) + // Wait for completion + err = runBenign.Get(context.Background(), nil) + // Expect a benign application error. + ts.Error(err) + ts.True(errors.As(err, &appErr)) + ts.True(internal.IsBenignApplicationError(err)) + // Expect count to not have incremented because the workflow failed with benign err. + ts.assertMetricCount(metrics.WorkflowFailedCounter, currCount) +} + +func (ts *IntegrationTestSuite) TestActivityFailureMetric_BenignHandling() { + actWithAppErr := func(ctx context.Context, isBenign bool) error { + if isBenign { + return temporal.NewApplicationErrorWithOptions("Benign act failure", "", + temporal.ApplicationErrorOptions{Category: internal.ErrorCategoryBenign}) + } + return temporal.NewApplicationError("Non-benign act failure", "", false, nil) + } + + wfWithAppErrActivity := func(ctx workflow.Context, isBenign bool) error { + ctx = workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 3 * time.Second, + // Don't retry + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + }) + return workflow.ExecuteActivity(ctx, actWithAppErr, isBenign).Get(ctx, nil) + } + + // Configure client/worker with logger, capture logs + logger := ilog.NewMemoryLogger() + c, err := client.Dial(client.Options{ + HostPort: ts.config.ServiceAddr, + Namespace: ts.config.Namespace, + Logger: logger, + ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, + MetricsHandler: ts.metricsHandler, + }) + ts.NoError(err) + defer c.Close() + + testWorker := worker.New(c, ts.taskQueueName, worker.Options{}) + testWorker.RegisterActivity(actWithAppErr) + testWorker.RegisterWorkflow(wfWithAppErrActivity) + err = testWorker.Start() + ts.NoError(err) + defer testWorker.Stop() + + var appErr *temporal.ApplicationError + currCount := ts.metricCount(metrics.ActivityExecutionFailedCounter) + + runNonBenign, err := c.ExecuteWorkflow( + context.Background(), + ts.startWorkflowOptions(ts.T().Name()+"-non-benign-err"), + wfWithAppErrActivity, + false, + ) + ts.NoError(err) + // Wait for completion + err = runNonBenign.Get(context.Background(), nil) + ts.Error(err) + ts.True(errors.As(err, &appErr)) + // Expect non-benign error + ts.False(internal.IsBenignApplicationError(err)) + // Expect warn log for activity failure + ts.True(slices.ContainsFunc(logger.Lines(), func(line string) bool { + return strings.Contains(line, "ERROR") && strings.Contains(line, "Activity error.") + })) + + // Expect initial count to have incremented because the activity failed with non-benign err. + currCount++ + ts.assertMetricCount(metrics.ActivityExecutionFailedCounter, currCount) + + runBenign, err := c.ExecuteWorkflow( + context.Background(), + ts.startWorkflowOptions(ts.T().Name()+"-benign-err"), + wfWithAppErrActivity, true, + ) + ts.NoError(err) + // Wait for completion + err = runBenign.Get(context.Background(), nil) + ts.Error(err) + ts.True(errors.As(err, &appErr)) + // Expect benign error + ts.True(internal.IsBenignApplicationError(err)) + // Expect debug log for activity failure + ts.True(slices.ContainsFunc(logger.Lines(), func(line string) bool { + return strings.Contains(line, "DEBUG") && strings.Contains(line, "Activity error.") + })) + + // Expect count to not have incremented because the activity failed with benign err. + ts.assertMetricCount(metrics.ActivityExecutionFailedCounter, currCount) +} + +func (ts *IntegrationTestSuite) TestLocalActivityFailureMetric_BenignHandling() { + localActWithAppErr := func(ctx context.Context, isBenign bool) error { + if isBenign { + return temporal.NewApplicationErrorWithOptions("Benign local act failure", "", + temporal.ApplicationErrorOptions{Category: internal.ErrorCategoryBenign}) + } + return temporal.NewApplicationError("Non-benign local act failure", "", false, nil) + } + + wfWithLocalActAppErr := func(ctx workflow.Context, isBenign bool) error { + ctx = workflow.WithLocalActivityOptions(ctx, workflow.LocalActivityOptions{ + ScheduleToCloseTimeout: 3 * time.Second, + // Don't retry + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + }) + return workflow.ExecuteLocalActivity(ctx, localActWithAppErr, isBenign).Get(ctx, nil) + } + + ts.worker.RegisterActivity(localActWithAppErr) + ts.worker.RegisterWorkflow(wfWithLocalActAppErr) + + var appErr *temporal.ApplicationError + currCount := ts.metricCount(metrics.LocalActivityExecutionFailedCounter) + + runNonBenign, err := ts.client.ExecuteWorkflow( + context.Background(), + ts.startWorkflowOptions(ts.T().Name()+"-non-benign-err"), + wfWithLocalActAppErr, false, + ) + ts.NoError(err) + // Wait for completion + err = runNonBenign.Get(context.Background(), nil) + ts.Error(err) + ts.True(errors.As(err, &appErr)) + // Expect non-benign error + ts.False(internal.IsBenignApplicationError(err)) + + // Expect initial count to have incremented because the activity failed with non-benign err. + currCount++ + ts.assertMetricCount(metrics.LocalActivityExecutionFailedCounter, currCount) + + runBenign, err := ts.client.ExecuteWorkflow( + context.Background(), + ts.startWorkflowOptions(ts.T().Name()+"-benign-err"), + wfWithLocalActAppErr, true, + ) + ts.NoError(err) + // Wait for completion + err = runBenign.Get(context.Background(), nil) + ts.Error(err) + ts.True(errors.As(err, &appErr)) + // Expect benign error + ts.True(internal.IsBenignApplicationError(err)) + + // Expect count to remain unchanged + ts.assertMetricCount(metrics.LocalActivityExecutionFailedCounter, currCount) +} From 5646e0ad3835433de258939a1a4d88689625791c Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Thu, 24 Apr 2025 10:27:55 -0700 Subject: [PATCH 5/7] Address PR review: - fully qualified names ApplicationErrorCategory - export category for external usage... - make enum int instead of string --- internal/error.go | 52 +++++++++++------------------- internal/error_test.go | 10 +++--- internal/failure_converter.go | 4 +-- internal/internal_task_handlers.go | 4 +-- internal/internal_task_pollers.go | 18 +++++------ temporal/error.go | 11 +++++++ test/integration_test.go | 18 +++++------ 7 files changed, 55 insertions(+), 62 deletions(-) diff --git a/internal/error.go b/internal/error.go index 870fabb9a..beef48522 100644 --- a/internal/error.go +++ b/internal/error.go @@ -119,9 +119,6 @@ Workflow consumers will get an instance of *WorkflowExecutionError. This error w */ type ( - // Category of the error. Maps to logging/metrics behaviours. - ApplicationErrorCategory string - // ApplicationErrorOptions represents a combination of error attributes and additional requests. // All fields are optional, providing flexibility in error customization. // @@ -140,7 +137,8 @@ type ( // // NOTE: This option is supported by Temporal Server >= v1.24.2 older version will ignore this value. NextRetryDelay time.Duration - Category ApplicationErrorCategory + // Category of the error. Maps to logging/metrics behaviours. + Category ApplicationErrorCategory } // ApplicationError returned from activity implementations with message and optional details. @@ -385,9 +383,21 @@ var ( ErrMissingWorkflowID = errors.New("workflow ID is unset for Nexus operation") ) +// ApplicationErrorCategory sets the category of the error. The category of the error +// maps to logging/metrics behaviours. +// +// Exposed as: [go.temporal.io/sdk/temporal.ApplicationErrorCategory] +type ApplicationErrorCategory int + const ( - // ErrorCategoryBenign indicates an error that is expected under normal operation and should not trigger alerts. - ErrorCategoryBenign ApplicationErrorCategory = "BENIGN" + // ApplicationErrorCategoryUnspecified represents an error with an unspecified category. + // + // Exposed as: [go.temporal.io/sdk/temporal.ApplicationErrorCategoryUnspecified] + ApplicationErrorCategoryUnspecified ApplicationErrorCategory = iota + // ApplicationErrorCategoryBenign indicates an error that is expected under normal operation and should not trigger alerts. + // + // Exposed as: [go.temporal.io/sdk/temporal.ApplicationErrorCategoryBenign] + ApplicationErrorCategoryBenign ) // NewApplicationError create new instance of *ApplicationError with message, type, and optional details. @@ -1046,35 +1056,9 @@ func getErrType(err error) string { return t.Name() } -func applicationErrorCategoryToProto(category ApplicationErrorCategory) enumspb.ApplicationErrorCategory { - switch category { - case ErrorCategoryBenign: - return enumspb.APPLICATION_ERROR_CATEGORY_BENIGN - case "": - // Zero value maps to unspecified - return enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED - default: - // Fallback to unspecified if unknown case - return enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED - } -} - -func applicationErrorCategoryFromProto(category enumspb.ApplicationErrorCategory) ApplicationErrorCategory { - switch category { - case enumspb.APPLICATION_ERROR_CATEGORY_BENIGN: - return ErrorCategoryBenign - case enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED: - // Unspecified maps to zero value - return "" - default: - // Fallback to zero value if unknown case - return "" - } -} - -func IsBenignApplicationError(err error) bool { +func isBenignApplicationError(err error) bool { var appError *ApplicationError - return errors.As(err, &appError) && appError.Category() == ErrorCategoryBenign + return errors.As(err, &appError) && appError.Category() == ApplicationErrorCategoryBenign } func isBenignProtoApplicationFailure(failure *failurepb.Failure) bool { diff --git a/internal/error_test.go b/internal/error_test.go index 51e1c9f0d..e6bab289a 100644 --- a/internal/error_test.go +++ b/internal/error_test.go @@ -723,7 +723,7 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T) NonRetryable: true, Cause: errors.New("cause error"), Details: []interface{}{"details", 2208}, - Category: ErrorCategoryBenign, + Category: ApplicationErrorCategoryBenign, }, ) f := fc.ErrorToFailure(err) @@ -741,7 +741,7 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T) var applicationErr *ApplicationError require.True(errors.As(err2, &applicationErr)) require.Equal("message (type: customType, retryable: false): cause error", applicationErr.Error()) - require.Equal(ErrorCategoryBenign, applicationErr.Category()) + require.Equal(ApplicationErrorCategoryBenign, applicationErr.Category()) err2 = errors.Unwrap(err2) require.True(errors.As(err2, &applicationErr)) @@ -751,7 +751,7 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T) "another message", "another customType", ApplicationErrorOptions{ - Category: "", + Category: ApplicationErrorCategoryUnspecified, }, ) f = fc.ErrorToFailure(err) @@ -1134,7 +1134,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) { require.Equal("message (type: MyCoolType, retryable: false): cause message (type: UnknownType, retryable: true)", applicationErr.Error()) require.Equal("MyCoolType", applicationErr.Type()) require.Equal(true, applicationErr.NonRetryable()) - require.Equal(ErrorCategoryBenign, applicationErr.Category()) + require.Equal(ApplicationErrorCategoryBenign, applicationErr.Category()) var str string var n int require.NoError(applicationErr.Details(&str, &n)) @@ -1176,7 +1176,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) { require.Equal("message (type: CoolError, retryable: true)", coolErr.Error()) require.Equal("CoolError", coolErr.Type()) require.Equal(false, coolErr.NonRetryable()) - require.Equal(ApplicationErrorCategory(""), coolErr.Category()) + require.Equal(ApplicationErrorCategoryUnspecified, coolErr.Category()) } func Test_convertFailureToError_CanceledFailure(t *testing.T) { diff --git a/internal/failure_converter.go b/internal/failure_converter.go index dfd6773d1..da3686c1d 100644 --- a/internal/failure_converter.go +++ b/internal/failure_converter.go @@ -115,7 +115,7 @@ func (dfc *DefaultFailureConverter) ErrorToFailure(err error) *failurepb.Failure NonRetryable: err.NonRetryable(), Details: convertErrDetailsToPayloads(err.details, dfc.dataConverter), NextRetryDelay: delay, - Category: applicationErrorCategoryToProto(err.Category()), + Category: enumspb.ApplicationErrorCategory(err.Category()), } failure.FailureInfo = &failurepb.Failure_ApplicationFailureInfo{ApplicationFailureInfo: failureInfo} case *CanceledError: @@ -251,7 +251,7 @@ func (dfc *DefaultFailureConverter) FailureToError(failure *failurepb.Failure) e Cause: dfc.FailureToError(failure.GetCause()), Details: []interface{}{details}, NextRetryDelay: nextRetryDelay, - Category: applicationErrorCategoryFromProto(applicationFailureInfo.GetCategory()), + Category: ApplicationErrorCategory(applicationFailureInfo.GetCategory()), }, ) } diff --git a/internal/internal_task_handlers.go b/internal/internal_task_handlers.go index d06bede67..d1ac651a8 100644 --- a/internal/internal_task_handlers.go +++ b/internal/internal_task_handlers.go @@ -1894,7 +1894,7 @@ func (wth *workflowTaskHandlerImpl) completeWorkflow( }} } else if workflowContext.err != nil { // Workflow failures - if !IsBenignApplicationError(workflowContext.err) { + if !isBenignApplicationError(workflowContext.err) { metricsHandler.Counter(metrics.WorkflowFailedCounter).Inc(1) } closeCommand = createNewCommand(enumspb.COMMAND_TYPE_FAIL_WORKFLOW_EXECUTION) @@ -2314,7 +2314,7 @@ func (ath *activityTaskHandlerImpl) Execute(taskQueue string, t *workflowservice } if err != nil && err != ErrActivityResultPending { logFunc := ath.logger.Error // Default to Error - if IsBenignApplicationError(err) { + if isBenignApplicationError(err) { logFunc = ath.logger.Debug // Downgrade to Debug for benign application errors } logFunc("Activity error.", diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index 09d9d2cea..023c1939c 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -474,21 +474,19 @@ func (wtp *workflowTaskPoller) RespondTaskCompletedWithMetrics( ) (response *workflowservice.RespondWorkflowTaskCompletedResponse, err error) { metricsHandler := wtp.metricsHandler.WithTags(metrics.WorkflowTags(task.WorkflowType.GetName())) if taskErr != nil { - failWorkflowTask := wtp.errorToFailWorkflowTask(task.TaskToken, taskErr) - completedRequest = failWorkflowTask - - failureReason := "WorkflowError" - if failWorkflowTask.Cause == enumspb.WORKFLOW_TASK_FAILED_CAUSE_NON_DETERMINISTIC_ERROR { - failureReason = "NonDeterminismError" - } - - incrementWorkflowTaskFailureCounter(metricsHandler, failureReason) wtp.logger.Warn("Failed to process workflow task.", tagWorkflowType, task.WorkflowType.GetName(), tagWorkflowID, task.WorkflowExecution.GetWorkflowId(), tagRunID, task.WorkflowExecution.GetRunId(), tagAttempt, task.Attempt, tagError, taskErr) + failWorkflowTask := wtp.errorToFailWorkflowTask(task.TaskToken, taskErr) + failureReason := "WorkflowError" + if failWorkflowTask.Cause == enumspb.WORKFLOW_TASK_FAILED_CAUSE_NON_DETERMINISTIC_ERROR { + failureReason = "NonDeterminismError" + } + incrementWorkflowTaskFailureCounter(metricsHandler, failureReason) + completedRequest = failWorkflowTask } metricsHandler.Timer(metrics.WorkflowTaskExecutionLatency).Record(time.Since(startTime)) @@ -707,7 +705,7 @@ func (lath *localActivityTaskHandler) executeLocalActivityTask(task *localActivi metricsHandler.Counter(metrics.LocalActivityErrorCounter).Inc(1) err = newPanicError(p, st) } - if err != nil && !IsBenignApplicationError(err) { + if err != nil && !isBenignApplicationError(err) { metricsHandler.Counter(metrics.LocalActivityFailedCounter).Inc(1) metricsHandler.Counter(metrics.LocalActivityExecutionFailedCounter).Inc(1) } diff --git a/temporal/error.go b/temporal/error.go index 736b47701..306a5bb57 100644 --- a/temporal/error.go +++ b/temporal/error.go @@ -273,3 +273,14 @@ func NewTimeoutError(timeoutType enumspb.TimeoutType, lastErr error, details ... func NewHeartbeatTimeoutError(details ...interface{}) error { return internal.NewHeartbeatTimeoutError(details...) } + +// ApplicationErrorCategory sets the category of the error. The category of the error +// maps to logging/metrics behaviours. +type ApplicationErrorCategory = internal.ApplicationErrorCategory + +const ( + // ApplicationErrorCategoryUnspecified represents an error with an unspecified category. + ApplicationErrorCategoryUnspecified = internal.ApplicationErrorCategoryUnspecified + // ApplicationErrorCategoryBenign indicates an error that is expected under normal operation and should not trigger alerts. + ApplicationErrorCategoryBenign = internal.ApplicationErrorCategoryBenign +) diff --git a/test/integration_test.go b/test/integration_test.go index 81cd65df2..7483ce53f 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -7533,7 +7533,7 @@ func (ts *IntegrationTestSuite) TestWorkflowTaskFailureMetric_BenignHandling() { "Benign failure", "", temporal.ApplicationErrorOptions{ - Category: internal.ErrorCategoryBenign, + Category: temporal.ApplicationErrorCategoryBenign, }, ) } @@ -7554,7 +7554,7 @@ func (ts *IntegrationTestSuite) TestWorkflowTaskFailureMetric_BenignHandling() { ts.Error(err) var appErr *temporal.ApplicationError ts.True(errors.As(err, &appErr)) - ts.False(internal.IsBenignApplicationError(err)) + ts.False(appErr.Category() == temporal.ApplicationErrorCategoryBenign) ts.Equal("Non-benign failure", appErr.Error()) // Expect initial count to have incremented because the workflow failed with non-benign err. @@ -7573,7 +7573,7 @@ func (ts *IntegrationTestSuite) TestWorkflowTaskFailureMetric_BenignHandling() { // Expect a benign application error. ts.Error(err) ts.True(errors.As(err, &appErr)) - ts.True(internal.IsBenignApplicationError(err)) + ts.True(appErr.Category() == temporal.ApplicationErrorCategoryBenign) // Expect count to not have incremented because the workflow failed with benign err. ts.assertMetricCount(metrics.WorkflowFailedCounter, currCount) } @@ -7582,7 +7582,7 @@ func (ts *IntegrationTestSuite) TestActivityFailureMetric_BenignHandling() { actWithAppErr := func(ctx context.Context, isBenign bool) error { if isBenign { return temporal.NewApplicationErrorWithOptions("Benign act failure", "", - temporal.ApplicationErrorOptions{Category: internal.ErrorCategoryBenign}) + temporal.ApplicationErrorOptions{Category: temporal.ApplicationErrorCategoryBenign}) } return temporal.NewApplicationError("Non-benign act failure", "", false, nil) } @@ -7630,7 +7630,7 @@ func (ts *IntegrationTestSuite) TestActivityFailureMetric_BenignHandling() { ts.Error(err) ts.True(errors.As(err, &appErr)) // Expect non-benign error - ts.False(internal.IsBenignApplicationError(err)) + ts.False(appErr.Category() == temporal.ApplicationErrorCategoryBenign) // Expect warn log for activity failure ts.True(slices.ContainsFunc(logger.Lines(), func(line string) bool { return strings.Contains(line, "ERROR") && strings.Contains(line, "Activity error.") @@ -7651,7 +7651,7 @@ func (ts *IntegrationTestSuite) TestActivityFailureMetric_BenignHandling() { ts.Error(err) ts.True(errors.As(err, &appErr)) // Expect benign error - ts.True(internal.IsBenignApplicationError(err)) + ts.True(appErr.Category() == temporal.ApplicationErrorCategoryBenign) // Expect debug log for activity failure ts.True(slices.ContainsFunc(logger.Lines(), func(line string) bool { return strings.Contains(line, "DEBUG") && strings.Contains(line, "Activity error.") @@ -7665,7 +7665,7 @@ func (ts *IntegrationTestSuite) TestLocalActivityFailureMetric_BenignHandling() localActWithAppErr := func(ctx context.Context, isBenign bool) error { if isBenign { return temporal.NewApplicationErrorWithOptions("Benign local act failure", "", - temporal.ApplicationErrorOptions{Category: internal.ErrorCategoryBenign}) + temporal.ApplicationErrorOptions{Category: temporal.ApplicationErrorCategoryBenign}) } return temporal.NewApplicationError("Non-benign local act failure", "", false, nil) } @@ -7696,7 +7696,7 @@ func (ts *IntegrationTestSuite) TestLocalActivityFailureMetric_BenignHandling() ts.Error(err) ts.True(errors.As(err, &appErr)) // Expect non-benign error - ts.False(internal.IsBenignApplicationError(err)) + ts.False(appErr.Category() == temporal.ApplicationErrorCategoryBenign) // Expect initial count to have incremented because the activity failed with non-benign err. currCount++ @@ -7713,7 +7713,7 @@ func (ts *IntegrationTestSuite) TestLocalActivityFailureMetric_BenignHandling() ts.Error(err) ts.True(errors.As(err, &appErr)) // Expect benign error - ts.True(internal.IsBenignApplicationError(err)) + ts.True(appErr.Category() == temporal.ApplicationErrorCategoryBenign) // Expect count to remain unchanged ts.assertMetricCount(metrics.LocalActivityExecutionFailedCounter, currCount) From b9bdbf881f115e9643f4a50269a48b5e5b383310 Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Fri, 25 Apr 2025 09:20:06 -0700 Subject: [PATCH 6/7] only check top-level error --- internal/error.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/error.go b/internal/error.go index beef48522..953dfc291 100644 --- a/internal/error.go +++ b/internal/error.go @@ -1057,8 +1057,8 @@ func getErrType(err error) string { } func isBenignApplicationError(err error) bool { - var appError *ApplicationError - return errors.As(err, &appError) && appError.Category() == ApplicationErrorCategoryBenign + appError, _ := err.(*ApplicationError) + return appError != nil && appError.Category() == ApplicationErrorCategoryBenign } func isBenignProtoApplicationFailure(failure *failurepb.Failure) bool { From 4b3e520405f554376a85ae71811a8675896e946d Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Fri, 25 Apr 2025 12:25:28 -0700 Subject: [PATCH 7/7] update comment --- temporal/error.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/temporal/error.go b/temporal/error.go index 306a5bb57..2ef1e3e80 100644 --- a/temporal/error.go +++ b/temporal/error.go @@ -275,7 +275,7 @@ func NewHeartbeatTimeoutError(details ...interface{}) error { } // ApplicationErrorCategory sets the category of the error. The category of the error -// maps to logging/metrics behaviours. +// maps to logging/metrics SDK behaviours, does not impact server-side logging/metrics. type ApplicationErrorCategory = internal.ApplicationErrorCategory const (