Skip to content

Commit 77bb6d2

Browse files
committed
fix: randomise the initial grace period to avoid collisions
The previous algorithm used binary exponential backoff with a ±10% jitter to calculate the grace period. Because there can be multiple Lambda environments, we need to mitigate collisions: we cannot use 0 as the first delay, because functions that fail close to each other in time would all retry at the same moment and collide. The small jitter applied to the lower delays would then propagate the problem to subsequent attempts. This change adds an initial delay of n seconds to the first reconnection attempt, where n is randomly generated in a bounded interval that is wide enough to spread out collisions while keeping usability and user experience in mind.
1 parent 6d5b175 commit 77bb6d2

File tree

2 files changed

+8
-1
lines changed

2 files changed

+8
-1
lines changed

apm-lambda-extension/extension/apm_server_transport.go

+7
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,13 @@ func (transport *ApmServerTransport) SetApmServerTransportState(ctx context.Cont
232232

233233
// ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors
234234
func (transport *ApmServerTransport) computeGracePeriod() time.Duration {
235+
// If reconnectionCount is 0, return a random grace period in the interval [0, 5) seconds.
236+
// The grace period for the first reconnection count was 0 but that
237+
// leads to collisions across multiple environments.
238+
if transport.reconnectionCount == 0 {
239+
gracePeriod := rand.Float64() * 5
240+
return time.Duration(gracePeriod * float64(time.Second))
241+
}
235242
gracePeriodWithoutJitter := math.Pow(math.Min(float64(transport.reconnectionCount), 6), 2)
236243
jitter := rand.Float64()/5 - 0.1
237244
return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second))

apm-lambda-extension/extension/apm_server_transport_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ func TestGracePeriod(t *testing.T) {
125125

126126
transport.reconnectionCount = 0
127127
val0 := transport.computeGracePeriod().Seconds()
128-
assert.Equal(t, val0, float64(0))
128+
assert.LessOrEqual(t, val0, 5.0)
129129

130130
transport.reconnectionCount = 1
131131
val1 := transport.computeGracePeriod().Seconds()

0 commit comments

Comments
 (0)