From 514cff878f261f847e3b6f9c0b05e60316404f23 Mon Sep 17 00:00:00 2001 From: Daniel Deluiggi Date: Tue, 22 Jul 2025 18:07:18 -0700 Subject: [PATCH 1/3] New metric for ingester errors Signed-off-by: Daniel Deluiggi --- pkg/ingester/ingester.go | 11 ++++++----- pkg/ingester/ingester_test.go | 8 +++++++- pkg/ingester/instance_limits.go | 2 ++ pkg/ingester/metrics.go | 6 ++++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 44158954c48..dd2dc4f1666 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1167,6 +1167,11 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte span, ctx := opentracing.StartSpanFromContext(ctx, "Ingester.Push") defer span.Finish() + userID, err := tenant.TenantID(ctx) + if err != nil { + return nil, err + } + // We will report *this* request in the error too. inflight := i.inflightPushRequests.Inc() i.maxInflightPushRequests.Track(inflight) @@ -1175,6 +1180,7 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte gl := i.getInstanceLimits() if gl != nil && gl.MaxInflightPushRequests > 0 { if inflight > gl.MaxInflightPushRequests { + i.metrics.pushErrorsTotal.WithLabelValues(userID, pushErrTooManyInflightRequests).Inc() return nil, errTooManyInflightPushRequests } } @@ -1186,11 +1192,6 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte defer req.Free() defer cortexpb.ReuseSlice(req.Timeseries) - userID, err := tenant.TenantID(ctx) - if err != nil { - return nil, err - } - il := i.getInstanceLimits() if il != nil && il.MaxIngestionRate > 0 { if rate := i.ingestionRate.Rate(); rate >= il.MaxIngestionRate { diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 1010a6834a5..79bcde28301 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -6515,7 +6515,8 @@ func TestIngester_inflightPushRequests(t *testing.T) { cfg.InstanceLimitsFn = func() *InstanceLimits { return &limits } cfg.LifecyclerConfig.JoinAfter = 0 - i, err := prepareIngesterWithBlocksStorage(t, cfg, prometheus.NewRegistry()) + reg := prometheus.NewRegistry() + i, err := prepareIngesterWithBlocksStorage(t, cfg, reg) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck @@ -6553,6 +6554,11 @@ func TestIngester_inflightPushRequests(t *testing.T) { _, err := i.Push(ctx, req) require.Equal(t, errTooManyInflightPushRequests, err) + require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP cortex_ingester_push_errors_total The total number of push errors per user. + # TYPE cortex_ingester_push_errors_total counter + cortex_ingester_push_errors_total{reason="errTooManyInflightRequests",user="test"} 1 + `), "cortex_ingester_push_errors_total")) return nil }) diff --git a/pkg/ingester/instance_limits.go b/pkg/ingester/instance_limits.go index cc6a8d52b32..fa0713e2c46 100644 --- a/pkg/ingester/instance_limits.go +++ b/pkg/ingester/instance_limits.go @@ -13,6 +13,8 @@ var ( errMaxSeriesLimitReached = errors.New("cannot add series: ingesters's max series limit reached") errTooManyInflightPushRequests = errors.New("cannot push: too many inflight push requests in ingester") errTooManyInflightQueryRequests = errors.New("cannot push: too many inflight query requests in ingester") + + pushErrTooManyInflightRequests = "errTooManyInflightRequests" ) // InstanceLimits describes limits used by ingester. Reaching any of these will result in error response to the call. diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index e1b93a28718..fc05b9764bb 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -44,6 +44,7 @@ type ingesterMetrics struct { memMetadataCreatedTotal *prometheus.CounterVec memSeriesRemovedTotal *prometheus.CounterVec memMetadataRemovedTotal *prometheus.CounterVec + pushErrorsTotal *prometheus.CounterVec activeSeriesPerUser *prometheus.GaugeVec activeNHSeriesPerUser *prometheus.GaugeVec @@ -165,6 +166,10 @@ func newIngesterMetrics(r prometheus.Registerer, Name: "cortex_ingester_memory_metadata_removed_total", Help: "The total number of metadata that were removed per user.", }, []string{"user"}), + pushErrorsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ingester_push_errors_total", + Help: "The total number of push errors per user.", + }, []string{"user", "reason"}), maxUsersGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ Name: instanceLimits, @@ -295,6 +300,7 @@ func (m *ingesterMetrics) deletePerUserMetrics(userID string) { m.activeNHSeriesPerUser.DeleteLabelValues(userID) m.usagePerLabelSet.DeletePartialMatch(prometheus.Labels{"user": userID}) m.limitsPerLabelSet.DeletePartialMatch(prometheus.Labels{"user": userID}) + m.pushErrorsTotal.DeletePartialMatch(prometheus.Labels{"user": userID}) if m.memSeriesCreatedTotal != nil { m.memSeriesCreatedTotal.DeleteLabelValues(userID) From 0ad061c3838bc6902eaaca7d301a765206242e4e Mon Sep 17 00:00:00 2001 From: Daniel Deluiggi Date: Tue, 22 Jul 2025 18:13:04 -0700 Subject: [PATCH 2/3] changelog Signed-off-by: Daniel Deluiggi --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56f3e485ed9..dea90d07571 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,7 @@ * [ENHANCEMENT] Distributor: Add native histograms max sample size bytes limit validation. #6834 * [ENHANCEMENT] Querier: Support caching parquet labels file in parquet queryable. #6835 * [ENHANCEMENT] Querier: Support query limits in parquet queryable. #6870 +* [ENHANCEMENT] Ingester: Add new metric `cortex_ingester_push_errors_total` to track reasons for ingester request failures. #6901 * [BUGFIX] Ingester: Avoid error or early throttling when READONLY ingesters are present in the ring #6517 * [BUGFIX] Ingester: Fix labelset data race condition. #6573 * [BUGFIX] Compactor: Cleaner should not put deletion marker for blocks with no-compact marker. #6576 From 0bafbdf957db7323da3f057788dd25dc17002ef4 Mon Sep 17 00:00:00 2001 From: Daniel Deluiggi Date: Tue, 22 Jul 2025 19:38:18 -0700 Subject: [PATCH 3/3] change error reason Signed-off-by: Daniel Deluiggi --- pkg/ingester/ingester_test.go | 2 +- pkg/ingester/instance_limits.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 79bcde28301..c9948f9ec66 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -6557,7 +6557,7 @@ func TestIngester_inflightPushRequests(t *testing.T) { require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_ingester_push_errors_total The total number of push errors per user. # TYPE cortex_ingester_push_errors_total counter - cortex_ingester_push_errors_total{reason="errTooManyInflightRequests",user="test"} 1 + cortex_ingester_push_errors_total{reason="tooManyInflightRequests",user="test"} 1 `), "cortex_ingester_push_errors_total")) return nil }) diff --git a/pkg/ingester/instance_limits.go b/pkg/ingester/instance_limits.go index fa0713e2c46..cb48df3687e 100644 --- a/pkg/ingester/instance_limits.go +++ b/pkg/ingester/instance_limits.go @@ -14,7 +14,7 @@ var ( errTooManyInflightPushRequests = errors.New("cannot push: too many inflight push requests in ingester") errTooManyInflightQueryRequests = errors.New("cannot push: too many inflight query requests in ingester") - pushErrTooManyInflightRequests = "errTooManyInflightRequests" + pushErrTooManyInflightRequests = "tooManyInflightRequests" ) // InstanceLimits describes limits used by ingester. Reaching any of these will result in error response to the call.