Skip to content

Commit dbb3cca

Browse files
committed
Add query metrics to ruler query stats
Signed-off-by: SungJin1212 <[email protected]>
1 parent 90ad777 commit dbb3cca

File tree

7 files changed

+70
-23
lines changed

7 files changed

+70
-23
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
* [ENHANCEMENT] Compactor: Introduce cleaner visit marker. #6113
5454
* [ENHANCEMENT] Query Frontend: Add cortex_query_samples_total metric. #6142
5555
* [ENHANCEMENT] Ingester: Implement metadata API limit. #6128
56+
* [ENHANCEMENT] Ruler: Add query statistics metrics when `-ruler.query-stats-enabled=true`. #6173
5657
* [BUGFIX] Configsdb: Fix endline issue in db password. #5920
5758
* [BUGFIX] Ingester: Fix `user` and `type` labels for the `cortex_ingester_tsdb_head_samples_appended_total` TSDB metric. #5952
5859
* [BUGFIX] Querier: Enforce max query length check for `/api/v1/series` API even though `ignoreMaxQueryLength` is set to true. #6018

docs/configuration/config-file-reference.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -4366,8 +4366,8 @@ ring:
43664366
# CLI flag: -ruler.disabled-tenants
43674367
[disabled_tenants: <string> | default = ""]
43684368
4369-
# Report the wall time for ruler queries to complete as a per user metric and as
4370-
# an info level log message.
4369+
# Report query statistics for ruler queries to complete as a per user metric and
4370+
# as an info level log message.
43714371
# CLI flag: -ruler.query-stats-enabled
43724372
[query_stats_enabled: <boolean> | default = false]
43734373

pkg/ruler/compat.go

+18-12
Original file line numberDiff line numberDiff line change
@@ -229,19 +229,19 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun
229229
}
230230
}
231231

232-
func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc {
233-
if queryTime == nil {
234-
return qf
235-
}
236-
232+
func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime, querySeries, querySample, queryChunkBytes, queryDataBytes prometheus.Counter, logger log.Logger) rules.QueryFunc {
237233
return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
238234
queryStats, ctx := stats.ContextWithEmptyStats(ctx)
239235
// Time the query so its wall-clock duration can be added to the per-user counter.
240236
timer := prometheus.NewTimer(nil)
237+
241238
defer func() {
242239
querySeconds := timer.ObserveDuration().Seconds()
243240
queryTime.Add(querySeconds)
244-
241+
querySeries.Add(float64(queryStats.FetchedSeriesCount))
242+
querySample.Add(float64(queryStats.FetchedSamplesCount))
243+
queryChunkBytes.Add(float64(queryStats.FetchedChunkBytes))
244+
queryDataBytes.Add(float64(queryStats.FetchedDataBytes))
245245
// Log ruler query stats.
246246
logMessage := []interface{}{
247247
"msg", "query stats",
@@ -303,23 +303,29 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
303303
q = querier.NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors)
304304

305305
return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
306-
var queryTime prometheus.Counter
307-
if evalMetrics.RulerQuerySeconds != nil {
308-
queryTime = evalMetrics.RulerQuerySeconds.WithLabelValues(userID)
309-
}
310-
311306
failedQueries := evalMetrics.FailedQueriesVec.WithLabelValues(userID)
312307
totalQueries := evalMetrics.TotalQueriesVec.WithLabelValues(userID)
313308
totalWrites := evalMetrics.TotalWritesVec.WithLabelValues(userID)
314309
failedWrites := evalMetrics.FailedWritesVec.WithLabelValues(userID)
315310

311+
var queryFunc rules.QueryFunc
316312
engineQueryFunc := EngineQueryFunc(engine, q, overrides, userID, cfg.LookbackDelta)
317313
metricsQueryFunc := MetricsQueryFunc(engineQueryFunc, totalQueries, failedQueries)
314+
if cfg.EnableQueryStats {
315+
queryTime := evalMetrics.RulerQuerySeconds.WithLabelValues(userID)
316+
querySeries := evalMetrics.RulerQuerySeries.WithLabelValues(userID)
317+
querySample := evalMetrics.RulerQuerySamples.WithLabelValues(userID)
318+
queryChunkBytes := evalMetrics.RulerQueryChunkBytes.WithLabelValues(userID)
319+
queryDataBytes := evalMetrics.RulerQueryDataBytes.WithLabelValues(userID)
320+
queryFunc = RecordAndReportRuleQueryMetrics(metricsQueryFunc, queryTime, querySeries, querySample, queryChunkBytes, queryDataBytes, logger)
321+
} else {
322+
queryFunc = metricsQueryFunc
323+
}
318324

319325
return rules.NewManager(&rules.ManagerOptions{
320326
Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
321327
Queryable: q,
322-
QueryFunc: RecordAndReportRuleQueryMetrics(metricsQueryFunc, queryTime, logger),
328+
QueryFunc: queryFunc,
323329
Context: user.InjectOrgID(ctx, userID),
324330
ExternalURL: cfg.ExternalURL.URL,
325331
NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()),

pkg/ruler/compat_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,7 @@ func TestRecordAndReportRuleQueryMetrics(t *testing.T) {
398398
time.Sleep(1 * time.Second)
399399
return promql.Vector{}, nil
400400
}
401-
qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger())
401+
qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), queryTime.WithLabelValues("series"), queryTime.WithLabelValues("samples"), queryTime.WithLabelValues("chunkBytes"), queryTime.WithLabelValues("dataBytes"), log.NewNopLogger()) // stat counters must be non-nil: the deferred recorder calls Add on every one of them
402402
_, _ = qf(context.Background(), "test", time.Now())
403403

404404
require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1))

pkg/ruler/manager_metrics.go

+37-5
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,15 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
225225
}
226226

227227
type RuleEvalMetrics struct {
228-
TotalWritesVec *prometheus.CounterVec
229-
FailedWritesVec *prometheus.CounterVec
230-
TotalQueriesVec *prometheus.CounterVec
231-
FailedQueriesVec *prometheus.CounterVec
232-
RulerQuerySeconds *prometheus.CounterVec
228+
TotalWritesVec *prometheus.CounterVec
229+
FailedWritesVec *prometheus.CounterVec
230+
TotalQueriesVec *prometheus.CounterVec
231+
FailedQueriesVec *prometheus.CounterVec
232+
RulerQuerySeconds *prometheus.CounterVec
233+
RulerQuerySeries *prometheus.CounterVec
234+
RulerQuerySamples *prometheus.CounterVec
235+
RulerQueryChunkBytes *prometheus.CounterVec
236+
RulerQueryDataBytes *prometheus.CounterVec
233237
}
234238

235239
func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics {
@@ -256,6 +260,22 @@ func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics
256260
Name: "cortex_ruler_query_seconds_total",
257261
Help: "Total amount of wall clock time spent processing queries by the ruler.",
258262
}, []string{"user"})
263+
m.RulerQuerySeries = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
264+
Name: "cortex_ruler_fetched_series_total",
265+
Help: "Number of series fetched to execute a query by the ruler.",
266+
}, []string{"user"})
267+
m.RulerQuerySamples = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
268+
Name: "cortex_ruler_samples_total",
269+
Help: "Number of samples fetched to execute a query by the ruler.",
270+
}, []string{"user"})
271+
m.RulerQueryChunkBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
272+
Name: "cortex_ruler_fetched_chunks_bytes_total",
273+
Help: "Size of all chunks fetched to execute a query in bytes by the ruler.",
274+
}, []string{"user"})
275+
m.RulerQueryDataBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
276+
Name: "cortex_ruler_fetched_data_bytes_total",
277+
Help: "Size of all data fetched to execute a query in bytes by the ruler.",
278+
}, []string{"user"})
259279
}
260280

261281
return m
@@ -270,4 +290,16 @@ func (m *RuleEvalMetrics) deletePerUserMetrics(userID string) {
270290
if m.RulerQuerySeconds != nil {
271291
m.RulerQuerySeconds.DeleteLabelValues(userID)
272292
}
293+
if m.RulerQuerySeries != nil {
294+
m.RulerQuerySeries.DeleteLabelValues(userID)
295+
}
296+
if m.RulerQuerySamples != nil {
297+
m.RulerQuerySamples.DeleteLabelValues(userID)
298+
}
299+
if m.RulerQueryChunkBytes != nil {
300+
m.RulerQueryChunkBytes.DeleteLabelValues(userID)
301+
}
302+
if m.RulerQueryDataBytes != nil {
303+
m.RulerQueryDataBytes.DeleteLabelValues(userID)
304+
}
273305
}

pkg/ruler/manager_metrics_test.go

+10-2
Original file line numberDiff line numberDiff line change
@@ -574,8 +574,16 @@ func TestRuleEvalMetricsDeletePerUserMetrics(t *testing.T) {
574574
m.FailedQueriesVec.WithLabelValues("fake2").Add(10)
575575
m.RulerQuerySeconds.WithLabelValues("fake1").Add(10)
576576
m.RulerQuerySeconds.WithLabelValues("fake2").Add(10)
577-
578-
metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total"}
577+
m.RulerQuerySeries.WithLabelValues("fake1").Add(10)
578+
m.RulerQuerySeries.WithLabelValues("fake2").Add(10)
579+
m.RulerQuerySamples.WithLabelValues("fake1").Add(10)
580+
m.RulerQuerySamples.WithLabelValues("fake2").Add(10)
581+
m.RulerQueryChunkBytes.WithLabelValues("fake1").Add(10)
582+
m.RulerQueryChunkBytes.WithLabelValues("fake2").Add(10)
583+
m.RulerQueryDataBytes.WithLabelValues("fake1").Add(10)
584+
m.RulerQueryDataBytes.WithLabelValues("fake2").Add(10)
585+
586+
metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total", "cortex_ruler_fetched_series_total", "cortex_ruler_samples_total", "cortex_ruler_fetched_chunks_bytes_total", "cortex_ruler_fetched_data_bytes_total"}
579587
gm, err := reg.Gather()
580588
require.NoError(t, err)
581589
mfm, err := util.NewMetricFamilyMap(gm)

pkg/ruler/ruler.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
217217
f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.")
218218
f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.")
219219

220-
f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.")
220+
f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report query statistics for ruler queries to complete as a per user metric and as an info level log message.")
221221
f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics")
222222

223223
cfg.RingCheckPeriod = 5 * time.Second

0 commit comments

Comments
 (0)