Skip to content

Commit fc61482

Browse files
Add metric to measure rule group load time (#5609)
Signed-off-by: Anand Rajagopal <[email protected]>
1 parent d321d46 commit fc61482

File tree

2 files changed

+40
-6
lines changed

2 files changed

+40
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# Changelog
22

33
## master / unreleased
4+
* [CHANGE] Ruler: Add `cortex_ruler_rule_group_load_duration_seconds` and `cortex_ruler_rule_group_sync_duration_seconds` metrics. #5609
45
* [CHANGE] Ruler: Add contextual info and query statistics to log
56
* [FEATURE] Ruler: Add support for disabling rule groups. #5521
67
* [FEATURE] Added the flag `-alertmanager.alerts-gc-interval` to configure alert manager alerts Garbage collection interval. #5550

pkg/ruler/ruler.go

Lines changed: 39 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -254,8 +254,10 @@ type Ruler struct {
254254
// Pool of clients used to connect to other ruler replicas.
255255
clientsPool ClientsPool
256256

257-
ringCheckErrors prometheus.Counter
258-
rulerSync *prometheus.CounterVec
257+
ringCheckErrors prometheus.Counter
258+
rulerSync *prometheus.CounterVec
259+
ruleGroupStoreLoadDuration prometheus.Gauge
260+
ruleGroupSyncDuration prometheus.Gauge
259261

260262
allowedTenants *util.AllowedTenants
261263

@@ -288,6 +290,16 @@ func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer,
288290
Name: "cortex_ruler_sync_rules_total",
289291
Help: "Total number of times the ruler sync operation triggered.",
290292
}, []string{"reason"}),
293+
294+
ruleGroupStoreLoadDuration: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
295+
Name: "cortex_ruler_rule_group_load_duration_seconds",
296+
Help: "Time taken to load rule groups from storage",
297+
}),
298+
299+
ruleGroupSyncDuration: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
300+
Name: "cortex_ruler_rule_group_sync_duration_seconds",
301+
Help: "The duration in seconds required to sync and load rule groups from storage.",
302+
}),
291303
}
292304

293305
if len(cfg.EnabledTenants) > 0 {
@@ -512,20 +524,41 @@ func (r *Ruler) run(ctx context.Context) error {
512524
func (r *Ruler) syncRules(ctx context.Context, reason string) {
513525
level.Debug(r.logger).Log("msg", "syncing rules", "reason", reason)
514526
r.rulerSync.WithLabelValues(reason).Inc()
527+
timer := prometheus.NewTimer(nil)
528+
529+
defer func() {
530+
ruleGroupSyncDuration := timer.ObserveDuration().Seconds()
531+
r.ruleGroupSyncDuration.Set(ruleGroupSyncDuration)
532+
}()
533+
534+
loadedConfigs, err := r.loadRuleGroups(ctx)
535+
if err != nil {
536+
return
537+
}
538+
539+
// This will also delete local group files for users that are no longer in 'configs' map.
540+
r.manager.SyncRuleGroups(ctx, loadedConfigs)
541+
}
542+
543+
func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
544+
timer := prometheus.NewTimer(nil)
545+
546+
defer func() {
547+
storeLoadSeconds := timer.ObserveDuration().Seconds()
548+
r.ruleGroupStoreLoadDuration.Set(storeLoadSeconds)
549+
}()
515550

516551
configs, err := r.listRules(ctx)
517552
if err != nil {
518553
level.Error(r.logger).Log("msg", "unable to list rules", "err", err)
519-
return
554+
return nil, err
520555
}
521556

522557
loadedConfigs, err := r.store.LoadRuleGroups(ctx, configs)
523558
if err != nil {
524559
level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(configs)-len(loadedConfigs), "err", err)
525560
}
526-
527-
// This will also delete local group files for users that are no longer in 'configs' map.
528-
r.manager.SyncRuleGroups(ctx, loadedConfigs)
561+
return loadedConfigs, nil
529562
}
530563

531564
func (r *Ruler) listRules(ctx context.Context) (result map[string]rulespb.RuleGroupList, err error) {

0 commit comments

Comments
 (0)