Skip to content

Commit 7075adc

Browse files
disable rule groups
Signed-off-by: Anand Rajagopal <[email protected]>
1 parent 526a6d9 commit 7075adc

File tree

6 files changed

+80
-14
lines changed

6 files changed

+80
-14
lines changed

docs/configuration/config-file-reference.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3084,6 +3084,9 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
30843084
# alerts will fail with a log message and metric increment. 0 = no limit.
30853085
# CLI flag: -alertmanager.max-alerts-size-bytes
30863086
[alertmanager_max_alerts_size_bytes: <int> | default = 0]
3087+
3088+
# list of rule groups to disable
3089+
[disabled_rule_groups: <list of rule groups to disable> | default = ]
30873090
```
30883091
30893092
### `memberlist_config`

pkg/ruler/compat.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"errors"
66
"time"
77

8+
"github.com/cortexproject/cortex/pkg/util/validation"
9+
810
"github.com/go-kit/log"
911
"github.com/go-kit/log/level"
1012
"github.com/prometheus/client_golang/prometheus"
@@ -142,6 +144,7 @@ type RulesLimits interface {
142144
RulerTenantShardSize(userID string) int
143145
RulerMaxRuleGroupsPerTenant(userID string) int
144146
RulerMaxRulesPerRuleGroup(userID string) int
147+
DisabledRuleGroups(userID string) validation.DisabledRuleGroups
145148
}
146149

147150
// EngineQueryFunc returns a new engine query function by passing an altered timestamp.

pkg/ruler/ruler.go

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ const (
7171
recordingRuleFilter string = "record"
7272
)
7373

74+
// DisabledRuleGroupErr is the error returned when a rule group is skipped
// because it matches an entry in the tenant's disabled rule groups. Callers
// can detect it with a type switch or errors.As on *DisabledRuleGroupErr.
type DisabledRuleGroupErr struct {
	// Message is the human-readable description returned by Error.
	Message string
}

// Compile-time check that *DisabledRuleGroupErr satisfies the error interface.
var _ error = (*DisabledRuleGroupErr)(nil)

// Error implements the error interface and returns e.Message verbatim.
func (e *DisabledRuleGroupErr) Error() string {
	return e.Message
}
81+
7482
// Config is the configuration for the recording rules server.
7583
type Config struct {
7684
// This is used for template expansion in alerts; must be a valid URL.
@@ -415,9 +423,19 @@ func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 {
415423
return ringHasher.Sum32()
416424
}
417425

418-
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, instanceAddr string) (bool, error) {
426+
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, instanceAddr string) (bool, error) {
419427
hash := tokenForGroup(g)
420428

429+
for _, disabledGroup := range disabledRuleGroups {
430+
431+
if hash == tokenForGroup(&rulespb.RuleGroupDesc{
432+
Name: disabledGroup.Name,
433+
Namespace: disabledGroup.Namespace,
434+
User: disabledGroup.User,
435+
}) {
436+
return false, &DisabledRuleGroupErr{Message: fmt.Sprintf("skipping rule group %s, within namespace %s, owned by %s", g.Name, g.Namespace, g.User)}
437+
}
438+
}
421439
rlrs, err := r.Get(hash, RingOp, nil, nil, nil)
422440
if err != nil {
423441
return false, errors.Wrap(err, "error reading ring to verify rule group ownership")
@@ -544,7 +562,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
544562

545563
filteredConfigs := make(map[string]rulespb.RuleGroupList)
546564
for userID, groups := range configs {
547-
filtered := filterRuleGroups(userID, groups, r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
565+
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
548566
if len(filtered) > 0 {
549567
filteredConfigs[userID] = filtered
550568
}
@@ -602,7 +620,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
602620
return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID)
603621
}
604622

605-
filtered := filterRuleGroups(userID, groups, userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
623+
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
606624
if len(filtered) == 0 {
607625
continue
608626
}
@@ -624,15 +642,21 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
624642
//
625643
// Reason why this function is not a method on Ruler is to make sure we don't accidentally use r.ring,
626644
// but only ring passed as parameter.
627-
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc {
645+
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc {
628646
// Prune the rule group to only contain rules that this ruler is responsible for, based on ring.
629647
var result []*rulespb.RuleGroupDesc
630648
for _, g := range ruleGroups {
631-
owned, err := instanceOwnsRuleGroup(ring, g, instanceAddr)
649+
owned, err := instanceOwnsRuleGroup(ring, g, disabledRuleGroups, instanceAddr)
632650
if err != nil {
633-
ringCheckErrors.Inc()
634-
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err)
635-
continue
651+
switch e := err.(type) {
652+
case *DisabledRuleGroupErr:
653+
level.Info(log).Log("msg", e.Message)
654+
continue
655+
default:
656+
ringCheckErrors.Inc()
657+
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err)
658+
continue
659+
}
636660
}
637661

638662
if owned {

pkg/ruler/ruler_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ type ruleLimits struct {
8484
tenantShard int
8585
maxRulesPerRuleGroup int
8686
maxRuleGroups int
87+
disabledRuleGroups validation.DisabledRuleGroups
8788
}
8889

8990
func (r ruleLimits) EvaluationDelay(_ string) time.Duration {
@@ -102,6 +103,10 @@ func (r ruleLimits) RulerMaxRulesPerRuleGroup(_ string) int {
102103
return r.maxRulesPerRuleGroup
103104
}
104105

106+
func (r ruleLimits) DisabledRuleGroups(userID string) validation.DisabledRuleGroups {
107+
return r.disabledRuleGroups
108+
}
109+
105110
func newEmptyQueryable() storage.Queryable {
106111
return storage.QueryableFunc(func(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
107112
return emptyQuerier{}, nil

pkg/util/validation/limits.go

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ func (e LimitError) Error() string {
3838
return string(e)
3939
}
4040

41+
// DisabledRuleGroup identifies one rule group to disable by its namespace,
// its name and the user it belongs to. User is optional when the value is
// read from or written to configuration (it is omitted when empty).
//
// The json tags mirror the yaml tags so both encodings use the same
// lower-case keys.
type DisabledRuleGroup struct {
	Namespace string `yaml:"namespace" json:"namespace"`
	Name      string `yaml:"name" json:"name"`
	User      string `yaml:"user,omitempty" json:"user,omitempty"`
}

// DisabledRuleGroups is a list of rule groups to disable.
type DisabledRuleGroups []DisabledRuleGroup
48+
4149
// Limits describe all the limits for users; can be used to describe global default
4250
// limits via flags, or per-user limits via yaml config.
4351
type Limits struct {
@@ -122,12 +130,13 @@ type Limits struct {
122130
NotificationRateLimit float64 `yaml:"alertmanager_notification_rate_limit" json:"alertmanager_notification_rate_limit"`
123131
NotificationRateLimitPerIntegration NotificationRateLimitMap `yaml:"alertmanager_notification_rate_limit_per_integration" json:"alertmanager_notification_rate_limit_per_integration"`
124132

125-
AlertmanagerMaxConfigSizeBytes int `yaml:"alertmanager_max_config_size_bytes" json:"alertmanager_max_config_size_bytes"`
126-
AlertmanagerMaxTemplatesCount int `yaml:"alertmanager_max_templates_count" json:"alertmanager_max_templates_count"`
127-
AlertmanagerMaxTemplateSizeBytes int `yaml:"alertmanager_max_template_size_bytes" json:"alertmanager_max_template_size_bytes"`
128-
AlertmanagerMaxDispatcherAggregationGroups int `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"`
129-
AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"`
130-
AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"`
133+
AlertmanagerMaxConfigSizeBytes int `yaml:"alertmanager_max_config_size_bytes" json:"alertmanager_max_config_size_bytes"`
134+
AlertmanagerMaxTemplatesCount int `yaml:"alertmanager_max_templates_count" json:"alertmanager_max_templates_count"`
135+
AlertmanagerMaxTemplateSizeBytes int `yaml:"alertmanager_max_template_size_bytes" json:"alertmanager_max_template_size_bytes"`
136+
AlertmanagerMaxDispatcherAggregationGroups int `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"`
137+
AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"`
138+
AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"`
139+
DisabledRuleGroups DisabledRuleGroups `yaml:"disabled_rule_groups" json:"disabled_rule_groups" doc:"nocli|description=list of rule groups to disable"`
131140
}
132141

133142
// RegisterFlags adds the flags required to config this to the given FlagSet
@@ -667,6 +676,26 @@ func (o *Overrides) AlertmanagerMaxAlertsSizeBytes(userID string) int {
667676
return o.GetOverridesForUser(userID).AlertmanagerMaxAlertsSizeBytes
668677
}
669678

679+
func (o *Overrides) DisabledRuleGroups(userID string) DisabledRuleGroups {
680+
if o.tenantLimits != nil {
681+
l := o.tenantLimits.ByUserID(userID)
682+
if l != nil {
683+
disabledRuleGroupsForUser := make(DisabledRuleGroups, len(l.DisabledRuleGroups))
684+
685+
for i, disabledRuleGroup := range l.DisabledRuleGroups {
686+
disabledRuleGroupForUser := DisabledRuleGroup{
687+
Namespace: disabledRuleGroup.Namespace,
688+
Name: disabledRuleGroup.Name,
689+
User: userID,
690+
}
691+
disabledRuleGroupsForUser[i] = disabledRuleGroupForUser
692+
}
693+
return disabledRuleGroupsForUser
694+
}
695+
}
696+
return o.defaultLimits.DisabledRuleGroups
697+
}
698+
670699
// GetOverridesForUser returns the per-tenant limits with overrides.
671700
func (o *Overrides) GetOverridesForUser(userID string) *Limits {
672701
if o.tenantLimits != nil {

tools/doc-generator/parser.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ func getFieldType(t reflect.Type) (string, error) {
259259
return "relabel_config...", nil
260260
case "labels.Labels":
261261
return "map of string to string", nil
262+
case "validation.DisabledRuleGroups":
263+
return "list of rule groups to disable", nil
262264
}
263265

264266
// Fallback to auto-detection of built-in data types

0 commit comments

Comments
 (0)