Skip to content

Commit a37e449

Browse files
committed
Allow rules to be loaded to rulers as backup for List rules API HA
Signed-off-by: Emmanuel Lodovice <[email protected]>
1 parent 9704cc3 commit a37e449

File tree

11 files changed

+1124
-124
lines changed

11 files changed

+1124
-124
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* [FEATURE] Ruler: Add `ruler.concurrent-evals-enabled` flag to enable concurrent evaluation within a single rule group for independent rules. Maximum concurrency can be configured via `ruler.max-concurrent-evals`. #5766
1515
* [FEATURE] Distributor Queryable: Experimental: Add config `zone_results_quorum_metadata`. When querying ingesters using metadata APIs such as label names and values, only results from quorum number of zones will be included and merged. #5779
1616
* [FEATURE] Storage Cache Clients: Add config `set_async_circuit_breaker_config` to utilize the circuit breaker pattern for dynamically thresholding asynchronous set operations. Implemented in both memcached and redis cache clients. #5789
17+
* [FEATURE] Ruler: Add `ruler.api-deduplicate-rules` flag to remove duplicate rule groups from the Prometheus compatible rules API endpoint. Add `ruler.ring.replication-factor` and `ruler.ring.zone-awareness-enabled` flags to configure rule group replication, but only the first ruler in the replicaset evaluates the rule group, the rest will just hold a copy as backup. Add `ruler.api-enable-rules-backup` flag to configure rulers to send the rule group backups stored in the replicaset to handle events when a ruler is down during an API request to list rules. #5782
1718
* [ENHANCEMENT] Store Gateway: Added `-store-gateway.enabled-tenants` and `-store-gateway.disabled-tenants` to explicitly enable or disable store-gateway for specific tenants. #5638
1819
* [ENHANCEMENT] Compactor: Add new compactor metric `cortex_compactor_start_duration_seconds`. #5683
1920
* [ENHANCEMENT] Upgraded Docker base images to `alpine:3.18`. #5684

docs/configuration/config-file-reference.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4254,6 +4254,21 @@ ring:
42544254
# CLI flag: -experimental.ruler.enable-api
42554255
[enable_api: <boolean> | default = false]
42564256
4257+
# Enable rulers to store a copy of rules owned by other rulers with default
4258+
# state (state before any evaluation) and send this copy in list API requests as
4259+
# backup in case the ruler who owns the rule fails to send its rules. This
4260+
# allows the rules API to handle ruler outage by returning rules with default
4261+
# state. Ring replication-factor needs to be set to 3 or more for this to be
4262+
# useful.
4263+
# CLI flag: -ruler.api-enable-rules-backup
4264+
[api_enable_rules_backup: <boolean> | default = false]
4265+
4266+
# Remove duplicate rules in the prometheus rules and alerts API response. If
4267+
# there are duplicate rules the rule with the latest evaluation timestamp will
4268+
# be kept.
4269+
# CLI flag: -ruler.api-deduplicate-rules
4270+
[api_deduplicate_rules: <boolean> | default = false]
4271+
42574272
# Comma separated list of tenants whose rules this ruler can evaluate. If
42584273
# specified, only these tenants will be handled by ruler, otherwise this ruler
42594274
# can process rules from all tenants. Subject to sharding.

pkg/ruler/api.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,9 @@ func (a *API) PrometheusRules(w http.ResponseWriter, req *http.Request) {
251251

252252
// keep data.groups in order: sorted by file, then by group name within a file
253253
sort.Slice(groups, func(i, j int) bool {
254+
if groups[i].File == groups[j].File {
255+
return groups[i].Name < groups[j].Name
256+
}
254257
return groups[i].File < groups[j].File
255258
})
256259

pkg/ruler/manager.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ type DefaultMultiTenantManager struct {
4444
notifiers map[string]*rulerNotifier
4545
notifiersDiscoveryMetrics map[string]discovery.DiscovererMetrics
4646

47+
// rules backup
48+
rulesBackupManager *rulesBackupManager
49+
4750
managersTotal prometheus.Gauge
4851
lastReloadSuccessful *prometheus.GaugeVec
4952
lastReloadSuccessfulTimestamp *prometheus.GaugeVec
@@ -85,6 +88,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
8588
mapper: newMapper(cfg.RulePath, logger),
8689
userManagers: map[string]RulesManager{},
8790
userManagerMetrics: userManagerMetrics,
91+
rulesBackupManager: newRulesBackupManager(cfg, logger, reg),
8892
managersTotal: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
8993
Namespace: "cortex",
9094
Name: "ruler_managers_total",
@@ -142,8 +146,12 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
142146
r.managersTotal.Set(float64(len(r.userManagers)))
143147
}
144148

149+
// BackUpRuleGroups forwards the given rule groups to the rules backup manager.
// It only delegates; it does not touch the per-user rule managers itself.
// NOTE(review): ruleGroups appears to be keyed by user ID (see TestBackupRules) — confirm against callers.
func (r *DefaultMultiTenantManager) BackUpRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) {
150+
r.rulesBackupManager.backUpRuleGroups(ctx, ruleGroups)
151+
}
152+
145153
// syncRulesToManager maps the rule files to disk, detects any changes and will create/update the
146-
// the users Prometheus Rules Manager.
154+
// users Prometheus Rules Manager.
147155
func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user string, groups rulespb.RuleGroupList) {
148156
// Map the files to disk and return the file names to be passed to the users manager if they
149157
// have been updated
@@ -279,6 +287,10 @@ func (r *DefaultMultiTenantManager) GetRules(userID string) []*promRules.Group {
279287
return groups
280288
}
281289

290+
// GetBackupRules returns the rule groups the rules backup manager currently holds for userID.
// The result is empty for a user that has no backed-up groups (see TestBackupRules).
func (r *DefaultMultiTenantManager) GetBackupRules(userID string) []*promRules.Group {
291+
return r.rulesBackupManager.getRuleGroups(userID)
292+
}
293+
282294
func (r *DefaultMultiTenantManager) Stop() {
283295
r.notifiersMtx.Lock()
284296
for _, n := range r.notifiers {

pkg/ruler/manager_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,55 @@ func TestSyncRuleGroupsCleanUpPerUserMetrics(t *testing.T) {
138138
require.NotContains(t, mfm["cortex_ruler_config_last_reload_successful"].String(), "value:\""+user+"\"")
139139
}
140140

141+
func TestBackupRules(t *testing.T) {
142+
dir := t.TempDir()
143+
reg := prometheus.NewPedanticRegistry()
144+
evalMetrics := NewRuleEvalMetrics(Config{RulePath: dir, EnableQueryStats: true}, reg)
145+
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, factory, evalMetrics, reg, log.NewNopLogger())
146+
require.NoError(t, err)
147+
148+
const user1 = "testUser"
149+
const user2 = "testUser2"
150+
151+
require.Equal(t, 0, len(m.GetBackupRules(user1)))
152+
require.Equal(t, 0, len(m.GetBackupRules(user2)))
153+
154+
userRules := map[string]rulespb.RuleGroupList{
155+
user1: {
156+
&rulespb.RuleGroupDesc{
157+
Name: "group1",
158+
Namespace: "ns",
159+
Interval: 1 * time.Minute,
160+
User: user1,
161+
},
162+
},
163+
user2: {
164+
&rulespb.RuleGroupDesc{
165+
Name: "group2",
166+
Namespace: "ns",
167+
Interval: 1 * time.Minute,
168+
User: user1,
169+
},
170+
},
171+
}
172+
m.BackUpRuleGroups(context.TODO(), userRules)
173+
managerOptions := &promRules.ManagerOptions{}
174+
g1 := promRules.NewGroup(promRules.GroupOptions{
175+
Name: userRules[user1][0].Name,
176+
File: userRules[user1][0].Namespace,
177+
Interval: userRules[user1][0].Interval,
178+
Opts: managerOptions,
179+
})
180+
g2 := promRules.NewGroup(promRules.GroupOptions{
181+
Name: userRules[user2][0].Name,
182+
File: userRules[user2][0].Namespace,
183+
Interval: userRules[user2][0].Interval,
184+
Opts: managerOptions,
185+
})
186+
requireGroupsEqual(t, m.GetBackupRules(user1), []*promRules.Group{g1})
187+
requireGroupsEqual(t, m.GetBackupRules(user2), []*promRules.Group{g2})
188+
}
189+
141190
func getManager(m *DefaultMultiTenantManager, user string) RulesManager {
142191
m.userManagerMtx.Lock()
143192
defer m.userManagerMtx.Unlock()

pkg/ruler/merger.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package ruler
2+
3+
import (
4+
"time"
5+
6+
promRules "github.com/prometheus/prometheus/rules"
7+
)
8+
9+
// mergeGroupStateDesc removes duplicates from the provided []*GroupStateDesc by keeping the GroupStateDesc with the
10+
// latest information. It uses the EvaluationTimestamp of the GroupStateDesc and the EvaluationTimestamp of the
11+
// ActiveRules in a GroupStateDesc to determine the which GroupStateDesc has the latest information.
12+
func mergeGroupStateDesc(in []*GroupStateDesc) []*GroupStateDesc {
13+
states := make(map[string]*GroupStateDesc)
14+
rgTime := make(map[string]time.Time)
15+
for _, state := range in {
16+
latestTs := state.EvaluationTimestamp
17+
for _, r := range state.ActiveRules {
18+
if latestTs.Before(r.EvaluationTimestamp) {
19+
latestTs = r.EvaluationTimestamp
20+
}
21+
}
22+
key := promRules.GroupKey(state.Group.Namespace, state.Group.Name)
23+
ts, ok := rgTime[key]
24+
if !ok || ts.Before(latestTs) {
25+
states[key] = state
26+
rgTime[key] = latestTs
27+
}
28+
}
29+
groups := make([]*GroupStateDesc, 0, len(states))
30+
for _, state := range states {
31+
groups = append(groups, state)
32+
}
33+
return groups
34+
}

pkg/ruler/merger_test.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
package ruler
2+
3+
import (
4+
"reflect"
5+
"slices"
6+
"strings"
7+
"testing"
8+
"time"
9+
10+
"github.com/stretchr/testify/require"
11+
12+
"github.com/cortexproject/cortex/pkg/ruler/rulespb"
13+
)
14+
15+
func TestMergeGroupStateDesc(t *testing.T) {
16+
curTime := time.Now()
17+
r := rulespb.RuleDesc{
18+
Expr: "1 > 1",
19+
}
20+
g1 := rulespb.RuleGroupDesc{
21+
Name: "g1",
22+
Namespace: "ns1",
23+
}
24+
g2 := rulespb.RuleGroupDesc{
25+
Name: "g2",
26+
Namespace: "ns1",
27+
}
28+
rs1 := RuleStateDesc{
29+
Rule: &r,
30+
EvaluationTimestamp: curTime,
31+
}
32+
rs1NotRun := RuleStateDesc{
33+
Rule: &r,
34+
}
35+
rs2 := RuleStateDesc{
36+
Rule: &r,
37+
EvaluationTimestamp: curTime,
38+
}
39+
rs2NotRun := RuleStateDesc{
40+
Rule: &r,
41+
}
42+
rs3 := RuleStateDesc{
43+
Rule: &r,
44+
EvaluationTimestamp: curTime.Add(10 * time.Second),
45+
}
46+
47+
gs1 := GroupStateDesc{
48+
Group: &g1,
49+
ActiveRules: []*RuleStateDesc{&rs1, &rs2},
50+
EvaluationTimestamp: curTime,
51+
}
52+
gs1NotRun := GroupStateDesc{
53+
Group: &g1,
54+
ActiveRules: []*RuleStateDesc{&rs1NotRun, &rs2NotRun},
55+
}
56+
gs2 := GroupStateDesc{
57+
Group: &g2,
58+
ActiveRules: []*RuleStateDesc{&rs1, &rs2},
59+
EvaluationTimestamp: curTime,
60+
}
61+
gs2NotRun := GroupStateDesc{
62+
Group: &g2,
63+
ActiveRules: []*RuleStateDesc{&rs1NotRun, &rs2NotRun},
64+
}
65+
gs3 := GroupStateDesc{
66+
Group: &g2,
67+
ActiveRules: []*RuleStateDesc{&rs1, &rs3},
68+
EvaluationTimestamp: curTime,
69+
}
70+
71+
type testCase struct {
72+
input []*GroupStateDesc
73+
expectedOutput []*GroupStateDesc
74+
}
75+
76+
testCases := map[string]testCase{
77+
"No duplicate": {
78+
input: []*GroupStateDesc{&gs1, &gs2},
79+
expectedOutput: []*GroupStateDesc{&gs1, &gs2},
80+
},
81+
"No duplicate but not evaluated": {
82+
input: []*GroupStateDesc{&gs1NotRun, &gs2NotRun},
83+
expectedOutput: []*GroupStateDesc{&gs1NotRun, &gs2NotRun},
84+
},
85+
"With exact duplicate": {
86+
input: []*GroupStateDesc{&gs1, &gs2NotRun, &gs1, &gs2NotRun},
87+
expectedOutput: []*GroupStateDesc{&gs1, &gs2NotRun},
88+
},
89+
"With duplicates that are not evaluated": {
90+
input: []*GroupStateDesc{&gs1, &gs2, &gs1NotRun, &gs2NotRun},
91+
expectedOutput: []*GroupStateDesc{&gs1, &gs2},
92+
},
93+
"With duplicate with a new newer rule evaluation": {
94+
input: []*GroupStateDesc{&gs3, &gs1, &gs2, &gs1NotRun},
95+
expectedOutput: []*GroupStateDesc{&gs1, &gs3},
96+
},
97+
}
98+
99+
for name, tc := range testCases {
100+
t.Run(name, func(t *testing.T) {
101+
out := mergeGroupStateDesc(tc.input)
102+
slices.SortFunc(out, func(a, b *GroupStateDesc) int {
103+
fileCompare := strings.Compare(a.Group.Namespace, b.Group.Namespace)
104+
if fileCompare != 0 {
105+
return fileCompare
106+
}
107+
return strings.Compare(a.Group.Name, b.Group.Name)
108+
})
109+
require.Equal(t, len(tc.expectedOutput), len(out))
110+
require.True(t, reflect.DeepEqual(tc.expectedOutput, out))
111+
})
112+
}
113+
114+
}

0 commit comments

Comments
 (0)