Commit 927b0f8

Minimize missed rule group evaluations
Signed-off-by: Anand Rajagopal <[email protected]>
1 parent e5f47e1 commit 927b0f8

File tree

9 files changed, +1073 -77 lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions

@@ -5,6 +5,8 @@
* [CHANGE] Ingester: Remove `-querier.query-store-for-labels-enabled` flag. Querying long-term store for labels is always enabled. #5984
* [CHANGE] Server: Instrument `cortex_request_duration_seconds` metric with native histogram. If `native-histograms` feature is enabled in monitoring Prometheus then the metric name needs to be updated in your dashboards. #6056
* [CHANGE] Distributor/Ingester: Change `cortex_distributor_ingester_appends_total`, `cortex_distributor_ingester_append_failures_total`, `cortex_distributor_ingester_queries_total`, and `cortex_distributor_ingester_query_failures_total` metrics to use the ingester ID instead of its IP as the label value. #6078
+* [FEATURE] Ruler: Minimize rule group missed evaluations via `-ruler.enable-ha` flag. #6129
+* [FEATURE] Ingester: Experimental: Enable native histogram ingestion via `-blocks-storage.tsdb.enable-native-histograms` flag. #5986
* [FEATURE] Ingester/Distributor: Experimental: Enable native histogram ingestion via `-blocks-storage.tsdb.enable-native-histograms` flag. #5986 #6010 #6020
* [FEATURE] Querier: Enable querying native histogram chunks. #5944 #6031
* [FEATURE] Query Frontend: Support native histogram in query frontend response. #5996 #6043

docs/configuration/config-file-reference.md

Lines changed: 16 additions & 0 deletions

@@ -4337,6 +4337,10 @@ ring:
# CLI flag: -ruler.ring.final-sleep
[final_sleep: <duration> | default = 0s]

+# Keep instance in the ring on shut down.
+# CLI flag: -ruler.ring.keep-instance-in-the-ring-on-shutdown
+[keep_instance_in_the_ring_on_shutdown: <boolean> | default = false]
+
# Period with which to attempt to flush rule groups.
# CLI flag: -ruler.flush-period
[flush_period: <duration> | default = 1m]

@@ -4371,6 +4375,18 @@ ring:
# Disable the rule_group label on exported metrics
# CLI flag: -ruler.disable-rule-group-label
[disable_rule_group_label: <boolean> | default = false]
+
+# Enable high availability
+# CLI flag: -ruler.enable-ha
+[enable_ha: <boolean> | default = false]
+
+# Timeout for fanout calls to other rulers
+# CLI flag: -ruler.list-rules-fanout-timeout
+[list_rules_fanout_timeout: <duration> | default = 2m]
+
+# Timeout for liveness checks performed during rule sync
+# CLI flag: -ruler.liveness-check-timeout
+[liveness_check_timeout: <duration> | default = 1s]
```

### `ruler_storage_config`
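
Taken together with a ring replication factor greater than 1, these options let another ruler pick up a dead peer's rule groups, which is how the commit minimizes missed evaluations (see the integration test below). A minimal config-file sketch, assuming the documented keys sit under the top-level `ruler` block and that the ring replication factor key is `replication_factor` (the enclosing block and that key are assumptions; values are illustrative):

```yaml
# Illustrative ruler HA configuration. The enable_ha, list_rules_fanout_timeout,
# liveness_check_timeout and keep_instance_in_the_ring_on_shutdown keys come from
# the reference above; the surrounding structure and replication_factor are assumed.
ruler:
  enable_ha: true
  list_rules_fanout_timeout: 2m
  liveness_check_timeout: 1s
  ring:
    replication_factor: 2
    keep_instance_in_the_ring_on_shutdown: true
```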

integration/ruler_test.go

Lines changed: 142 additions & 0 deletions

@@ -1093,6 +1093,148 @@ func TestRulerDisablesRuleGroups(t *testing.T) {
	})
}

+func TestRulerHA(t *testing.T) {
+	const numRulesGroups = 20
+
+	random := rand.New(rand.NewSource(time.Now().UnixNano()))
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Generate multiple rule groups, with 1 rule each.
+	ruleGroups := make([]rulefmt.RuleGroup, numRulesGroups)
+	expectedNames := make([]string, numRulesGroups)
+	alertCount := 0
+	evalInterval, _ := model.ParseDuration("5s")
+	for i := 0; i < numRulesGroups; i++ {
+		num := random.Intn(10)
+		var ruleNode yaml.Node
+		var exprNode yaml.Node
+
+		ruleNode.SetString(fmt.Sprintf("rule_%d", i))
+		exprNode.SetString(strconv.Itoa(i))
+		ruleName := fmt.Sprintf("test_%d", i)
+
+		expectedNames[i] = ruleName
+
+		if num%2 == 0 {
+			alertCount++
+			ruleGroups[i] = rulefmt.RuleGroup{
+				Name:     ruleName,
+				Interval: evalInterval,
+				Rules: []rulefmt.RuleNode{{
+					Alert: ruleNode,
+					Expr:  exprNode,
+				}},
+			}
+		} else {
+			ruleGroups[i] = rulefmt.RuleGroup{
+				Name:     ruleName,
+				Interval: evalInterval,
+				Rules: []rulefmt.RuleNode{{
+					Record: ruleNode,
+					Expr:   exprNode,
+				}},
+			}
+		}
+	}
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, rulestoreBucketName)
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	// Configure the ruler.
+	overrides := map[string]string{
+		// Since we're not going to run any rule, we don't need the
+		// store-gateway to be configured to a valid address.
+		"-querier.store-gateway-addresses": "localhost:12345",
+		// Enable the bucket index so we can skip the initial bucket scan.
+		"-blocks-storage.bucket-store.bucket-index.enabled": "true",
+		"-ruler.ring.replication-factor":                    "2",
+		"-ruler.enable-ha":                                  "true",
+		"-ruler.poll-interval":                              "5s",
+		"-ruler.list-rules-fanout-timeout":                  "2s",
+		"-ruler.liveness-check-timeout":                     "50ms",
+	}
+
+	rulerFlags := mergeFlags(
+		BlocksStorageFlags(),
+		RulerFlags(),
+		RulerShardingFlags(consul.NetworkHTTPEndpoint()),
+		overrides,
+	)
+
+	// Start rulers.
+	ruler1 := e2ecortex.NewRuler("ruler-1", consul.NetworkHTTPEndpoint(), rulerFlags, "")
+	ruler2 := e2ecortex.NewRuler("ruler-2", consul.NetworkHTTPEndpoint(), rulerFlags, "")
+	ruler3 := e2ecortex.NewRuler("ruler-3", consul.NetworkHTTPEndpoint(), rulerFlags, "")
+	rulers := e2ecortex.NewCompositeCortexService(ruler1, ruler2, ruler3)
+	require.NoError(t, s.StartAndWaitReady(ruler1, ruler2, ruler3))
+
+	// Upload rule groups to one of the rulers.
+	c, err := e2ecortex.NewClient("", "", "", ruler1.HTTPEndpoint(), "user-1")
+	require.NoError(t, err)
+	namespaceNames := []string{"test1", "test2", "test3", "test4", "test5"}
+	namespaceNameCount := make([]int, 5)
+	nsRand := rand.New(rand.NewSource(time.Now().UnixNano()))
+	for _, ruleGroup := range ruleGroups {
+		index := nsRand.Intn(len(namespaceNames))
+		namespaceNameCount[index] = namespaceNameCount[index] + 1
+		require.NoError(t, c.SetRuleGroup(ruleGroup, namespaceNames[index]))
+	}
+
+	// Wait until rulers have loaded all rules.
+	require.NoError(t, rulers.WaitSumMetricsWithOptions(e2e.Equals(numRulesGroups), []string{"cortex_prometheus_rule_group_rules"}, e2e.WaitMissingMetrics))
+
+	ruler1SyncTotal, err := ruler1.SumMetrics([]string{"cortex_ruler_sync_rules_total"})
+	require.NoError(t, err)
+	ruler3SyncTotal, err := ruler3.SumMetrics([]string{"cortex_ruler_sync_rules_total"})
+	require.NoError(t, err)
+
+	err = consul.Kill() // kill consul so the rulers will operate with the tokens/instances they already have
+	require.NoError(t, err)
+
+	err = ruler2.Kill()
+	require.NoError(t, err)
+
+	// wait for another sync
+	require.NoError(t, ruler1.WaitSumMetrics(e2e.Greater(ruler1SyncTotal[0]), "cortex_ruler_sync_rules_total"))
+	require.NoError(t, ruler3.WaitSumMetrics(e2e.Greater(ruler3SyncTotal[0]), "cortex_ruler_sync_rules_total"))
+
+	rulers = e2ecortex.NewCompositeCortexService(ruler1, ruler3)
+	require.NoError(t, rulers.WaitSumMetricsWithOptions(e2e.Equals(numRulesGroups), []string{"cortex_prometheus_rule_group_rules"}, e2e.WaitMissingMetrics))
+
+	t.Log(ruler1.SumMetrics([]string{"cortex_prometheus_rule_group_rules"}))
+	t.Log(ruler3.SumMetrics([]string{"cortex_prometheus_rule_group_rules"}))
+
+	c3, err := e2ecortex.NewClient("", "", "", ruler3.HTTPEndpoint(), "user-1")
+	require.NoError(t, err)
+
+	ruler1Rules, err := c.GetRuleGroups()
+	require.NoError(t, err)
+
+	ruler3Rules, err := c3.GetRuleGroups()
+	require.NoError(t, err)
+
+	ruleCount := 0
+	countFunc := func(ruleGroups map[string][]rulefmt.RuleGroup) {
+		for _, v := range ruleGroups {
+			ruleCount += len(v)
+		}
+	}
+
+	countFunc(ruler1Rules)
+	require.Equal(t, numRulesGroups, ruleCount)
+	ruleCount = 0
+	countFunc(ruler3Rules)
+	require.Equal(t, numRulesGroups, ruleCount)
+
+	results, err := c.GetPrometheusRules(e2ecortex.RuleFilter{})
+	require.NoError(t, err)
+	require.Equal(t, numRulesGroups, len(results))
+}
+
func TestRulerKeepFiring(t *testing.T) {
	s, err := e2e.NewScenario(networkName)
	require.NoError(t, err)

pkg/ruler/client_pool_test.go

Lines changed: 8 additions & 0 deletions

@@ -5,6 +5,8 @@ import (
	"net"
	"testing"

+	"github.com/cortexproject/cortex/pkg/util/services"
+
	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/assert"
@@ -63,6 +65,12 @@ func Test_newRulerClientFactory(t *testing.T) {

type mockRulerServer struct{}

+func (m *mockRulerServer) LivenessCheck(ctx context.Context, request *LivenessCheckRequest) (*LivenessCheckResponse, error) {
+	return &LivenessCheckResponse{
+		State: int32(services.Running),
+	}, nil
+}
+
func (m *mockRulerServer) Rules(context.Context, *RulesRequest) (*RulesResponse, error) {
	return &RulesResponse{}, nil
}
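
The `LivenessCheck` RPC that `mockRulerServer` now stubs out is the check the config reference above describes as being "performed during rule sync": a ruler asks a peer for its service state before deciding whether that peer's rule groups need to be taken over. A hypothetical caller-side sketch, not part of this commit, assuming the proto-generated `NewRulerClient` constructor, a placeholder peer address, and a hand-picked timeout standing in for `-ruler.liveness-check-timeout`:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	"github.com/cortexproject/cortex/pkg/ruler"
	"github.com/cortexproject/cortex/pkg/util/services"
)

func main() {
	// Placeholder address; in Cortex the ruler resolves peer addresses from the ring.
	conn, err := grpc.Dial("ruler-1:9095", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	// Bound the call roughly the way -ruler.liveness-check-timeout would (assumed value).
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	resp, err := ruler.NewRulerClient(conn).LivenessCheck(ctx, &ruler.LivenessCheckRequest{})
	if err != nil {
		log.Fatalf("liveness check failed, peer treated as dead: %v", err)
	}

	// The response carries the peer's service state as an int32, mirroring what
	// mockRulerServer returns above; convert it back for display.
	fmt.Println("peer state:", services.State(resp.State))
}
```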
