Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
* [CHANGE] Ingester: Chunks flushed via /flush stay in memory until retention period is reached. This affects `cortex_ingester_memory_chunks` metric. #2778
* [CHANGE] Querier: the error message returned when the query time range exceeds `-store.max-query-length` has changed from `invalid query, length > limit (X > Y)` to `the query time range exceeds the limit (query length: X, limit: Y)`. #2826
* [CHANGE] KV: The `role` label which was a label of `multi` KV store client only has been added to metrics of every KV store client. If KV store client is not `multi`, then the value of `role` label is `primary`. #2837
* [CHANGE] Added the `engine` label to the metrics exposed by the Prometheus query engine, to distinguish between `ruler` and `querier` metrics. #2854
* [CHANGE] Added ruler to the single binary when started with `-target=all` (default). #2854
* [CHANGE] Experimental TSDB: compact head when opening TSDB. This should only affect ingester startup after it was unable to compact head in previous run. #2870
* [FEATURE] Introduced `ruler.for-outage-tolerance`, Max time to tolerate outage for restoring "for" state of alert. #2783
* [FEATURE] Introduced `ruler.for-grace-period`, Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. #2783
* [FEATURE] Introduced `ruler.resend-delay`, Minimum amount of time to wait before resending an alert to Alertmanager. #2783
* [FEATURE] Ruler: added `local` filesystem support to store rules (read-only). #2854
* [ENHANCEMENT] Upgraded Docker base images to `alpine:3.12`. #2862
* [ENHANCEMENT] Experimental: Querier can now optionally query secondary store. This is specified by using `-querier.second-store-engine` option, with values `chunks` or `tsdb`. Standard configuration options for this store are used. Additionally, this querying can be configured to happen only for queries that need data older than `-querier.use-second-store-before-time`. Default value of zero will always query secondary store. #2747
* [ENHANCEMENT] Query-tee: increased the `cortex_querytee_request_duration_seconds` metric buckets granularity. #2799
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*
!cortex-rules
!cortex-tsdb
!.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,19 @@ tsdb:

storage:
engine: tsdb

ruler:
enable_api: true
enable_sharding: true
poll_interval: 2s
storage:
type: s3
s3:
bucketnames: cortex-rules
s3forcepathstyle: true
s3: http://cortex:supersecret@minio.:9000
ring:
kvstore:
store: consul
consul:
host: consul:8500
8 changes: 7 additions & 1 deletion docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -818,7 +818,8 @@ ruler_client:
[poll_interval: <duration> | default = 1m]

storage:
# Method to use for backend rule storage (configdb, azure, gcs, s3)
# Method to use for backend rule storage (configdb, azure, gcs, s3, swift,
# local)
# CLI flag: -ruler.storage.type
[type: <string> | default = "configdb"]

Expand Down Expand Up @@ -998,6 +999,11 @@ storage:
# CLI flag: -ruler.storage.swift.container-name
[container_name: <string> | default = "cortex"]

local:
# Directory to scan for rules
# CLI flag: -ruler.storage.local.directory
[directory: <string> | default = ""]

# file path to store temporary rule files for the prometheus rule managers
# CLI flag: -ruler.rule-path
[rule_path: <string> | default = "/rules"]
Expand Down
13 changes: 12 additions & 1 deletion docs/configuration/single-process-config-blocks-gossip-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,15 @@ tsdb:
dir: /tmp/cortex/storage

frontend_worker:
match_max_concurrent: true
match_max_concurrent: true

ruler:
enable_api: true
enable_sharding: true
storage:
type: local
local:
directory: /tmp/cortex/rules
ring:
kvstore:
store: memberlist
13 changes: 12 additions & 1 deletion docs/configuration/single-process-config-blocks-gossip-2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,15 @@ tsdb:
dir: /tmp/cortex/storage

frontend_worker:
match_max_concurrent: true
match_max_concurrent: true

ruler:
enable_api: true
enable_sharding: true
storage:
type: local
local:
directory: /tmp/cortex/rules
ring:
kvstore:
store: memberlist
8 changes: 8 additions & 0 deletions docs/configuration/single-process-config-blocks-tls.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,11 @@ frontend_worker:
tls_cert_path: "client.crt"
tls_key_path: "client.key"
tls_ca_path: "root.crt"

ruler:
enable_api: true
enable_sharding: false
storage:
type: local
local:
directory: /tmp/cortex/rules
10 changes: 9 additions & 1 deletion docs/configuration/single-process-config-blocks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,12 @@ compactor:
store: inmemory

frontend_worker:
match_max_concurrent: true
match_max_concurrent: true

ruler:
enable_api: true
enable_sharding: false
storage:
type: local
local:
directory: /tmp/cortex/rules
10 changes: 10 additions & 0 deletions docs/configuration/single-process-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,14 @@ storage:
# to max_concurrent on the queriers.
frontend_worker:
match_max_concurrent: true

# Configure the ruler to scan the /tmp/cortex/rules directory for prometheus
# rules: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules
ruler:
enable_api: true
enable_sharding: false
storage:
type: local
local:
directory: /tmp/cortex/rules
```
10 changes: 9 additions & 1 deletion docs/configuration/single-process-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,12 @@ purger:
object_store_type: filesystem

frontend_worker:
match_max_concurrent: true
match_max_concurrent: true

ruler:
enable_api: true
enable_sharding: false
storage:
type: local
local:
directory: /tmp/cortex/rules
20 changes: 20 additions & 0 deletions docs/guides/sharded_ruler.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,23 @@ In addition the ruler requires its own ring to be configured, for instance:
The only configuration that is required is to enable sharding and configure a key value store. From there the rulers will shard and handle the division of rules automatically.

Unlike ingesters, rulers do not hand over responsibility: all rules are re-sharded randomly every time a ruler is added to or removed from the ring.

## Ruler Storage

The ruler supports six kinds of storage (configdb, azure, gcs, s3, swift, local). Most kinds of storage work with the sharded ruler configuration in an obvious way, i.e. configure all rulers to use the same backend.

The local implementation reads [Prometheus recording rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) off of the local filesystem. This is a read-only backend that does not support the creation and deletion of rules through [the API](https://cortexmetrics.io/docs/apis/#ruler). Despite the fact that it reads the local filesystem, this method can still be used in a sharded ruler configuration if the operator takes care to load the same rules to every ruler. For instance this could be accomplished by mounting a [Kubernetes ConfigMap](https://kubernetes.io/docs/concepts/configuration/configmap/) onto every ruler pod.

A typical local config may look something like:
```
-ruler.storage.type=local
-ruler.storage.local.directory=/tmp/cortex/rules
```

With the above configuration the ruler would expect the following layout:
```
/tmp/cortex/rules/<tenant id>/rules1.yaml
/rules2.yaml
```
YAML files are expected to be in the [Prometheus format](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules).

41 changes: 41 additions & 0 deletions integration/api_ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package main

import (
"path/filepath"
"testing"

"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -94,3 +95,43 @@ func TestRulerAPI(t *testing.T) {
// Ensure no service-specific metrics prefix is used by the wrong service.
assertServiceMetricsPrefixes(t, Ruler, ruler)
}

// TestRulerAPISingleBinary verifies that the ruler runs as part of the single
// binary (-target=all), loads a tenant's rules from the local rule storage
// backend, and exposes them through the ruler API. It also checks that the
// Prometheus engine metrics carry the per-engine label ("querier"/"ruler").
func TestRulerAPISingleBinary(t *testing.T) {
	s, err := e2e.NewScenario(networkName)
	require.NoError(t, err)
	defer s.Close()

	namespace := "ns"
	user := "fake"

	// Point the ruler at the local (read-only) rule storage inside the shared
	// directory, and poll frequently so the test picks up the rules quickly.
	configOverrides := map[string]string{
		"-ruler.storage.local.directory": filepath.Join(e2e.ContainerSharedDir, "ruler_configs"),
		"-ruler.poll-interval":           "2s",
	}

	// Start Cortex components.
	require.NoError(t, copyFileToSharedDir(s, "docs/configuration/single-process-config.yaml", cortexConfigFile))
	require.NoError(t, writeFileToSharedDir(s, filepath.Join("ruler_configs", user, namespace), []byte(cortexRulerUserConfigYaml)))
	cortex := e2ecortex.NewSingleBinaryWithConfigFile("cortex", cortexConfigFile, configOverrides, "", 9009, 9095)
	require.NoError(t, s.StartAndWaitReady(cortex))

	// Create a client with the ruler address configured.
	c, err := e2ecortex.NewClient("", "", "", cortex.HTTPEndpoint(), "")
	require.NoError(t, err)

	// Wait until the per-tenant rule manager has been created.
	require.NoError(t, cortex.WaitSumMetrics(e2e.Equals(1), "cortex_ruler_managers_total"))

	// Check to ensure the rules running in the cortex match what was set.
	rgs, err := c.GetRuleGroups()
	require.NoError(t, err)

	retrievedNamespace, exists := rgs[namespace]
	require.True(t, exists)
	require.Len(t, retrievedNamespace, 1)
	// require.Equal takes (expected, actual): argument order fixed so a
	// failure message reports the expected/actual values correctly.
	require.Equal(t, "rule", retrievedNamespace[0].Name)

	// Check to make sure prometheus engine metrics are available for both engine types.
	require.NoError(t, cortex.WaitForMetricWithLabels(e2e.EqualsSingle(0), "prometheus_engine_queries", map[string]string{"engine": "querier"}))
	require.NoError(t, cortex.WaitForMetricWithLabels(e2e.EqualsSingle(0), "prometheus_engine_queries", map[string]string{"engine": "ruler"}))
}
12 changes: 12 additions & 0 deletions integration/configs.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,18 @@ const (
receivers:
- name: "example_receiver"
`

cortexRulerUserConfigYaml = `groups:
- name: rule
interval: 100s
rules:
- record: test_rule
alert: ""
expr: up
for: 0s
labels: {}
annotations: {}
`
)

var (
Expand Down
17 changes: 17 additions & 0 deletions pkg/cortex/cortex_test.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,27 @@
package cortex

import (
"net/url"
"testing"

"github.com/stretchr/testify/require"

"github.com/cortexproject/cortex/pkg/chunk/aws"
"github.com/cortexproject/cortex/pkg/chunk/storage"
"github.com/cortexproject/cortex/pkg/ingester"
"github.com/cortexproject/cortex/pkg/ring"
"github.com/cortexproject/cortex/pkg/ring/kv"
"github.com/cortexproject/cortex/pkg/ruler"
"github.com/cortexproject/cortex/pkg/storage/backend/s3"
"github.com/cortexproject/cortex/pkg/storage/tsdb"
"github.com/cortexproject/cortex/pkg/util/flagext"
"github.com/cortexproject/cortex/pkg/util/services"
)

func TestCortex(t *testing.T) {
rulerURL, err := url.Parse("inmemory:///rules")
require.NoError(t, err)

cfg := Config{
Storage: storage.Config{
Engine: storage.StorageEngineTSDB, // makes config easier
Expand Down Expand Up @@ -47,6 +54,16 @@ func TestCortex(t *testing.T) {
},
},
},
Ruler: ruler.Config{
StoreConfig: ruler.RuleStoreConfig{
Type: "s3",
S3: aws.S3Config{
S3: flagext.URLValue{
URL: rulerURL,
},
},
},
},
Target: All,
}

Expand Down
17 changes: 14 additions & 3 deletions pkg/cortex/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ func (t *Cortex) initDistributor() (serv services.Service, err error) {
}

func (t *Cortex) initQuerier() (serv services.Service, err error) {
queryable, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, t.TombstonesLoader, prometheus.DefaultRegisterer)
querierRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "querier"}, prometheus.DefaultRegisterer)
queryable, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, t.TombstonesLoader, querierRegisterer)

// Prometheus histograms for requests to the querier.
querierRequestDuration := promauto.With(prometheus.DefaultRegisterer).NewHistogramVec(prometheus.HistogramOpts{
Expand Down Expand Up @@ -454,9 +455,19 @@ func (t *Cortex) initTableManager() (services.Service, error) {
}

func (t *Cortex) initRuler() (serv services.Service, err error) {
// if the ruler is not configured and we're in single binary then let's just log an error and continue
// unfortunately there is no way to generate a "default" config and compare default against actual
// to determine if it's unconfigured. the following check, however, correctly tests this.
// Single binary integration tests will break if this ever drifts
if t.Cfg.Target == All && t.Cfg.Ruler.StoreConfig.Type == "configdb" && t.Cfg.Ruler.StoreConfig.ConfigDB.ConfigsAPIURL.URL == nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No active work is being done on the configdb so I doubt this will break in the near future.

level.Info(util.Logger).Log("msg", "Ruler is not configured in single binary mode and will not be started.")
return nil, nil
}

t.Cfg.Ruler.Ring.ListenPort = t.Cfg.Server.GRPCListenPort
t.Cfg.Ruler.Ring.KVStore.MemberlistKV = t.MemberlistKV.GetMemberlistKV
queryable, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, t.TombstonesLoader, prometheus.DefaultRegisterer)
rulerRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "ruler"}, prometheus.DefaultRegisterer)
queryable, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, t.TombstonesLoader, rulerRegisterer)

t.Ruler, err = ruler.NewRuler(t.Cfg.Ruler, engine, queryable, t.Distributor, prometheus.DefaultRegisterer, util.Logger)
if err != nil {
Expand Down Expand Up @@ -602,7 +613,7 @@ func (t *Cortex) setupModuleManager() error {
Compactor: {API},
StoreGateway: {API},
Purger: {Store, DeleteRequestsStore, API},
All: {QueryFrontend, Querier, Ingester, Distributor, TableManager, Purger, StoreGateway},
All: {QueryFrontend, Querier, Ingester, Distributor, TableManager, Purger, StoreGateway, Ruler},
}
for mod, targets := range deps {
if err := mm.AddDependency(mod, targets...); err != nil {
Expand Down
Loading