diff --git a/CHANGELOG.md b/CHANGELOG.md index 00ef0552776..fa7b99fc90a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ If you are running with a high `-ruler.num-workers` and if you're not able to ex Further, if you're using the configs service, we've upgraded the migration library and this requires some manual intervention. See full instructions below to upgrade your Postgres. +* [CHANGE] Remove unnecessary configs/flags from the ruler ring config to align with the pattern used in the distributor ring. #1987 + * Ruler ring related flags are now all prefixed with `ruler.ring.` as opposed to just `ruler.` + * Changed the default value for `-ruler.ring.prefix` from `collectors/` to `rulers/` in order to not clash with other keys (ie. ring) stored in the same key-value store. * [CHANGE] The frontend component now does not cache results if it finds a `Cache-Control` header and if one of its values is `no-store`. #1974 * [CHANGE] Flags changed with transition to upstream Prometheus rules manager: * `ruler.client-timeout` is now `ruler.configs.client-timeout` in order to match `ruler.configs.url` diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index d295e67409f..e7bc5ed76fc 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -679,91 +679,59 @@ alertmanagerurl: # CLI flag: -ruler.search-pending-for [searchpendingfor: | default = 5m0s] -lifecyclerconfig: - ring: - kvstore: - # Backend storage to use for the ring. Supported values are: consul, etcd, - # inmemory, multi, memberlist (experimental). - # CLI flag: -ruler.store - [store: | default = "consul"] +ring: + kvstore: + # Backend storage to use for the ring. Supported values are: consul, etcd, + # inmemory, multi, memberlist (experimental). + # CLI flag: -ruler.ring.store + [store: | default = "consul"] - # The prefix for the keys in the store. Should end with a /. - # CLI flag: -ruler.prefix - [prefix: | default = "collectors/"] + # The prefix for the keys in the store. Should end with a /. + # CLI flag: -ruler.ring.prefix + [prefix: | default = "rulers/"] - # The consul_config configures the consul client. - # The CLI flags prefix for this block config is: ruler - [consul: ] + # The consul_config configures the consul client. + # The CLI flags prefix for this block config is: ruler.ring + [consul: ] - # The etcd_config configures the etcd client. - # The CLI flags prefix for this block config is: ruler - [etcd: ] + # The etcd_config configures the etcd client. + # The CLI flags prefix for this block config is: ruler.ring + [etcd: ] - # The memberlist_config configures the Gossip memberlist. - # The CLI flags prefix for this block config is: ruler - [memberlist: ] + # The memberlist_config configures the Gossip memberlist. + # The CLI flags prefix for this block config is: ruler.ring + [memberlist: ] - multi: - # Primary backend storage used by multi-client. - # CLI flag: -ruler.multi.primary - [primary: | default = ""] + multi: + # Primary backend storage used by multi-client. + # CLI flag: -ruler.ring.multi.primary + [primary: | default = ""] - # Secondary backend storage used by multi-client. - # CLI flag: -ruler.multi.secondary - [secondary: | default = ""] + # Secondary backend storage used by multi-client. + # CLI flag: -ruler.ring.multi.secondary + [secondary: | default = ""] - # Mirror writes to secondary store. - # CLI flag: -ruler.multi.mirror-enabled - [mirror_enabled: | default = false] + # Mirror writes to secondary store. + # CLI flag: -ruler.ring.multi.mirror-enabled + [mirror_enabled: | default = false] - # Timeout for storing value to secondary store. - # CLI flag: -ruler.multi.mirror-timeout - [mirror_timeout: | default = 2s] + # Timeout for storing value to secondary store. + # CLI flag: -ruler.ring.multi.mirror-timeout + [mirror_timeout: | default = 2s] - # The heartbeat timeout after which ingesters are skipped for reads/writes. - # CLI flag: -ruler.ring.heartbeat-timeout - [heartbeat_timeout: | default = 1m0s] + # Period at which to heartbeat to the ring. + # CLI flag: -ruler.ring.heartbeat-period + [heartbeat_period: | default = 5s] - # The number of ingesters to write to and read from. - # CLI flag: -ruler.distributor.replication-factor - [replication_factor: | default = 3] + # The heartbeat timeout after which rulers are considered unhealthy within the + # ring. + # CLI flag: -ruler.ring.heartbeat-timeout + [heartbeat_timeout: | default = 1m0s] # Number of tokens for each ingester. - # CLI flag: -ruler.num-tokens + # CLI flag: -ruler.ring.num-tokens [num_tokens: | default = 128] - # Period at which to heartbeat to consul. - # CLI flag: -ruler.heartbeat-period - [heartbeat_period: | default = 5s] - - # Observe tokens after generating to resolve collisions. Useful when using - # gossiping ring. - # CLI flag: -ruler.observe-period - [observe_period: | default = 0s] - - # Period to wait for a claim from another member; will join automatically - # after this. - # CLI flag: -ruler.join-after - [join_after: | default = 0s] - - # Minimum duration to wait before becoming ready. This is to work around race - # conditions with ingesters exiting and updating the ring. - # CLI flag: -ruler.min-ready-duration - [min_ready_duration: | default = 1m0s] - - # Name of network interface to read address from. - # CLI flag: -ruler.lifecycler.interface - [interface_names: | default = [eth0 en0]] - - # Duration to sleep for before exiting, to ensure metrics are scraped. - # CLI flag: -ruler.final-sleep - [final_sleep: | default = 30s] - - # File path where tokens are stored. If empty, tokens are not stored at - # shutdown and restored at startup. - # CLI flag: -ruler.tokens-file-path - [tokens_file_path: | default = ""] - # Period with which to attempt to flush rule groups. # CLI flag: -ruler.flush-period [flushcheckperiod: | default = 1m0s] diff --git a/docs/guides/sharded_ruler.md b/docs/guides/sharded_ruler.md new file mode 100644 index 00000000000..70e037b80d2 --- /dev/null +++ b/docs/guides/sharded_ruler.md @@ -0,0 +1,26 @@ +--- +title: "Config for horizontally scaling the Ruler" +linkTitle: "Config for horizontally scaling the Ruler" +weight: 4 +slug: ruler-sharding +--- + +## Context + +One option to scale the ruler is by scaling it horizontally. However, with multiple ruler instances running they will need to coordinate to determine which instance will evaluate which rule. Similar to the ingesters, the rulers establish a hash ring to divide up the responsibilities of evaluating rules. + +## Config + +In order to enable sharding in the ruler the following flag needs to be set: + +``` + -ruler.enable-sharding=true +``` + +In addition the ruler requires it's own ring to be configured, for instance: + +``` + -ruler.ring.consul.hostname=consul.dev.svc.cluster.local:8500 +``` + +The only configuration that is required is to enable sharding and configure a key value store. From there the rulers will shard and handle the division of rules automatically. diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 27675823b0b..eb3079279ae 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -405,7 +405,7 @@ func (t *Cortex) stopTableManager() error { } func (t *Cortex) initRuler(cfg *Config) (err error) { - cfg.Ruler.LifecyclerConfig.ListenPort = &cfg.Server.GRPCListenPort + cfg.Ruler.Ring.ListenPort = cfg.Server.GRPCListenPort queryable, engine := querier.New(cfg.Querier, t.distributor, t.store) t.ruler, err = ruler.NewRuler(cfg.Ruler, engine, queryable, t.distributor) diff --git a/pkg/ruler/lifecycle_test.go b/pkg/ruler/lifecycle_test.go index 0b61010e618..a1d55bf6c76 100644 --- a/pkg/ruler/lifecycle_test.go +++ b/pkg/ruler/lifecycle_test.go @@ -21,14 +21,14 @@ func TestRulerShutdown(t *testing.T) { } test.Poll(t, 100*time.Millisecond, 0, func() interface{} { - return testutils.NumTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.RulerRingKey) + return testutils.NumTokens(config.Ring.KVStore.Mock, "localhost", ring.RulerRingKey) }) } // TestRulerRestart tests a restarting ruler doesn't keep adding more tokens. func TestRulerRestart(t *testing.T) { config := defaultRulerConfig() - config.LifecyclerConfig.SkipUnregister = true + config.Ring.SkipUnregister = true config.EnableSharding = true { @@ -38,7 +38,7 @@ func TestRulerRestart(t *testing.T) { } test.Poll(t, 100*time.Millisecond, 1, func() interface{} { - return testutils.NumTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.RulerRingKey) + return testutils.NumTokens(config.Ring.KVStore.Mock, "localhost", ring.RulerRingKey) }) { @@ -50,6 +50,6 @@ func TestRulerRestart(t *testing.T) { time.Sleep(200 * time.Millisecond) test.Poll(t, 100*time.Millisecond, 1, func() interface{} { - return testutils.NumTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.RulerRingKey) + return testutils.NumTokens(config.Ring.KVStore.Mock, "localhost", ring.RulerRingKey) }) } diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 81943f0652e..e956502b2ff 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -61,14 +61,14 @@ type Config struct { EnableSharding bool // Enable sharding rule groups SearchPendingFor time.Duration - LifecyclerConfig ring.LifecyclerConfig + Ring RingConfig FlushCheckPeriod time.Duration } // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *Config) RegisterFlags(f *flag.FlagSet) { - cfg.LifecyclerConfig.RegisterFlagsWithPrefix("ruler.", f) cfg.StoreConfig.RegisterFlags(f) + cfg.Ring.RegisterFlags(f) // Deprecated Flags that will be maintained to avoid user disruption flagext.DeprecatedFlag(f, "ruler.client-timeout", "This flag has been renamed to ruler.configs.client-timeout") @@ -149,14 +149,15 @@ func NewRuler(cfg Config, engine *promql.Engine, queryable promStorage.Queryable // If sharding is enabled, create/join a ring to distribute tokens to // the ruler if cfg.EnableSharding { - ruler.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, ruler, "ruler", ring.RulerRingKey, true) + lifecyclerCfg := cfg.Ring.ToLifecyclerConfig() + ruler.lifecycler, err = ring.NewLifecycler(lifecyclerCfg, ruler, "ruler", ring.RulerRingKey, true) if err != nil { return nil, err } ruler.lifecycler.Start() - ruler.ring, err = ring.New(cfg.LifecyclerConfig.RingConfig, "ruler", ring.RulerRingKey) + ruler.ring, err = ring.New(lifecyclerCfg.RingConfig, "ruler", ring.RulerRingKey) if err != nil { return nil, err } diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go new file mode 100644 index 00000000000..cae800dc861 --- /dev/null +++ b/pkg/ruler/ruler_ring.go @@ -0,0 +1,92 @@ +package ruler + +import ( + "flag" + "os" + "time" + + "github.com/cortexproject/cortex/pkg/ring" + "github.com/cortexproject/cortex/pkg/ring/kv" + "github.com/cortexproject/cortex/pkg/util" + "github.com/cortexproject/cortex/pkg/util/flagext" + "github.com/go-kit/kit/log/level" +) + +// RingConfig masks the ring lifecycler config which contains +// many options not really required by the rulers ring. This config +// is used to strip down the config to the minimum, and avoid confusion +// to the user. +type RingConfig struct { + KVStore kv.Config `yaml:"kvstore,omitempty"` + HeartbeatPeriod time.Duration `yaml:"heartbeat_period,omitempty"` + HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout,omitempty"` + + // Instance details + InstanceID string `yaml:"instance_id" doc:"hidden"` + InstanceInterfaceNames []string `yaml:"instance_interface_names" doc:"hidden"` + InstancePort int `yaml:"instance_port" doc:"hidden"` + InstanceAddr string `yaml:"instance_addr" doc:"hidden"` + NumTokens int `yaml:"num_tokens"` + + // Injected internally + ListenPort int `yaml:"-"` + + // Used for testing + SkipUnregister bool `yaml:"-"` +} + +// RegisterFlags adds the flags required to config this to the given FlagSet +func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { + hostname, err := os.Hostname() + if err != nil { + level.Error(util.Logger).Log("msg", "failed to get hostname", "err", err) + os.Exit(1) + } + + // Ring flags + cfg.KVStore.RegisterFlagsWithPrefix("ruler.ring.", "rulers/", f) + f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.") + f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring.") + + // Instance flags + cfg.InstanceInterfaceNames = []string{"eth0", "en0"} + f.Var((*flagext.Strings)(&cfg.InstanceInterfaceNames), "ruler.ring.instance-interface", "Name of network interface to read address from.") + f.StringVar(&cfg.InstanceAddr, "ruler.ring.instance-addr", "", "IP address to advertise in the ring.") + f.IntVar(&cfg.InstancePort, "ruler.ring.instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).") + f.StringVar(&cfg.InstanceID, "ruler.ring.instance-id", hostname, "Instance ID to register in the ring.") + f.IntVar(&cfg.NumTokens, "ruler.ring.num-tokens", 128, "Number of tokens for each ingester.") +} + +// ToLifecyclerConfig returns a LifecyclerConfig based on the ruler +// ring config. +func (cfg *RingConfig) ToLifecyclerConfig() ring.LifecyclerConfig { + // We have to make sure that the ring.LifecyclerConfig and ring.Config + // defaults are preserved + lc := ring.LifecyclerConfig{} + rc := ring.Config{} + + flagext.DefaultValues(&lc) + flagext.DefaultValues(&rc) + + // Configure ring + rc.KVStore = cfg.KVStore + rc.HeartbeatTimeout = cfg.HeartbeatTimeout + rc.ReplicationFactor = 1 + + // Configure lifecycler + lc.RingConfig = rc + lc.ListenPort = &cfg.ListenPort + lc.Addr = cfg.InstanceAddr + lc.Port = cfg.InstancePort + lc.ID = cfg.InstanceID + lc.InfNames = cfg.InstanceInterfaceNames + lc.SkipUnregister = cfg.SkipUnregister + lc.HeartbeatPeriod = cfg.HeartbeatPeriod + lc.NumTokens = cfg.NumTokens + lc.ObservePeriod = 0 + lc.JoinAfter = 0 + lc.MinReadyDuration = 0 + lc.FinalSleep = 0 + + return lc +} diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 932a6441906..719e294d7cb 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -30,14 +30,12 @@ func defaultRulerConfig() Config { }, } flagext.DefaultValues(&cfg) - flagext.DefaultValues(&cfg.LifecyclerConfig) - cfg.LifecyclerConfig.RingConfig.ReplicationFactor = 1 - cfg.LifecyclerConfig.RingConfig.KVStore.Mock = consul - cfg.LifecyclerConfig.NumTokens = 1 - cfg.LifecyclerConfig.FinalSleep = time.Duration(0) - cfg.LifecyclerConfig.ListenPort = func(i int) *int { return &i }(0) - cfg.LifecyclerConfig.Addr = "localhost" - cfg.LifecyclerConfig.ID = "localhost" + flagext.DefaultValues(&cfg.Ring) + cfg.Ring.KVStore.Mock = consul + cfg.Ring.NumTokens = 1 + cfg.Ring.ListenPort = 0 + cfg.Ring.InstanceAddr = "localhost" + cfg.Ring.InstanceID = "localhost" return cfg }