From 8e166aebfb0999b9f535b03d470a4e7a4302fa7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 15 Jun 2020 10:38:17 +0200 Subject: [PATCH 1/7] Added support for rejoining the cluster. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/ring/kv/memberlist/memberlist_client.go | 25 +++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/pkg/ring/kv/memberlist/memberlist_client.go b/pkg/ring/kv/memberlist/memberlist_client.go index fcdd8c5612..cb756d4f52 100644 --- a/pkg/ring/kv/memberlist/memberlist_client.go +++ b/pkg/ring/kv/memberlist/memberlist_client.go @@ -139,6 +139,7 @@ type KVConfig struct { MaxJoinBackoff time.Duration `yaml:"max_join_backoff"` MaxJoinRetries int `yaml:"max_join_retries"` AbortIfJoinFails bool `yaml:"abort_if_cluster_join_fails"` + RejoinInterval time.Duration `yaml:"rejoin_interval"` // Remove LEFT ingesters from ring after this timeout. LeftIngestersTimeout time.Duration `yaml:"left_ingesters_timeout"` @@ -168,6 +169,7 @@ func (cfg *KVConfig) RegisterFlags(f *flag.FlagSet, prefix string) { f.DurationVar(&cfg.MaxJoinBackoff, prefix+"memberlist.max-join-backoff", 1*time.Minute, "Max backoff duration to join other cluster members.") f.IntVar(&cfg.MaxJoinRetries, prefix+"memberlist.max-join-retries", 10, "Max number of retries to join other cluster members.") f.BoolVar(&cfg.AbortIfJoinFails, prefix+"memberlist.abort-if-join-fails", true, "If this node fails to join memberlist cluster, abort.") + f.DurationVar(&cfg.RejoinInterval, prefix+"memberlist.rejoin-interval", 0, "If not 0, how often to rejoin the cluster. Occasional rejoin can be useful in case of cluster split, and is harmless otherwise.") f.DurationVar(&cfg.LeftIngestersTimeout, prefix+"memberlist.left-ingesters-timeout", 5*time.Minute, "How long to keep LEFT ingesters in the ring.") f.DurationVar(&cfg.LeaveTimeout, prefix+"memberlist.leave-timeout", 5*time.Second, "Timeout for leaving memberlist cluster.") f.DurationVar(&cfg.GossipInterval, prefix+"memberlist.gossip-interval", 0, "How often to gossip. Uses memberlist LAN defaults if 0.") @@ -388,8 +390,27 @@ func (m *KV) running(ctx context.Context) error { } } - <-ctx.Done() - return nil + var tickerChan <-chan time.Time = nil + if m.cfg.RejoinInterval > 0 { + t := time.NewTicker(m.cfg.RejoinInterval) + tickerChan = t.C + } + + for { + select { + case <-tickerChan: + reached, err := m.memberlist.Join(m.cfg.JoinMembers) + if err == nil { + level.Info(util.Logger).Log("msg", "re-joined memberlist cluster", "reached_nodes", reached) + } else { + // Don't report error from rejoin, otherwise KV service would be stopped completely. + level.Warn(util.Logger).Log("msg", "re-joining memberlist cluster failed", "err", err) + } + + case <-ctx.Done(): + return nil + } + } } // GetCodec returns codec for given ID or nil. From 1d258ba7b95b6f5fc2cdcab6decbe1a94b3223eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 15 Jun 2020 10:56:12 +0200 Subject: [PATCH 2/7] Added unit-test for rejoin. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../kv/memberlist/memberlist_client_test.go | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/pkg/ring/kv/memberlist/memberlist_client_test.go b/pkg/ring/kv/memberlist/memberlist_client_test.go index 37cbdb6bf6..b154fddee7 100644 --- a/pkg/ring/kv/memberlist/memberlist_client_test.go +++ b/pkg/ring/kv/memberlist/memberlist_client_test.go @@ -9,15 +9,19 @@ import ( "math" "math/rand" "net" + "os" "sort" "sync" "testing" "time" + "github.com/go-kit/kit/log" "github.com/stretchr/testify/require" "github.com/cortexproject/cortex/pkg/ring/kv/codec" + "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/services" + "github.com/cortexproject/cortex/pkg/util/test" ) const ACTIVE = 1 @@ -760,7 +764,7 @@ func TestMemberlistFailsToJoin(t *testing.T) { func getFreePorts(count int) ([]int, error) { var ports []int for i := 0; i < count; i++ { - addr, err := net.ResolveTCPAddr("tcp", "localhost:0") + addr, err := net.ResolveTCPAddr("tcp", "127.0.0.1:0") if err != nil { return nil, err } @@ -980,3 +984,53 @@ func TestGenerateRandomSuffix(t *testing.T) { require.NotEqual(t, h1, h2) require.NotEqual(t, h2, h3) } + +func TestRejoin(t *testing.T) { + util.Logger = log.NewLogfmtLogger(os.Stdout) + + ports, err := getFreePorts(2) + require.NoError(t, err) + + cfg1 := KVConfig{ + TCPTransport: TCPTransportConfig{ + BindAddrs: []string{"localhost"}, + BindPort: ports[0], + }, + + RandomizeNodeName: true, + Codecs: []codec.Codec{dataCodec{}}, + AbortIfJoinFails: false, + } + + cfg2 := cfg1 + cfg2.TCPTransport.BindPort = ports[1] + cfg2.JoinMembers = []string{fmt.Sprintf("localhost:%d", ports[0])} + cfg2.RejoinInterval = 1 * time.Second + + mkv1 := NewKV(cfg1) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv1)) + defer services.StopAndAwaitTerminated(context.Background(), mkv1) //nolint:errcheck + + mkv2 := NewKV(cfg2) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv2)) + defer services.StopAndAwaitTerminated(context.Background(), mkv2) //nolint:errcheck + + membersFunc := func() interface{} { + return mkv2.memberlist.NumMembers() + } + + test.Poll(t, 5*time.Second, 2, membersFunc) + + // Shutdown first KV + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), mkv1)) + + // Second KV should see single member now. + test.Poll(t, 5*time.Second, 1, membersFunc) + + // Let's start first KV again. It is not configured to join the cluster, but KV2 is rejoining. + mkv1 = NewKV(cfg1) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv1)) + defer services.StopAndAwaitTerminated(context.Background(), mkv1) //nolint:errcheck + + test.Poll(t, 5*time.Second, 2, membersFunc) +} From 538061c94c750bebb258ee7ca18f4392fed035fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 15 Jun 2020 10:56:47 +0200 Subject: [PATCH 3/7] CHANGELOG entry. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2061486384..de163411a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,6 +111,7 @@ * [ENHANCEMENT] Add `-cassandra.reconnect-interval` to allow specifying the reconnect interval to a Cassandra server that has been marked `DOWN` by the gocql driver. Also change the default value of the reconnect interval from `60s` to `1s`. #2687 * [ENHANCEMENT] Experimental TSDB: Applied a jitter to the period bucket scans in order to better distribute bucket operations over the time and increase the probability of hitting the shared cache (if configured). #2693 * [ENHANCEMENT] Experimental TSDB: Series limit per user and per metric now work in TSDB blocks. #2676 +* [ENHANCEMENT] Experimental Memberlist: Added ability to periodically rejoin the memberlist cluster. * [BUGFIX] Ruler: Ensure temporary rule files with special characters are properly mapped and cleaned up. #2506 * [BUGFIX] Fixes #2411, Ensure requests are properly routed to the prometheus api embedded in the query if `-server.path-prefix` is set. #2372 * [BUGFIX] Experimental TSDB: fixed chunk data corruption when querying back series using the experimental blocks storage. #2400 From 539eeead002bf57a0265c31a73d86dfc50419e77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 15 Jun 2020 11:00:44 +0200 Subject: [PATCH 4/7] Added PR number to CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de163411a2..14a21938ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,7 +111,7 @@ * [ENHANCEMENT] Add `-cassandra.reconnect-interval` to allow specifying the reconnect interval to a Cassandra server that has been marked `DOWN` by the gocql driver. Also change the default value of the reconnect interval from `60s` to `1s`. #2687 * [ENHANCEMENT] Experimental TSDB: Applied a jitter to the period bucket scans in order to better distribute bucket operations over the time and increase the probability of hitting the shared cache (if configured). #2693 * [ENHANCEMENT] Experimental TSDB: Series limit per user and per metric now work in TSDB blocks. #2676 -* [ENHANCEMENT] Experimental Memberlist: Added ability to periodically rejoin the memberlist cluster. +* [ENHANCEMENT] Experimental Memberlist: Added ability to periodically rejoin the memberlist cluster. #2724 * [BUGFIX] Ruler: Ensure temporary rule files with special characters are properly mapped and cleaned up. #2506 * [BUGFIX] Fixes #2411, Ensure requests are properly routed to the prometheus api embedded in the query if `-server.path-prefix` is set. #2372 * [BUGFIX] Experimental TSDB: fixed chunk data corruption when querying back series using the experimental blocks storage. #2400 From 568419f64bdef5eb8f015a96d43a68d1fb20bc73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 15 Jun 2020 11:54:26 +0200 Subject: [PATCH 5/7] Update docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/configuration/config-file-reference.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 73a3d1033e..9720a3a1be 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -2324,6 +2324,11 @@ The `memberlist_config` configures the Gossip memberlist. # CLI flag: -memberlist.abort-if-join-fails [abort_if_cluster_join_fails: | default = true] +# If not 0, how often to rejoin the cluster. Occasional rejoin can be useful in +# case of cluster split, and is harmless otherwise. +# CLI flag: -memberlist.rejoin-interval +[rejoin_interval: | default = 0s] + # How long to keep LEFT ingesters in the ring. # CLI flag: -memberlist.left-ingesters-timeout [left_ingesters_timeout: | default = 5m] From cfb4a81058c89d661df935f3d8f6c16e3b78b7fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 16 Jun 2020 13:16:07 +0200 Subject: [PATCH 6/7] Stop the ticker when leaving running method. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/ring/kv/memberlist/memberlist_client.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/ring/kv/memberlist/memberlist_client.go b/pkg/ring/kv/memberlist/memberlist_client.go index cb756d4f52..78bcf63458 100644 --- a/pkg/ring/kv/memberlist/memberlist_client.go +++ b/pkg/ring/kv/memberlist/memberlist_client.go @@ -393,6 +393,8 @@ func (m *KV) running(ctx context.Context) error { var tickerChan <-chan time.Time = nil if m.cfg.RejoinInterval > 0 { t := time.NewTicker(m.cfg.RejoinInterval) + defer t.Stop() + tickerChan = t.C } From 73e752f77f0e566a586e5531a352f20a0aacba64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 16 Jun 2020 13:25:57 +0200 Subject: [PATCH 7/7] Explain when -memberlist.rejoin-interval may be useful. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/configuration/config-file-reference.md | 8 ++++++-- pkg/ring/kv/memberlist/memberlist_client.go | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 9720a3a1be..b9c3a6891f 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -2324,8 +2324,12 @@ The `memberlist_config` configures the Gossip memberlist. # CLI flag: -memberlist.abort-if-join-fails [abort_if_cluster_join_fails: | default = true] -# If not 0, how often to rejoin the cluster. Occasional rejoin can be useful in -# case of cluster split, and is harmless otherwise. +# If not 0, how often to rejoin the cluster. Occasional rejoin can help to fix +# the cluster split issue, and is harmless otherwise. For example when using +# only few components as a seed nodes (via -memberlist.join), then it's +# recommended to use rejoin. If -memberlist.join points to dynamic service that +# resolves to all gossiping nodes (eg. Kubernetes headless service), then rejoin +# is not needed. # CLI flag: -memberlist.rejoin-interval [rejoin_interval: | default = 0s] diff --git a/pkg/ring/kv/memberlist/memberlist_client.go b/pkg/ring/kv/memberlist/memberlist_client.go index 78bcf63458..f9a43506f1 100644 --- a/pkg/ring/kv/memberlist/memberlist_client.go +++ b/pkg/ring/kv/memberlist/memberlist_client.go @@ -169,7 +169,7 @@ func (cfg *KVConfig) RegisterFlags(f *flag.FlagSet, prefix string) { f.DurationVar(&cfg.MaxJoinBackoff, prefix+"memberlist.max-join-backoff", 1*time.Minute, "Max backoff duration to join other cluster members.") f.IntVar(&cfg.MaxJoinRetries, prefix+"memberlist.max-join-retries", 10, "Max number of retries to join other cluster members.") f.BoolVar(&cfg.AbortIfJoinFails, prefix+"memberlist.abort-if-join-fails", true, "If this node fails to join memberlist cluster, abort.") - f.DurationVar(&cfg.RejoinInterval, prefix+"memberlist.rejoin-interval", 0, "If not 0, how often to rejoin the cluster. Occasional rejoin can be useful in case of cluster split, and is harmless otherwise.") + f.DurationVar(&cfg.RejoinInterval, prefix+"memberlist.rejoin-interval", 0, "If not 0, how often to rejoin the cluster. Occasional rejoin can help to fix the cluster split issue, and is harmless otherwise. For example when using only few components as a seed nodes (via -memberlist.join), then it's recommended to use rejoin. If -memberlist.join points to dynamic service that resolves to all gossiping nodes (eg. Kubernetes headless service), then rejoin is not needed.") f.DurationVar(&cfg.LeftIngestersTimeout, prefix+"memberlist.left-ingesters-timeout", 5*time.Minute, "How long to keep LEFT ingesters in the ring.") f.DurationVar(&cfg.LeaveTimeout, prefix+"memberlist.leave-timeout", 5*time.Second, "Timeout for leaving memberlist cluster.") f.DurationVar(&cfg.GossipInterval, prefix+"memberlist.gossip-interval", 0, "How often to gossip. Uses memberlist LAN defaults if 0.")