diff --git a/CHANGELOG.md b/CHANGELOG.md index 678b8fe1133..8dc2b3498e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ * [ENHANCEMENT] Distributor: Add native histograms max sample size bytes limit validation. #6834 * [ENHANCEMENT] Querier: Support caching parquet labels file in parquet queryable. #6835 * [ENHANCEMENT] Querier: Support query limits in parquet queryable. #6870 +* [ENHANCEMENT] Ring: Add `zone` label to `ring_members` metric. #6900 * [ENHANCEMENT] Ingester: Add new metric `cortex_ingester_push_errors_total` to track reasons for ingester request failures. #6901 * [BUGFIX] Ingester: Avoid error or early throttling when READONLY ingesters are present in the ring #6517 * [BUGFIX] Ingester: Fix labelset data race condition. #6573 diff --git a/pkg/ring/ring.go b/pkg/ring/ring.go index 6235bae797c..92c343d6849 100644 --- a/pkg/ring/ring.go +++ b/pkg/ring/ring.go @@ -201,7 +201,8 @@ type Ring struct { // List of zones for which there's at least 1 instance in the ring. This list is guaranteed // to be sorted alphabetically. - ringZones []string + ringZones []string + previousRingZones []string // Cache of shuffle-sharded subrings per identifier. Invalidated when topology changes. // If set to nil, no caching is done (used by tests, and subrings). 
@@ -262,7 +263,7 @@ func NewWithStoreClientAndStrategy(cfg Config, name, key string, store kv.Client Name: "ring_members", Help: "Number of members in the ring", ConstLabels: map[string]string{"name": name}}, - []string{"state"}), + []string{"state", "zone"}), totalTokensGauge: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ Name: "ring_tokens_total", Help: "Number of tokens in the ring", @@ -362,6 +363,7 @@ func (r *Ring) updateRingState(ringDesc *Desc) { r.ringTokensByZone = ringTokensByZone r.ringInstanceByToken = ringInstanceByToken r.ringInstanceIdByAddr = ringInstanceByAddr + r.previousRingZones = r.ringZones r.ringZones = ringZones r.lastTopologyChange = now if r.shuffledSubringCache != nil { @@ -665,12 +667,19 @@ func (r *Ring) updateRingMetrics(compareResult CompareResult) { return } - numByState := map[string]int{} + numByStateByZone := map[string]map[string]int{} oldestTimestampByState := map[string]int64{} // Initialized to zero so we emit zero-metrics (instead of not emitting anything) for _, s := range []string{unhealthy, ACTIVE.String(), LEAVING.String(), PENDING.String(), JOINING.String(), READONLY.String()} { - numByState[s] = 0 + numByStateByZone[s] = map[string]int{} + // Make sure zones removed since the previous update are reset to zero. + for _, zone := range r.previousRingZones { + numByStateByZone[s][zone] = 0 + } + for _, zone := range r.ringZones { + numByStateByZone[s][zone] = 0 + } oldestTimestampByState[s] = 0 } @@ -679,14 +688,19 @@ func (r *Ring) updateRingMetrics(compareResult CompareResult) { if !r.IsHealthy(&instance, Reporting, r.KVClient.LastUpdateTime(r.key)) { s = unhealthy } - numByState[s]++ + if _, ok := numByStateByZone[s]; !ok { + numByStateByZone[s] = map[string]int{} + } + numByStateByZone[s][instance.Zone]++ if oldestTimestampByState[s] == 0 || instance.Timestamp < oldestTimestampByState[s] { oldestTimestampByState[s] = instance.Timestamp } } - for state, count := range numByState { - r.numMembersGaugeVec.WithLabelValues(state).Set(float64(count)) 
+ for state, zones := range numByStateByZone { + for zone, count := range zones { + r.numMembersGaugeVec.WithLabelValues(state, zone).Set(float64(count)) + } } for state, timestamp := range oldestTimestampByState { r.oldestTimestampGaugeVec.WithLabelValues(state).Set(float64(timestamp)) diff --git a/pkg/ring/ring_test.go b/pkg/ring/ring_test.go index e2f7e0a8d1c..682cb7d942d 100644 --- a/pkg/ring/ring_test.go +++ b/pkg/ring/ring_test.go @@ -3202,12 +3202,12 @@ func TestUpdateMetrics(t *testing.T) { ring_member_ownership_percent{member="B",name="test"} 0.5000000002328306 # HELP ring_members Number of members in the ring # TYPE ring_members gauge - ring_members{name="test",state="ACTIVE"} 2 - ring_members{name="test",state="JOINING"} 0 - ring_members{name="test",state="LEAVING"} 0 - ring_members{name="test",state="PENDING"} 0 - ring_members{name="test",state="READONLY"} 0 - ring_members{name="test",state="Unhealthy"} 0 + ring_members{name="test",state="ACTIVE",zone=""} 2 + ring_members{name="test",state="JOINING",zone=""} 0 + ring_members{name="test",state="LEAVING",zone=""} 0 + ring_members{name="test",state="PENDING",zone=""} 0 + ring_members{name="test",state="READONLY",zone=""} 0 + ring_members{name="test",state="Unhealthy",zone=""} 0 # HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring. 
# TYPE ring_oldest_member_timestamp gauge ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11 @@ -3230,12 +3230,12 @@ func TestUpdateMetrics(t *testing.T) { Expected: ` # HELP ring_members Number of members in the ring # TYPE ring_members gauge - ring_members{name="test",state="ACTIVE"} 2 - ring_members{name="test",state="JOINING"} 0 - ring_members{name="test",state="LEAVING"} 0 - ring_members{name="test",state="PENDING"} 0 - ring_members{name="test",state="READONLY"} 0 - ring_members{name="test",state="Unhealthy"} 0 + ring_members{name="test",state="ACTIVE",zone=""} 2 + ring_members{name="test",state="JOINING",zone=""} 0 + ring_members{name="test",state="LEAVING",zone=""} 0 + ring_members{name="test",state="PENDING",zone=""} 0 + ring_members{name="test",state="READONLY",zone=""} 0 + ring_members{name="test",state="Unhealthy",zone=""} 0 # HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring. # TYPE ring_oldest_member_timestamp gauge ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11 @@ -3310,12 +3310,12 @@ func TestUpdateMetricsWithRemoval(t *testing.T) { ring_member_ownership_percent{member="B",name="test"} 0.5000000002328306 # HELP ring_members Number of members in the ring # TYPE ring_members gauge - ring_members{name="test",state="ACTIVE"} 2 - ring_members{name="test",state="JOINING"} 0 - ring_members{name="test",state="LEAVING"} 0 - ring_members{name="test",state="PENDING"} 0 - ring_members{name="test",state="READONLY"} 0 - ring_members{name="test",state="Unhealthy"} 0 + ring_members{name="test",state="ACTIVE",zone=""} 2 + ring_members{name="test",state="JOINING",zone=""} 0 + ring_members{name="test",state="LEAVING",zone=""} 0 + ring_members{name="test",state="PENDING",zone=""} 0 + ring_members{name="test",state="READONLY",zone=""} 0 + ring_members{name="test",state="Unhealthy",zone=""} 0 # HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring. 
# TYPE ring_oldest_member_timestamp gauge ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11 @@ -3347,12 +3347,130 @@ func TestUpdateMetricsWithRemoval(t *testing.T) { ring_member_ownership_percent{member="A",name="test"} 1 # HELP ring_members Number of members in the ring # TYPE ring_members gauge - ring_members{name="test",state="ACTIVE"} 1 - ring_members{name="test",state="JOINING"} 0 - ring_members{name="test",state="LEAVING"} 0 - ring_members{name="test",state="PENDING"} 0 - ring_members{name="test",state="READONLY"} 0 - ring_members{name="test",state="Unhealthy"} 0 + ring_members{name="test",state="ACTIVE",zone=""} 1 + ring_members{name="test",state="JOINING",zone=""} 0 + ring_members{name="test",state="LEAVING",zone=""} 0 + ring_members{name="test",state="PENDING",zone=""} 0 + ring_members{name="test",state="READONLY",zone=""} 0 + ring_members{name="test",state="Unhealthy",zone=""} 0 + # HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring. + # TYPE ring_oldest_member_timestamp gauge + ring_oldest_member_timestamp{name="test",state="ACTIVE"} 22 + ring_oldest_member_timestamp{name="test",state="JOINING"} 0 + ring_oldest_member_timestamp{name="test",state="LEAVING"} 0 + ring_oldest_member_timestamp{name="test",state="PENDING"} 0 + ring_oldest_member_timestamp{name="test",state="READONLY"} 0 + ring_oldest_member_timestamp{name="test",state="Unhealthy"} 0 + # HELP ring_tokens_owned The number of tokens in the ring owned by the member + # TYPE ring_tokens_owned gauge + ring_tokens_owned{member="A",name="test"} 2 + # HELP ring_tokens_total Number of tokens in the ring + # TYPE ring_tokens_total gauge + ring_tokens_total{name="test"} 2 + `)) + assert.NoError(t, err) +} + +func TestUpdateMetricsWithZone(t *testing.T) { + cfg := Config{ + KVStore: kv.Config{}, + HeartbeatTimeout: 0, // get healthy stats + ReplicationFactor: 3, + ZoneAwarenessEnabled: true, + DetailedMetricsEnabled: true, + } + + registry := prometheus.NewRegistry() + 
+ // create the ring to set up metrics, but do not start + ring, err := NewWithStoreClientAndStrategy(cfg, testRingName, testRingKey, &MockClient{}, NewDefaultReplicationStrategy(), registry, log.NewNopLogger()) + require.NoError(t, err) + + ringDesc := Desc{ + Ingesters: map[string]InstanceDesc{ + "A": {Addr: "127.0.0.1", Timestamp: 22, Zone: "zone1", Tokens: []uint32{math.MaxUint32 / 6, (math.MaxUint32 / 6) * 4}}, + "B": {Addr: "127.0.0.2", Timestamp: 11, Zone: "zone2", Tokens: []uint32{(math.MaxUint32 / 6) * 2, (math.MaxUint32 / 6) * 5}}, + "C": {Addr: "127.0.0.3", Timestamp: 33, Zone: "zone3", Tokens: []uint32{(math.MaxUint32 / 6) * 3, math.MaxUint32}}, + }, + } + ring.updateRingState(&ringDesc) + + err = testutil.GatherAndCompare(registry, bytes.NewBufferString(` + # HELP ring_member_ownership_percent The percent ownership of the ring by member + # TYPE ring_member_ownership_percent gauge + ring_member_ownership_percent{member="A",name="test"} 0.3333333332557231 + ring_member_ownership_percent{member="B",name="test"} 0.3333333330228925 + ring_member_ownership_percent{member="C",name="test"} 0.3333333337213844 + # HELP ring_members Number of members in the ring + # TYPE ring_members gauge + ring_members{name="test",state="ACTIVE",zone="zone1"} 1 + ring_members{name="test",state="ACTIVE",zone="zone2"} 1 + ring_members{name="test",state="ACTIVE",zone="zone3"} 1 + ring_members{name="test",state="JOINING",zone="zone1"} 0 + ring_members{name="test",state="JOINING",zone="zone2"} 0 + ring_members{name="test",state="JOINING",zone="zone3"} 0 + ring_members{name="test",state="LEAVING",zone="zone1"} 0 + ring_members{name="test",state="LEAVING",zone="zone2"} 0 + ring_members{name="test",state="LEAVING",zone="zone3"} 0 + ring_members{name="test",state="PENDING",zone="zone1"} 0 + ring_members{name="test",state="PENDING",zone="zone2"} 0 + ring_members{name="test",state="PENDING",zone="zone3"} 0 + ring_members{name="test",state="READONLY",zone="zone1"} 0 + 
ring_members{name="test",state="READONLY",zone="zone2"} 0 + ring_members{name="test",state="READONLY",zone="zone3"} 0 + ring_members{name="test",state="Unhealthy",zone="zone1"} 0 + ring_members{name="test",state="Unhealthy",zone="zone2"} 0 + ring_members{name="test",state="Unhealthy",zone="zone3"} 0 + # HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring. + # TYPE ring_oldest_member_timestamp gauge + ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11 + ring_oldest_member_timestamp{name="test",state="JOINING"} 0 + ring_oldest_member_timestamp{name="test",state="LEAVING"} 0 + ring_oldest_member_timestamp{name="test",state="PENDING"} 0 + ring_oldest_member_timestamp{name="test",state="READONLY"} 0 + ring_oldest_member_timestamp{name="test",state="Unhealthy"} 0 + # HELP ring_tokens_owned The number of tokens in the ring owned by the member + # TYPE ring_tokens_owned gauge + ring_tokens_owned{member="A",name="test"} 2 + ring_tokens_owned{member="B",name="test"} 2 + ring_tokens_owned{member="C",name="test"} 2 + # HELP ring_tokens_total Number of tokens in the ring + # TYPE ring_tokens_total gauge + ring_tokens_total{name="test"} 6 + `)) + require.NoError(t, err) + + ringDescNew := Desc{ + Ingesters: map[string]InstanceDesc{ + "A": {Addr: "127.0.0.1", Timestamp: 22, Zone: "zone1", Tokens: []uint32{math.MaxUint32 / 6, (math.MaxUint32 / 6) * 4}}, + }, + } + ring.updateRingState(&ringDescNew) + + err = testutil.GatherAndCompare(registry, bytes.NewBufferString(` + # HELP ring_member_ownership_percent The percent ownership of the ring by member + # TYPE ring_member_ownership_percent gauge + ring_member_ownership_percent{member="A",name="test"} 1 + # HELP ring_members Number of members in the ring + # TYPE ring_members gauge + ring_members{name="test",state="ACTIVE",zone="zone1"} 1 + ring_members{name="test",state="ACTIVE",zone="zone2"} 0 + ring_members{name="test",state="ACTIVE",zone="zone3"} 0 + 
ring_members{name="test",state="JOINING",zone="zone1"} 0 + ring_members{name="test",state="JOINING",zone="zone2"} 0 + ring_members{name="test",state="JOINING",zone="zone3"} 0 + ring_members{name="test",state="LEAVING",zone="zone1"} 0 + ring_members{name="test",state="LEAVING",zone="zone2"} 0 + ring_members{name="test",state="LEAVING",zone="zone3"} 0 + ring_members{name="test",state="PENDING",zone="zone1"} 0 + ring_members{name="test",state="PENDING",zone="zone2"} 0 + ring_members{name="test",state="PENDING",zone="zone3"} 0 + ring_members{name="test",state="READONLY",zone="zone1"} 0 + ring_members{name="test",state="READONLY",zone="zone2"} 0 + ring_members{name="test",state="READONLY",zone="zone3"} 0 + ring_members{name="test",state="Unhealthy",zone="zone1"} 0 + ring_members{name="test",state="Unhealthy",zone="zone2"} 0 + ring_members{name="test",state="Unhealthy",zone="zone3"} 0 # HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring. # TYPE ring_oldest_member_timestamp gauge ring_oldest_member_timestamp{name="test",state="ACTIVE"} 22