Skip to content

Commit c53e7f1

Browse files
authored
Decouple blocks meta fetcher metrics mapping from bucket store metrics (#2375)
* De-couple blocks meta fetcher metrics mapping from bucket store metrics Signed-off-by: Marco Pracucci <[email protected]> * Added PR number to CHANGELOG Signed-off-by: Marco Pracucci <[email protected]>
1 parent 5c98262 commit c53e7f1

File tree

7 files changed

+212
-84
lines changed

7 files changed

+212
-84
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@
7373
* [CHANGE] Experimental TSDB: Added `-compactor.deletion-delay`, which is the time to wait before a block marked for deletion is deleted from the bucket. If not 0, blocks will be marked for deletion and the compactor component will delete blocks marked for deletion from the bucket. If the deletion delay is 0, blocks will be deleted straight away. Note that deleting blocks immediately can cause query failures, if the store gateway / querier still has the block loaded, or the compactor is ignoring the deletion because it's compacting the block at the same time. Default value is 48h. #2335
7474
* [CHANGE] Experimental TSDB: Added `-experimental.tsdb.bucket-store.ignore-deletion-marks-delay`, to set duration after which the blocks marked for deletion will be filtered out while fetching blocks used for querying. This option allows querier to ignore blocks that are marked for deletion with some delay. This ensures store can still serve blocks that are meant to be deleted but do not have a replacement yet. Default is 24h, half of the default value for `-compactor.deletion-delay`. #2335
7575
* [CHANGE] Experimental TSDB: Added `-experimental.tsdb.bucket-store.index-cache.memcached.max-item-size` to control maximum size of item that is stored to memcached. Defaults to 1 MiB. #2335
76+
* [CHANGE] Experimental TSDB: renamed blocks meta fetcher metrics: #2375
77+
* `cortex_querier_bucket_store_blocks_meta_syncs_total` > `cortex_querier_blocks_meta_syncs_total`
78+
* `cortex_querier_bucket_store_blocks_meta_sync_failures_total` > `cortex_querier_blocks_meta_sync_failures_total`
79+
* `cortex_querier_bucket_store_blocks_meta_sync_duration_seconds` > `cortex_querier_blocks_meta_sync_duration_seconds`
80+
* `cortex_querier_bucket_store_blocks_meta_sync_consistency_delay_seconds` > `cortex_querier_blocks_meta_sync_consistency_delay_seconds`
7681
* [FEATURE] Added experimental storage API to the ruler service that is enabled when the `-experimental.ruler.enable-api` is set to true #2269
7782
* `-ruler.storage.type` flag now allows `s3`,`gcs`, and `azure` values
7883
* `-ruler.storage.(s3|gcs|azure)` flags exist to allow the configuration of object clients set for rule storage

development/tsdb-blocks-storage-s3/config/cortex.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ tsdb:
3737

3838
bucket_store:
3939
sync_dir: /tmp/cortex-tsdb-querier
40+
consistency_delay: 5s
4041

4142
index_cache:
4243
backend: memcached
@@ -52,7 +53,7 @@ tsdb:
5253

5354
ruler:
5455
enable_api: true
55-
storeconfig:
56+
storage:
5657
type: configdb
5758
configdb:
5859
configs_api_url: http://configstore:80/
@@ -72,4 +73,4 @@ compactor:
7273
host: consul:8500
7374

7475
frontend_worker:
75-
address: "query-frontend:9007"
76+
frontend_address: "query-frontend:9007"

pkg/querier/block_store.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ type UserStore struct {
3737
bucket objstore.Bucket
3838
logLevel logging.Level
3939
bucketStoreMetrics *tsdbBucketStoreMetrics
40+
metaFetcherMetrics *metaFetcherMetrics
4041
indexCacheMetrics prometheus.Collector
4142

4243
// Index cache shared across all tenants.
@@ -61,6 +62,7 @@ func NewUserStore(cfg tsdb.Config, bucketClient objstore.Bucket, logLevel loggin
6162
stores: map[string]*store.BucketStore{},
6263
logLevel: logLevel,
6364
bucketStoreMetrics: newTSDBBucketStoreMetrics(),
65+
metaFetcherMetrics: newMetaFetcherMetrics(),
6466
indexCacheMetrics: tsdb.MustNewIndexCacheMetrics(cfg.BucketStore.IndexCache.Backend, indexCacheRegistry),
6567
syncTimes: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
6668
Name: "cortex_querier_blocks_sync_seconds",
@@ -76,7 +78,7 @@ func NewUserStore(cfg tsdb.Config, bucketClient objstore.Bucket, logLevel loggin
7678
}
7779

7880
if registerer != nil {
79-
registerer.MustRegister(u.bucketStoreMetrics, u.indexCacheMetrics)
81+
registerer.MustRegister(u.bucketStoreMetrics, u.metaFetcherMetrics, u.indexCacheMetrics)
8082
}
8183

8284
u.Service = services.NewBasicService(u.starting, u.syncStoresLoop, nil)
@@ -259,16 +261,16 @@ func (u *UserStore) getOrCreateStore(userID string) (*store.BucketStore, error)
259261

260262
userBkt := tsdb.NewUserBucketClient(userID, u.bucket)
261263

262-
reg := prometheus.NewRegistry()
264+
fetcherReg := prometheus.NewRegistry()
263265
fetcher, err := block.NewMetaFetcher(
264266
userLogger,
265267
u.cfg.BucketStore.MetaSyncConcurrency,
266268
userBkt,
267269
filepath.Join(u.cfg.BucketStore.SyncDir, userID), // The fetcher stores cached metas in the "meta-syncer/" sub directory
268-
reg,
270+
fetcherReg,
269271
[]block.MetadataFilter{
270272
// List of filters to apply (order matters).
271-
block.NewConsistencyDelayMetaFilter(userLogger, u.cfg.BucketStore.ConsistencyDelay, reg),
273+
block.NewConsistencyDelayMetaFilter(userLogger, u.cfg.BucketStore.ConsistencyDelay, fetcherReg),
272274
block.NewIgnoreDeletionMarkFilter(userLogger, userBkt, u.cfg.BucketStore.IgnoreDeletionMarksDelay),
273275
// Filters out duplicate blocks that can be formed from two or more overlapping
274276
// blocks that fully submatches the source blocks of the older blocks.
@@ -279,9 +281,10 @@ func (u *UserStore) getOrCreateStore(userID string) (*store.BucketStore, error)
279281
return nil, err
280282
}
281283

284+
bucketStoreReg := prometheus.NewRegistry()
282285
bs, err = store.NewBucketStore(
283286
userLogger,
284-
reg,
287+
bucketStoreReg,
285288
userBkt,
286289
fetcher,
287290
filepath.Join(u.cfg.BucketStore.SyncDir, userID),
@@ -301,7 +304,8 @@ func (u *UserStore) getOrCreateStore(userID string) (*store.BucketStore, error)
301304
}
302305

303306
u.stores[userID] = bs
304-
u.bucketStoreMetrics.addUserRegistry(userID, reg)
307+
u.metaFetcherMetrics.addUserRegistry(userID, fetcherReg)
308+
u.bucketStoreMetrics.addUserRegistry(userID, bucketStoreReg)
305309

306310
return bs, nil
307311
}

pkg/querier/block_store_metrics.go

Lines changed: 14 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -16,34 +16,26 @@ type tsdbBucketStoreMetrics struct {
1616
regs map[string]*prometheus.Registry
1717

1818
// exported metrics, gathered from Thanos BucketStore
19-
blockLoads *prometheus.Desc
20-
blockLoadFailures *prometheus.Desc
21-
blockDrops *prometheus.Desc
22-
blockDropFailures *prometheus.Desc
23-
blocksLoaded *prometheus.Desc
24-
seriesDataTouched *prometheus.Desc
25-
seriesDataFetched *prometheus.Desc
26-
seriesDataSizeTouched *prometheus.Desc
27-
seriesDataSizeFetched *prometheus.Desc
28-
seriesBlocksQueried *prometheus.Desc
29-
seriesGetAllDuration *prometheus.Desc
30-
seriesMergeDuration *prometheus.Desc
31-
seriesRefetches *prometheus.Desc
32-
resultSeriesCount *prometheus.Desc
33-
metaSyncs *prometheus.Desc
34-
metaSyncFailures *prometheus.Desc
35-
metaSyncDuration *prometheus.Desc
36-
metaSyncConsistencyDelay *prometheus.Desc
19+
blockLoads *prometheus.Desc
20+
blockLoadFailures *prometheus.Desc
21+
blockDrops *prometheus.Desc
22+
blockDropFailures *prometheus.Desc
23+
blocksLoaded *prometheus.Desc
24+
seriesDataTouched *prometheus.Desc
25+
seriesDataFetched *prometheus.Desc
26+
seriesDataSizeTouched *prometheus.Desc
27+
seriesDataSizeFetched *prometheus.Desc
28+
seriesBlocksQueried *prometheus.Desc
29+
seriesGetAllDuration *prometheus.Desc
30+
seriesMergeDuration *prometheus.Desc
31+
seriesRefetches *prometheus.Desc
32+
resultSeriesCount *prometheus.Desc
3733

3834
cachedPostingsCompressions *prometheus.Desc
3935
cachedPostingsCompressionErrors *prometheus.Desc
4036
cachedPostingsCompressionTimeSeconds *prometheus.Desc
4137
cachedPostingsOriginalSizeBytes *prometheus.Desc
4238
cachedPostingsCompressedSizeBytes *prometheus.Desc
43-
44-
// Ignored:
45-
// blocks_meta_synced
46-
// blocks_meta_modified
4739
}
4840

4941
func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics {
@@ -107,22 +99,6 @@ func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics {
10799
"cortex_querier_bucket_store_series_result_series",
108100
"TSDB: Number of series observed in the final result of a query.",
109101
nil, nil),
110-
metaSyncs: prometheus.NewDesc(
111-
"cortex_querier_bucket_store_blocks_meta_syncs_total",
112-
"TSDB: Total blocks metadata synchronization attempts",
113-
nil, nil),
114-
metaSyncFailures: prometheus.NewDesc(
115-
"cortex_querier_bucket_store_blocks_meta_sync_failures_total",
116-
"TSDB: Total blocks metadata synchronization failures",
117-
nil, nil),
118-
metaSyncDuration: prometheus.NewDesc(
119-
"cortex_querier_bucket_store_blocks_meta_sync_duration_seconds",
120-
"TSDB: Duration of the blocks metadata synchronization in seconds",
121-
nil, nil),
122-
metaSyncConsistencyDelay: prometheus.NewDesc(
123-
"cortex_querier_bucket_store_blocks_meta_sync_consistency_delay_seconds",
124-
"TSDB: Configured consistency delay in seconds.",
125-
nil, nil),
126102

127103
cachedPostingsCompressions: prometheus.NewDesc(
128104
"cortex_querier_bucket_store_cached_postings_compressions_total",
@@ -181,11 +157,6 @@ func (m *tsdbBucketStoreMetrics) Describe(out chan<- *prometheus.Desc) {
181157
out <- m.seriesRefetches
182158
out <- m.resultSeriesCount
183159

184-
out <- m.metaSyncs
185-
out <- m.metaSyncFailures
186-
out <- m.metaSyncDuration
187-
out <- m.metaSyncConsistencyDelay
188-
189160
out <- m.cachedPostingsCompressions
190161
out <- m.cachedPostingsCompressionErrors
191162
out <- m.cachedPostingsCompressionTimeSeconds
@@ -214,11 +185,6 @@ func (m *tsdbBucketStoreMetrics) Collect(out chan<- prometheus.Metric) {
214185
data.SendSumOfCounters(out, m.seriesRefetches, "thanos_bucket_store_series_refetches_total")
215186
data.SendSumOfSummaries(out, m.resultSeriesCount, "thanos_bucket_store_series_result_series")
216187

217-
data.SendSumOfCounters(out, m.metaSyncs, "blocks_meta_syncs_total")
218-
data.SendSumOfCounters(out, m.metaSyncFailures, "blocks_meta_sync_failures_total")
219-
data.SendSumOfHistograms(out, m.metaSyncDuration, "blocks_meta_sync_duration_seconds")
220-
data.SendMaxOfGauges(out, m.metaSyncConsistencyDelay, "consistency_delay_seconds")
221-
222188
data.SendSumOfCountersWithLabels(out, m.cachedPostingsCompressions, "thanos_bucket_store_cached_postings_compressions_total", "op")
223189
data.SendSumOfCountersWithLabels(out, m.cachedPostingsCompressionErrors, "thanos_bucket_store_cached_postings_compression_errors_total", "op")
224190
data.SendSumOfCountersWithLabels(out, m.cachedPostingsCompressionTimeSeconds, "thanos_bucket_store_cached_postings_compression_time_seconds", "op")
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
package querier
2+
3+
import (
4+
"sync"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
8+
"github.com/cortexproject/cortex/pkg/util"
9+
)
10+
11+
// This struct aggregates metrics exported by Thanos MetaFetcher
12+
// and re-exports those aggregates as Cortex metrics.
13+
type metaFetcherMetrics struct {
14+
// Maps userID -> registry
15+
regsMu sync.Mutex
16+
regs map[string]*prometheus.Registry
17+
18+
// Exported metrics, gathered from Thanos MetaFetcher
19+
syncs *prometheus.Desc
20+
syncFailures *prometheus.Desc
21+
syncDuration *prometheus.Desc
22+
syncConsistencyDelay *prometheus.Desc
23+
24+
// Ignored:
25+
// blocks_meta_synced
26+
// blocks_meta_modified
27+
}
28+
29+
func newMetaFetcherMetrics() *metaFetcherMetrics {
30+
return &metaFetcherMetrics{
31+
regs: map[string]*prometheus.Registry{},
32+
33+
syncs: prometheus.NewDesc(
34+
"cortex_querier_blocks_meta_syncs_total",
35+
"Total blocks metadata synchronization attempts",
36+
nil, nil),
37+
syncFailures: prometheus.NewDesc(
38+
"cortex_querier_blocks_meta_sync_failures_total",
39+
"Total blocks metadata synchronization failures",
40+
nil, nil),
41+
syncDuration: prometheus.NewDesc(
42+
"cortex_querier_blocks_meta_sync_duration_seconds",
43+
"Duration of the blocks metadata synchronization in seconds",
44+
nil, nil),
45+
syncConsistencyDelay: prometheus.NewDesc(
46+
"cortex_querier_blocks_meta_sync_consistency_delay_seconds",
47+
"Configured consistency delay in seconds.",
48+
nil, nil),
49+
}
50+
}
51+
52+
func (m *metaFetcherMetrics) addUserRegistry(user string, reg *prometheus.Registry) {
53+
m.regsMu.Lock()
54+
m.regs[user] = reg
55+
m.regsMu.Unlock()
56+
}
57+
58+
func (m *metaFetcherMetrics) registries() map[string]*prometheus.Registry {
59+
regs := map[string]*prometheus.Registry{}
60+
61+
m.regsMu.Lock()
62+
defer m.regsMu.Unlock()
63+
for uid, r := range m.regs {
64+
regs[uid] = r
65+
}
66+
67+
return regs
68+
}
69+
70+
func (m *metaFetcherMetrics) Describe(out chan<- *prometheus.Desc) {
71+
72+
out <- m.syncs
73+
out <- m.syncFailures
74+
out <- m.syncDuration
75+
out <- m.syncConsistencyDelay
76+
}
77+
78+
func (m *metaFetcherMetrics) Collect(out chan<- prometheus.Metric) {
79+
data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries())
80+
81+
data.SendSumOfCounters(out, m.syncs, "blocks_meta_syncs_total")
82+
data.SendSumOfCounters(out, m.syncFailures, "blocks_meta_sync_failures_total")
83+
data.SendSumOfHistograms(out, m.syncDuration, "blocks_meta_sync_duration_seconds")
84+
data.SendMaxOfGauges(out, m.syncConsistencyDelay, "consistency_delay_seconds")
85+
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package querier
2+
3+
import (
4+
"bytes"
5+
"testing"
6+
7+
"github.com/prometheus/client_golang/prometheus"
8+
"github.com/prometheus/client_golang/prometheus/promauto"
9+
"github.com/prometheus/client_golang/prometheus/testutil"
10+
"github.com/stretchr/testify/require"
11+
)
12+
13+
func TestMetaFetcherMetrics(t *testing.T) {
14+
mainReg := prometheus.NewPedanticRegistry()
15+
16+
metrics := newMetaFetcherMetrics()
17+
mainReg.MustRegister(metrics)
18+
19+
metrics.addUserRegistry("user1", populateMetaFetcherMetrics(3))
20+
metrics.addUserRegistry("user2", populateMetaFetcherMetrics(5))
21+
metrics.addUserRegistry("user3", populateMetaFetcherMetrics(7))
22+
23+
//noinspection ALL
24+
err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(`
25+
# HELP cortex_querier_blocks_meta_sync_duration_seconds Duration of the blocks metadata synchronization in seconds
26+
# TYPE cortex_querier_blocks_meta_sync_duration_seconds histogram
27+
cortex_querier_blocks_meta_sync_duration_seconds_bucket{le="0.01"} 0
28+
cortex_querier_blocks_meta_sync_duration_seconds_bucket{le="1"} 0
29+
cortex_querier_blocks_meta_sync_duration_seconds_bucket{le="10"} 3
30+
cortex_querier_blocks_meta_sync_duration_seconds_bucket{le="100"} 3
31+
cortex_querier_blocks_meta_sync_duration_seconds_bucket{le="1000"} 3
32+
cortex_querier_blocks_meta_sync_duration_seconds_bucket{le="+Inf"} 3
33+
cortex_querier_blocks_meta_sync_duration_seconds_sum 9
34+
cortex_querier_blocks_meta_sync_duration_seconds_count 3
35+
36+
# HELP cortex_querier_blocks_meta_sync_failures_total Total blocks metadata synchronization failures
37+
# TYPE cortex_querier_blocks_meta_sync_failures_total counter
38+
cortex_querier_blocks_meta_sync_failures_total 30
39+
40+
# HELP cortex_querier_blocks_meta_syncs_total Total blocks metadata synchronization attempts
41+
# TYPE cortex_querier_blocks_meta_syncs_total counter
42+
cortex_querier_blocks_meta_syncs_total 15
43+
44+
# HELP cortex_querier_blocks_meta_sync_consistency_delay_seconds Configured consistency delay in seconds.
45+
# TYPE cortex_querier_blocks_meta_sync_consistency_delay_seconds gauge
46+
cortex_querier_blocks_meta_sync_consistency_delay_seconds 300
47+
`))
48+
require.NoError(t, err)
49+
}
50+
51+
func populateMetaFetcherMetrics(base float64) *prometheus.Registry {
52+
reg := prometheus.NewRegistry()
53+
m := newMetaFetcherMetricsMock(reg)
54+
55+
m.syncs.Add(base * 1)
56+
m.syncFailures.Add(base * 2)
57+
m.syncDuration.Observe(3)
58+
m.syncConsistencyDelay.Set(300)
59+
60+
return reg
61+
}
62+
63+
type metaFetcherMetricsMock struct {
64+
syncs prometheus.Counter
65+
syncFailures prometheus.Counter
66+
syncDuration prometheus.Histogram
67+
syncConsistencyDelay prometheus.Gauge
68+
}
69+
70+
func newMetaFetcherMetricsMock(reg prometheus.Registerer) *metaFetcherMetricsMock {
71+
var m metaFetcherMetricsMock
72+
73+
m.syncs = promauto.With(reg).NewCounter(prometheus.CounterOpts{
74+
Subsystem: "blocks_meta",
75+
Name: "syncs_total",
76+
Help: "Total blocks metadata synchronization attempts",
77+
})
78+
m.syncFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{
79+
Subsystem: "blocks_meta",
80+
Name: "sync_failures_total",
81+
Help: "Total blocks metadata synchronization failures",
82+
})
83+
m.syncDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
84+
Subsystem: "blocks_meta",
85+
Name: "sync_duration_seconds",
86+
Help: "Duration of the blocks metadata synchronization in seconds",
87+
Buckets: []float64{0.01, 1, 10, 100, 1000},
88+
})
89+
m.syncConsistencyDelay = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
90+
Name: "consistency_delay_seconds",
91+
Help: "Configured consistency delay in seconds.",
92+
})
93+
94+
return &m
95+
}

0 commit comments

Comments
 (0)