From 57a3b0c2783ab1e744ba7b614372ab9959446a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 16 Jan 2020 14:49:06 +0100 Subject: [PATCH 01/27] Moved MetricFamiliesPerUser type to util package. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/ingester/metrics.go | 62 +++----------------------------------- pkg/util/metrics_helper.go | 61 +++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 58 deletions(-) create mode 100644 pkg/util/metrics_helper.go diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 7a97258b97b..857435d9f27 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -6,7 +6,6 @@ import ( "github.com/cortexproject/cortex/pkg/util" "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" ) const ( @@ -184,7 +183,7 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) { func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { regs := sm.registries() - data := gatheredMetricsPerUser{} + data := util.NewMetricFamiliersPerUser() for userID, r := range regs { m, err := r.Gather() @@ -193,16 +192,16 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { continue } - data.addGatheredDataForUser(userID, m) + data.AddGatheredDataForUser(userID, m) } // OK, we have it all. Let's build results. 
for metric, desc := range sm.sumCountersGlobally { - out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, data.sumCountersAcrossAllUsers(metric)) + out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, data.SumCountersAcrossAllUsers(metric)) } for metric, desc := range sm.sumCountersPerUser { - userValues := data.sumCountersPerUser(metric) + userValues := data.SumCountersPerUser(metric) for user, val := range userValues { out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, val, user) } @@ -226,56 +225,3 @@ func (sm *tsdbMetrics) setRegistryForUser(userID string, registry *prometheus.Re sm.regs[userID] = registry sm.regsMu.Unlock() } - -func sumCounters(mfs []*dto.MetricFamily) float64 { - result := float64(0) - for _, mf := range mfs { - if mf.Type == nil || *mf.Type != dto.MetricType_COUNTER { - continue - } - - for _, m := range mf.Metric { - if m == nil || m.Counter == nil || m.Counter.Value == nil { - continue - } - - result += *m.Counter.Value - } - } - return result -} - -// first key = userID, second key = metric name. Value = slice of gathered values with the same metric name. -type gatheredMetricsPerUser map[string]map[string][]*dto.MetricFamily - -func (d gatheredMetricsPerUser) addGatheredDataForUser(userID string, metrics []*dto.MetricFamily) { - // first, create new map which maps metric names to a slice of MetricFamily instances. - // That makes it easier to do searches later. 
- perMetricName := map[string][]*dto.MetricFamily{} - - for _, m := range metrics { - if m.Name == nil { - continue - } - perMetricName[*m.Name] = append(perMetricName[*m.Name], m) - } - - d[userID] = perMetricName -} - -func (d gatheredMetricsPerUser) sumCountersAcrossAllUsers(counter string) float64 { - result := float64(0) - for _, perMetric := range d { - result += sumCounters(perMetric[counter]) - } - return result -} - -func (d gatheredMetricsPerUser) sumCountersPerUser(counter string) map[string]float64 { - result := map[string]float64{} - for user, perMetric := range d { - v := sumCounters(perMetric[counter]) - result[user] = v - } - return result -} diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go new file mode 100644 index 00000000000..a6ce2bb298d --- /dev/null +++ b/pkg/util/metrics_helper.go @@ -0,0 +1,61 @@ +package util + +import ( + dto "github.com/prometheus/client_model/go" +) + +// MetricFamiliesPerUser is a collection of metrics gathered via calling Gatherer.Gather() method on different +// gatherers, one per user. +// First key = userID, second key = metric name. +// Value = slice of gathered values with the same metric name. +type MetricFamiliesPerUser map[string]map[string][]*dto.MetricFamily + +func NewMetricFamiliersPerUser() MetricFamiliesPerUser { + return MetricFamiliesPerUser{} +} + +// AddGatheredDataForUser adds user-specific output of Gatherer.Gather method. +func (d MetricFamiliesPerUser) AddGatheredDataForUser(userID string, metrics []*dto.MetricFamily) { + // first, create new map which maps metric names to a slice of MetricFamily instances. + // That makes it easier to do searches later. + perMetricName := map[string][]*dto.MetricFamily{} + + for _, m := range metrics { + if m.Name == nil { + continue + } + perMetricName[*m.Name] = append(perMetricName[*m.Name], m) + } + + d[userID] = perMetricName +} + +// SumCountersAcrossAllUsers returns sum(counter). 
+func (d MetricFamiliesPerUser) SumCountersAcrossAllUsers(counter string) float64 { + result := float64(0) + for _, perMetric := range d { + result += sumCounters(perMetric[counter]) + } + return result +} + +// SumCountersPerUser returns sum(counter) by (userID), where userID will be the map key. +func (d MetricFamiliesPerUser) SumCountersPerUser(counter string) map[string]float64 { + result := map[string]float64{} + for user, perMetric := range d { + v := sumCounters(perMetric[counter]) + result[user] = v + } + return result +} + +func sumCounters(mfs []*dto.MetricFamily) float64 { + result := float64(0) + for _, mf := range mfs { + for _, m := range mf.Metric { + // This works even if m is nil, m.Counter is nil or m.Counter.Value is nil (it returns 0 in those cases) + result += m.GetCounter().GetValue() + } + } + return result +} From c27da490ad387ec4f5de7748b3b389f00147c6f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 16 Jan 2020 15:01:59 +0100 Subject: [PATCH 02/27] Added prometheus registry to block stores. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block_store.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index 1c00a8c05c2..bd593be688c 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -12,6 +12,7 @@ import ( "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" + "github.com/prometheus/client_golang/prometheus" "github.com/thanos-io/thanos/pkg/model" "github.com/thanos-io/thanos/pkg/objstore" "github.com/thanos-io/thanos/pkg/store" @@ -30,6 +31,10 @@ type UserStore struct { stores map[string]*store.BucketStore client storepb.StoreClient logLevel logging.Level + + // Maps userID -> registry + regsMu sync.Mutex + regs map[string]*prometheus.Registry } // NewUserStore returns a new UserStore @@ -124,9 +129,10 @@ func (u *UserStore) syncUserStores(ctx context.Context, f func(context.Context, if err != nil { return err } + reg := prometheus.NewRegistry() bs, err = store.NewBucketStore( u.logger, - nil, + reg, userBkt, filepath.Join(u.cfg.BucketStore.SyncDir, user), indexCache, @@ -147,6 +153,10 @@ func (u *UserStore) syncUserStores(ctx context.Context, f func(context.Context, } u.stores[user] = bs + + u.regsMu.Lock() + u.regs[user] = reg + u.regsMu.Unlock() } wg.Add(1) From 47855685591634ef0628ac8dc07ebc58c18f9d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 09:14:35 +0100 Subject: [PATCH 03/27] Expose first set of metrics from TSDB bucket store. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block.go | 1 + pkg/querier/block_store.go | 90 +++++++++++++++++++++++++++++++++++++- pkg/util/metrics_helper.go | 22 +++++++--- 3 files changed, 107 insertions(+), 6 deletions(-) diff --git a/pkg/querier/block.go b/pkg/querier/block.go index b90360ac25e..f6c7641bd73 100644 --- a/pkg/querier/block.go +++ b/pkg/querier/block.go @@ -46,6 +46,7 @@ func NewBlockQuerier(cfg tsdb.Config, logLevel logging.Level, r prometheus.Regis return nil, err } b.us = us + r.MustRegister(us) level.Info(util.Logger).Log("msg", "synchronizing TSDB blocks for all users") if err := us.InitialSync(context.Background()); err != nil { diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index bd593be688c..4e2ec3c3529 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -10,6 +10,7 @@ import ( "github.com/cortexproject/cortex/pkg/ingester" "github.com/cortexproject/cortex/pkg/storage/tsdb" + "github.com/cortexproject/cortex/pkg/util" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" @@ -35,6 +36,17 @@ type UserStore struct { // Maps userID -> registry regsMu sync.Mutex regs map[string]*prometheus.Registry + + // exported metrics + blockLoads *prometheus.Desc + blockLoadFailures *prometheus.Desc + blockDrops *prometheus.Desc + blockDropFailures *prometheus.Desc + blocksLoaded *prometheus.Desc + + // original metric name -> exported metric + countersMap map[string]*prometheus.Desc + gaugesMap map[string]*prometheus.Desc } // NewUserStore returns a new UserStore @@ -48,8 +60,41 @@ func NewUserStore(cfg tsdb.Config, logLevel logging.Level, logger log.Logger) (* logger: logger, cfg: cfg, bucket: bkt, - stores: make(map[string]*store.BucketStore), + stores: map[string]*store.BucketStore{}, logLevel: logLevel, + regs: map[string]*prometheus.Registry{}, + + blockLoads: 
prometheus.NewDesc( + "cortex_bucket_store_block_loads_total", + "TSDB: Total number of remote block loading attempts.", + nil, nil), + blockLoadFailures: prometheus.NewDesc( + "cortex_bucket_store_block_load_failures_total", + "TSDB: Total number of failed remote block loading attempts.", + nil, nil), + blockDrops: prometheus.NewDesc( + "cortex_bucket_store_block_drops_total", + "TSDB: Total number of local blocks that were dropped.", + nil, nil), + blockDropFailures: prometheus.NewDesc( + "cortex_bucket_store_block_drop_failures_total", + "TSDB: Total number of local blocks that failed to be dropped.", + nil, nil), + blocksLoaded: prometheus.NewDesc( + "cortex_bucket_store_blocks_loaded", + "TSDB: Number of currently loaded blocks.", + nil, nil), + } + + u.countersMap = map[string]*prometheus.Desc{ + "thanos_bucket_store_block_loads_total": u.blockLoads, + "thanos_bucket_store_block_load_failures_total": u.blockLoadFailures, + "thanos_bucket_store_block_drops_total": u.blockDrops, + "thanos_bucket_store_block_drop_failures_total": u.blockDropFailures, + } + + u.gaugesMap = map[string]*prometheus.Desc{ + "thanos_bucket_store_blocks_loaded": u.blocksLoaded, } serv := grpc.NewServer() @@ -254,3 +299,46 @@ func (u *UserStore) LabelValues(ctx context.Context, req *storepb.LabelValuesReq return store.LabelValues(ctx, req) } + +func (u *UserStore) registries() map[string]*prometheus.Registry { + regs := map[string]*prometheus.Registry{} + + u.regsMu.Lock() + defer u.regsMu.Unlock() + for uid, r := range u.regs { + regs[uid] = r + } + + return regs +} + +func (u *UserStore) Describe(out chan<- *prometheus.Desc) { + out <- u.blockLoads + out <- u.blockLoadFailures + out <- u.blockDrops + out <- u.blockDropFailures + out <- u.blocksLoaded + +} + +func (u *UserStore) Collect(out chan<- prometheus.Metric) { + regs := u.registries() + data := util.NewMetricFamiliersPerUser() + + for userID, r := range regs { + m, err := r.Gather() + if err != nil { + 
level.Warn(util.Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) + continue + } + + data.AddGatheredDataForUser(userID, m) + } + + for metric, desc := range u.countersMap { + out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, data.SumCountersAcrossAllUsers(metric)) + } + for metric, desc := range u.gaugesMap { + out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, data.SumGaugesAcrossAllUsers(metric)) + } +} diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index a6ce2bb298d..0ad6d58b30d 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -34,7 +34,7 @@ func (d MetricFamiliesPerUser) AddGatheredDataForUser(userID string, metrics []* func (d MetricFamiliesPerUser) SumCountersAcrossAllUsers(counter string) float64 { result := float64(0) for _, perMetric := range d { - result += sumCounters(perMetric[counter]) + result += sum(perMetric[counter], counterValue) } return result } @@ -43,19 +43,31 @@ func (d MetricFamiliesPerUser) SumCountersAcrossAllUsers(counter string) float64 func (d MetricFamiliesPerUser) SumCountersPerUser(counter string) map[string]float64 { result := map[string]float64{} for user, perMetric := range d { - v := sumCounters(perMetric[counter]) + v := sum(perMetric[counter], counterValue) result[user] = v } return result } -func sumCounters(mfs []*dto.MetricFamily) float64 { +// SumCountersAcrossAllUsers returns sum(counter). 
+func (d MetricFamiliesPerUser) SumGaugesAcrossAllUsers(gauge string) float64 { + result := float64(0) + for _, perMetric := range d { + result += sum(perMetric[gauge], gaugeValue) + } + return result +} + +func sum(mfs []*dto.MetricFamily, fn func(*dto.Metric) float64) float64 { result := float64(0) for _, mf := range mfs { for _, m := range mf.Metric { - // This works even if m is nil, m.Counter is nil or m.Counter.Value is nil (it returns 0 in those cases) - result += m.GetCounter().GetValue() + result += fn(m) } } return result } + +// This works even if m is nil, m.Counter is nil or m.Counter.Value is nil (it returns 0 in those cases) +func counterValue(m *dto.Metric) float64 { return m.GetCounter().GetValue() } +func gaugeValue(m *dto.Metric) float64 { return m.GetGauge().GetValue() } From b93dd89db7af0f411df760c3d97e07dedf62814c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 11:59:41 +0100 Subject: [PATCH 04/27] Support for summaries with labels. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/ingester/metrics.go | 5 +- pkg/querier/block_store.go | 26 ++++++-- pkg/util/metrics_helper.go | 124 +++++++++++++++++++++++++++++++++---- 3 files changed, 138 insertions(+), 17 deletions(-) diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 857435d9f27..69138f6af08 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -187,12 +187,13 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { for userID, r := range regs { m, err := r.Gather() + if err == nil { + err = data.AddGatheredDataForUser(userID, m) + } if err != nil { level.Warn(util.Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) continue } - - data.AddGatheredDataForUser(userID, m) } // OK, we have it all. Let's build results. 
diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index 4e2ec3c3529..029640e4989 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -43,10 +43,12 @@ type UserStore struct { blockDrops *prometheus.Desc blockDropFailures *prometheus.Desc blocksLoaded *prometheus.Desc + seriesDataTouched *prometheus.Desc // original metric name -> exported metric - countersMap map[string]*prometheus.Desc - gaugesMap map[string]*prometheus.Desc + countersMap map[string]*prometheus.Desc + gaugesMap map[string]*prometheus.Desc + summariesMap map[string]*prometheus.Desc } // NewUserStore returns a new UserStore @@ -84,6 +86,10 @@ func NewUserStore(cfg tsdb.Config, logLevel logging.Level, logger log.Logger) (* "cortex_bucket_store_blocks_loaded", "TSDB: Number of currently loaded blocks.", nil, nil), + seriesDataTouched: prometheus.NewDesc( + "thanos_bucket_store_series_data_touched", + "TSDB: How many items of a data type in a block were touched for a single series request.", + []string{"data_type"}, nil), } u.countersMap = map[string]*prometheus.Desc{ @@ -97,6 +103,10 @@ func NewUserStore(cfg tsdb.Config, logLevel logging.Level, logger log.Logger) (* "thanos_bucket_store_blocks_loaded": u.blocksLoaded, } + u.summariesMap = map[string]*prometheus.Desc{ + "thanos_bucket_store_series_data_touched": u.seriesDataTouched, + } + serv := grpc.NewServer() storepb.RegisterStoreServer(serv, u) l, err := net.Listen("tcp", "") @@ -327,12 +337,14 @@ func (u *UserStore) Collect(out chan<- prometheus.Metric) { for userID, r := range regs { m, err := r.Gather() + if err == nil { + err = data.AddGatheredDataForUser(userID, m) + } + if err != nil { level.Warn(util.Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) continue } - - data.AddGatheredDataForUser(userID, m) } for metric, desc := range u.countersMap { @@ -341,4 +353,10 @@ func (u *UserStore) Collect(out chan<- prometheus.Metric) { for metric, desc := range 
u.gaugesMap { out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, data.SumGaugesAcrossAllUsers(metric)) } + for metric, desc := range u.summariesMap { + result := data.SummariersAcrossAllUsers(metric) + for _, sum := range result { + out <- prometheus.MustNewConstSummary(desc, sum.SampleCount, sum.SampleSum, sum.Quantiles, sum.LabelValues...) + } + } } diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index 0ad6d58b30d..5a5e1e5a581 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -1,6 +1,10 @@ package util import ( + "bytes" + "errors" + "fmt" + dto "github.com/prometheus/client_model/go" ) @@ -8,26 +12,35 @@ import ( // gatherers, one per user. // First key = userID, second key = metric name. // Value = slice of gathered values with the same metric name. -type MetricFamiliesPerUser map[string]map[string][]*dto.MetricFamily +type MetricFamiliesPerUser map[string]map[string]*dto.MetricFamily func NewMetricFamiliersPerUser() MetricFamiliesPerUser { return MetricFamiliesPerUser{} } // AddGatheredDataForUser adds user-specific output of Gatherer.Gather method. -func (d MetricFamiliesPerUser) AddGatheredDataForUser(userID string, metrics []*dto.MetricFamily) { +// Gatherer.Gather specifies that there metric families are uniquely named, and we use that fact here. +// If they are not, this method returns error. +func (d MetricFamiliesPerUser) AddGatheredDataForUser(userID string, metrics []*dto.MetricFamily) error { // first, create new map which maps metric names to a slice of MetricFamily instances. // That makes it easier to do searches later. - perMetricName := map[string][]*dto.MetricFamily{} + perMetricName := map[string]*dto.MetricFamily{} for _, m := range metrics { - if m.Name == nil { - continue + name := m.GetName() + // these errors should never happen when passing Gatherer.Gather() output. 
+ if name == "" { + return errors.New("empty name for metric family") + } + if perMetricName[name] != nil { + return fmt.Errorf("non-unique name for metric family: %q", name) } - perMetricName[*m.Name] = append(perMetricName[*m.Name], m) + + perMetricName[name] = m } d[userID] = perMetricName + return nil } // SumCountersAcrossAllUsers returns sum(counter). @@ -58,13 +71,102 @@ func (d MetricFamiliesPerUser) SumGaugesAcrossAllUsers(gauge string) float64 { return result } -func sum(mfs []*dto.MetricFamily, fn func(*dto.Metric) float64) float64 { - result := float64(0) - for _, mf := range mfs { - for _, m := range mf.Metric { - result += fn(m) +type SummaryResult struct { + SampleCount uint64 + SampleSum float64 + Quantiles map[float64]float64 + LabelValues []string +} + +func (d MetricFamiliesPerUser) SummariersAcrossAllUsers(metricName string, labelNames ...string) []SummaryResult { + result := map[string]SummaryResult{} + + for _, perMetric := range d { // for each user + mf := perMetric[metricName] + for _, m := range mf.GetMetric() { + lbls, include := getLabelValues(m, labelNames) + if !include { + continue + } + + key := getLabelsString(lbls) + r := result[key] + if r.LabelValues == nil { + r.LabelValues = lbls + } + summary := m.GetSummary() + r.SampleCount += summary.GetSampleCount() + r.SampleSum += summary.GetSampleSum() + r.Quantiles = mergeSummaryQuantiles(r.Quantiles, summary.GetQuantile()) + + result[key] = r } } + + out := make([]SummaryResult, 0, len(result)) + for _, sr := range result { + out = append(out, sr) + } + return out +} + +func mergeSummaryQuantiles(quantiles map[float64]float64, summaryQuantiles []*dto.Quantile) map[float64]float64 { + if len(summaryQuantiles) == 0 { + return quantiles + } + + out := quantiles + if out == nil { + out = map[float64]float64{} + } + + for _, q := range summaryQuantiles { + // we assume that all summaries have same quantiles + out[q.GetQuantile()] += q.GetValue() + } + return out +} + +func getLabelValues(m 
*dto.Metric, labelNames []string) ([]string, bool) { + if len(labelNames) == 0 { + return nil, true + } + + all := map[string]string{} + for _, lp := range m.GetLabel() { + all[lp.GetName()] = lp.GetValue() + } + + result := make([]string, 0, len(labelNames)) + for _, ln := range labelNames { + lv, ok := all[ln] + if !ok { + // required labels not found + return nil, false + } + result = append(result, lv) + } + return result, true +} + +func getLabelsString(labelValues []string) string { + if len(labelValues) == 0 { + return "" + } + + buf := bytes.Buffer{} + for _, v := range labelValues { + buf.WriteString(v) + buf.WriteByte(0) // separator, not used in prometheus labels + } + return buf.String() +} + +func sum(mf *dto.MetricFamily, fn func(*dto.Metric) float64) float64 { + result := float64(0) + for _, m := range mf.Metric { + result += fn(m) + } return result } From 2834f8dca254560819330b2cc1d0e41e850ecead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 13:21:38 +0100 Subject: [PATCH 05/27] Send data directly to channel to avoid allocating extra slices/maps just to return results. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/ingester/metrics.go | 31 +++++---------------- pkg/querier/block_store.go | 40 ++++++--------------------- pkg/util/metrics_helper.go | 55 +++++++++++++++++--------------------- 3 files changed, 39 insertions(+), 87 deletions(-) diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 69138f6af08..7d6d71c8a21 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -121,10 +121,6 @@ type tsdbMetrics struct { memSeriesCreatedTotal *prometheus.Desc memSeriesRemovedTotal *prometheus.Desc - // These maps drive the collection output. Key = original metric name to group. 
- sumCountersGlobally map[string]*prometheus.Desc - sumCountersPerUser map[string]*prometheus.Desc - regsMu sync.RWMutex // custom mutex for shipper registry, to avoid blocking main user state mutex on collection regs map[string]*prometheus.Registry // One prometheus registry per tenant } @@ -154,18 +150,6 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics { memSeriesRemovedTotal: prometheus.NewDesc(memSeriesRemovedTotalName, memSeriesRemovedTotalHelp, []string{"user"}, nil), } - m.sumCountersGlobally = map[string]*prometheus.Desc{ - "thanos_shipper_dir_syncs_total": m.dirSyncs, - "thanos_shipper_dir_sync_failures_total": m.dirSyncFailures, - "thanos_shipper_uploads_total": m.uploads, - "thanos_shipper_upload_failures_total": m.uploadFailures, - } - - m.sumCountersPerUser = map[string]*prometheus.Desc{ - "prometheus_tsdb_head_series_created_total": m.memSeriesCreatedTotal, - "prometheus_tsdb_head_series_removed_total": m.memSeriesRemovedTotal, - } - if r != nil { r.MustRegister(m) } @@ -197,16 +181,13 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { } // OK, we have it all. Let's build results. 
- for metric, desc := range sm.sumCountersGlobally { - out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, data.SumCountersAcrossAllUsers(metric)) - } + data.SendSumOfCounters(out, sm.dirSyncs, "thanos_shipper_dir_syncs_total") + data.SendSumOfCounters(out, sm.dirSyncFailures, "thanos_shipper_dir_sync_failures_total") + data.SendSumOfCounters(out, sm.uploads, "thanos_shipper_uploads_total") + data.SendSumOfCounters(out, sm.uploadFailures, "thanos_shipper_upload_failures_total") - for metric, desc := range sm.sumCountersPerUser { - userValues := data.SumCountersPerUser(metric) - for user, val := range userValues { - out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, val, user) - } - } + data.SendSumOfCountersPerUser(out, sm.memSeriesCreatedTotal, "prometheus_tsdb_head_series_created_total") + data.SendSumOfCountersPerUser(out, sm.memSeriesRemovedTotal, "prometheus_tsdb_head_series_removed_total") } // make a copy of the map, so that metrics can be gathered while the new registry is being added. 
diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index 029640e4989..f3135a39da3 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -44,11 +44,6 @@ type UserStore struct { blockDropFailures *prometheus.Desc blocksLoaded *prometheus.Desc seriesDataTouched *prometheus.Desc - - // original metric name -> exported metric - countersMap map[string]*prometheus.Desc - gaugesMap map[string]*prometheus.Desc - summariesMap map[string]*prometheus.Desc } // NewUserStore returns a new UserStore @@ -92,21 +87,6 @@ func NewUserStore(cfg tsdb.Config, logLevel logging.Level, logger log.Logger) (* []string{"data_type"}, nil), } - u.countersMap = map[string]*prometheus.Desc{ - "thanos_bucket_store_block_loads_total": u.blockLoads, - "thanos_bucket_store_block_load_failures_total": u.blockLoadFailures, - "thanos_bucket_store_block_drops_total": u.blockDrops, - "thanos_bucket_store_block_drop_failures_total": u.blockDropFailures, - } - - u.gaugesMap = map[string]*prometheus.Desc{ - "thanos_bucket_store_blocks_loaded": u.blocksLoaded, - } - - u.summariesMap = map[string]*prometheus.Desc{ - "thanos_bucket_store_series_data_touched": u.seriesDataTouched, - } - serv := grpc.NewServer() storepb.RegisterStoreServer(serv, u) l, err := net.Listen("tcp", "") @@ -347,16 +327,12 @@ func (u *UserStore) Collect(out chan<- prometheus.Metric) { } } - for metric, desc := range u.countersMap { - out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, data.SumCountersAcrossAllUsers(metric)) - } - for metric, desc := range u.gaugesMap { - out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, data.SumGaugesAcrossAllUsers(metric)) - } - for metric, desc := range u.summariesMap { - result := data.SummariersAcrossAllUsers(metric) - for _, sum := range result { - out <- prometheus.MustNewConstSummary(desc, sum.SampleCount, sum.SampleSum, sum.Quantiles, sum.LabelValues...) 
- } - } + data.SendSumOfCounters(out, u.blockLoads, "thanos_bucket_store_block_loads_total") + data.SendSumOfCounters(out, u.blockLoadFailures, "thanos_bucket_store_block_load_failures_total") + data.SendSumOfCounters(out, u.blockDrops, "thanos_bucket_store_block_drops_total") + data.SendSumOfCounters(out, u.blockDropFailures, "thanos_bucket_store_block_drop_failures_total") + + data.SendSumOfGauges(out, u.blocksLoaded, "thanos_bucket_store_blocks_loaded") + + data.SendSumOfSummaries(out, u.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type") } diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index 5a5e1e5a581..cbdac5d16c4 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" + "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" ) @@ -22,8 +23,7 @@ func NewMetricFamiliersPerUser() MetricFamiliesPerUser { // Gatherer.Gather specifies that there metric families are uniquely named, and we use that fact here. // If they are not, this method returns error. func (d MetricFamiliesPerUser) AddGatheredDataForUser(userID string, metrics []*dto.MetricFamily) error { - // first, create new map which maps metric names to a slice of MetricFamily instances. - // That makes it easier to do searches later. + // Keeping map of metric name to its family makes it easier to do searches later. perMetricName := map[string]*dto.MetricFamily{} for _, m := range metrics { @@ -43,46 +43,43 @@ func (d MetricFamiliesPerUser) AddGatheredDataForUser(userID string, metrics []* return nil } -// SumCountersAcrossAllUsers returns sum(counter). 
-func (d MetricFamiliesPerUser) SumCountersAcrossAllUsers(counter string) float64 { +func (d MetricFamiliesPerUser) SendSumOfCounters(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string) { result := float64(0) for _, perMetric := range d { result += sum(perMetric[counter], counterValue) } - return result + + out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, result) } -// SumCountersPerUser returns sum(counter) by (userID), where userID will be the map key. -func (d MetricFamiliesPerUser) SumCountersPerUser(counter string) map[string]float64 { - result := map[string]float64{} +func (d MetricFamiliesPerUser) SendSumOfCountersPerUser(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string) { for user, perMetric := range d { v := sum(perMetric[counter], counterValue) - result[user] = v + + out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, v, user) } - return result } -// SumCountersAcrossAllUsers returns sum(counter). -func (d MetricFamiliesPerUser) SumGaugesAcrossAllUsers(gauge string) float64 { +func (d MetricFamiliesPerUser) SendSumOfGauges(out chan<- prometheus.Metric, desc *prometheus.Desc, gauge string) { result := float64(0) for _, perMetric := range d { result += sum(perMetric[gauge], gaugeValue) } - return result + out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, result) } -type SummaryResult struct { - SampleCount uint64 - SampleSum float64 - Quantiles map[float64]float64 - LabelValues []string -} +func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, desc *prometheus.Desc, summaryName string, labelNames ...string) { + type summaryResult struct { + sampleCount uint64 + sampleSum float64 + quantiles map[float64]float64 + labelValues []string + } -func (d MetricFamiliesPerUser) SummariersAcrossAllUsers(metricName string, labelNames ...string) []SummaryResult { - result := map[string]SummaryResult{} + result := map[string]summaryResult{} for _, perMetric := 
range d { // for each user - mf := perMetric[metricName] + mf := perMetric[summaryName] for _, m := range mf.GetMetric() { lbls, include := getLabelValues(m, labelNames) if !include { @@ -91,23 +88,21 @@ func (d MetricFamiliesPerUser) SummariersAcrossAllUsers(metricName string, label key := getLabelsString(lbls) r := result[key] - if r.LabelValues == nil { - r.LabelValues = lbls + if r.labelValues == nil { + r.labelValues = lbls } summary := m.GetSummary() - r.SampleCount += summary.GetSampleCount() - r.SampleSum += summary.GetSampleSum() - r.Quantiles = mergeSummaryQuantiles(r.Quantiles, summary.GetQuantile()) + r.sampleCount += summary.GetSampleCount() + r.sampleSum += summary.GetSampleSum() + r.quantiles = mergeSummaryQuantiles(r.quantiles, summary.GetQuantile()) result[key] = r } } - out := make([]SummaryResult, 0, len(result)) for _, sr := range result { - out = append(out, sr) + out <- prometheus.MustNewConstSummary(desc, sr.sampleCount, sr.sampleSum, sr.quantiles, sr.labelValues...) } - return out } func mergeSummaryQuantiles(quantiles map[float64]float64, summaryQuantiles []*dto.Quantile) map[float64]float64 { From 520911f29ac5b41100a75020308d7147bca56805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 13:26:51 +0100 Subject: [PATCH 06/27] Added more summaries. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block_store.go | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index f3135a39da3..19f04121b74 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -38,12 +38,16 @@ type UserStore struct { regs map[string]*prometheus.Registry // exported metrics - blockLoads *prometheus.Desc - blockLoadFailures *prometheus.Desc - blockDrops *prometheus.Desc - blockDropFailures *prometheus.Desc - blocksLoaded *prometheus.Desc - seriesDataTouched *prometheus.Desc + blockLoads *prometheus.Desc + blockLoadFailures *prometheus.Desc + blockDrops *prometheus.Desc + blockDropFailures *prometheus.Desc + blocksLoaded *prometheus.Desc + seriesDataTouched *prometheus.Desc + seriesDataFetched *prometheus.Desc + seriesDataSizeTouched *prometheus.Desc + seriesDataSizeFetched *prometheus.Desc + seriesBlocksQueried *prometheus.Desc } // NewUserStore returns a new UserStore @@ -82,9 +86,25 @@ func NewUserStore(cfg tsdb.Config, logLevel logging.Level, logger log.Logger) (* "TSDB: Number of currently loaded blocks.", nil, nil), seriesDataTouched: prometheus.NewDesc( - "thanos_bucket_store_series_data_touched", + "cortex_bucket_store_series_data_touched", "TSDB: How many items of a data type in a block were touched for a single series request.", []string{"data_type"}, nil), + seriesDataFetched: prometheus.NewDesc( + "cortex_bucket_store_series_data_fetched", + "TSDB: How many items of a data type in a block were fetched for a single series request.", + []string{"data_type"}, nil), + seriesDataSizeTouched: prometheus.NewDesc( + "cortex_bucket_store_series_data_size_touched_bytes", + "TSDB: Size of all items of a data type in a block were touched for a single series request.", + []string{"data_type"}, nil), + seriesDataSizeFetched: 
prometheus.NewDesc( + "cortex_bucket_store_series_data_size_fetched_bytes", + "TSDB: Size of all items of a data type in a block were fetched for a single series request.", + []string{"data_type"}, nil), + seriesBlocksQueried: prometheus.NewDesc( + "cortex_bucket_store_series_blocks_queried", + "TSDB: Number of blocks in a bucket store that were touched to satisfy a query.", + nil, nil), } serv := grpc.NewServer() @@ -335,4 +355,8 @@ func (u *UserStore) Collect(out chan<- prometheus.Metric) { data.SendSumOfGauges(out, u.blocksLoaded, "thanos_bucket_store_blocks_loaded") data.SendSumOfSummaries(out, u.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type") + data.SendSumOfSummaries(out, u.seriesDataFetched, "thanos_bucket_store_series_data_fetched", "data_type") + data.SendSumOfSummaries(out, u.seriesDataSizeTouched, "thanos_bucket_store_series_data_size_touched_bytes", "data_type") + data.SendSumOfSummaries(out, u.seriesDataSizeFetched, "thanos_bucket_store_series_data_size_fetched_bytes", "data_type") + data.SendSumOfSummaries(out, u.seriesBlocksQueried, "thanos_bucket_store_series_blocks_queried") } From 7e3a2d5240cb2146916360056c89a5302ee11929 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 14:09:55 +0100 Subject: [PATCH 07/27] Added remaining metrics from TSDB Bucket Store. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block_store.go | 46 ++++++++++++-- pkg/util/metrics_helper.go | 120 +++++++++++++++++++++++++++++-------- 2 files changed, 135 insertions(+), 31 deletions(-) diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index 19f04121b74..a3722997b91 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -48,6 +48,10 @@ type UserStore struct { seriesDataSizeTouched *prometheus.Desc seriesDataSizeFetched *prometheus.Desc seriesBlocksQueried *prometheus.Desc + seriesGetAllDuration *prometheus.Desc + seriesMergeDuration *prometheus.Desc + resultSeriesCount *prometheus.Desc + chunkSizeBytes *prometheus.Desc } // NewUserStore returns a new UserStore @@ -105,6 +109,23 @@ func NewUserStore(cfg tsdb.Config, logLevel logging.Level, logger log.Logger) (* "cortex_bucket_store_series_blocks_queried", "TSDB: Number of blocks in a bucket store that were touched to satisfy a query.", nil, nil), + + seriesGetAllDuration: prometheus.NewDesc( + "thanos_bucket_store_series_get_all_duration_seconds", + "TSDB: Time it takes until all per-block prepares and preloads for a query are finished.", + nil, nil), + seriesMergeDuration: prometheus.NewDesc( + "thanos_bucket_store_series_merge_duration_seconds", + "TSDB: Time it takes to merge sub-results from all queried blocks into a single result.", + nil, nil), + resultSeriesCount: prometheus.NewDesc( + "thanos_bucket_store_series_result_series", + "Number of series observed in the final result of a query.", + nil, nil), + chunkSizeBytes: prometheus.NewDesc( + "thanos_bucket_store_sent_chunk_size_bytes", + "TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", + nil, nil), } serv := grpc.NewServer() @@ -328,7 +349,15 @@ func (u *UserStore) Describe(out chan<- *prometheus.Desc) { out <- u.blockDrops out <- 
u.blockDropFailures out <- u.blocksLoaded - + out <- u.seriesDataTouched + out <- u.seriesDataFetched + out <- u.seriesDataSizeTouched + out <- u.seriesDataSizeFetched + out <- u.seriesBlocksQueried + out <- u.seriesGetAllDuration + out <- u.seriesMergeDuration + out <- u.resultSeriesCount + out <- u.chunkSizeBytes } func (u *UserStore) Collect(out chan<- prometheus.Metric) { @@ -354,9 +383,14 @@ func (u *UserStore) Collect(out chan<- prometheus.Metric) { data.SendSumOfGauges(out, u.blocksLoaded, "thanos_bucket_store_blocks_loaded") - data.SendSumOfSummaries(out, u.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type") - data.SendSumOfSummaries(out, u.seriesDataFetched, "thanos_bucket_store_series_data_fetched", "data_type") - data.SendSumOfSummaries(out, u.seriesDataSizeTouched, "thanos_bucket_store_series_data_size_touched_bytes", "data_type") - data.SendSumOfSummaries(out, u.seriesDataSizeFetched, "thanos_bucket_store_series_data_size_fetched_bytes", "data_type") - data.SendSumOfSummaries(out, u.seriesBlocksQueried, "thanos_bucket_store_series_blocks_queried") + data.SendSumOfSummariesWithLabels(out, u.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type") + data.SendSumOfSummariesWithLabels(out, u.seriesDataFetched, "thanos_bucket_store_series_data_fetched", "data_type") + data.SendSumOfSummariesWithLabels(out, u.seriesDataSizeTouched, "thanos_bucket_store_series_data_size_touched_bytes", "data_type") + data.SendSumOfSummariesWithLabels(out, u.seriesDataSizeFetched, "thanos_bucket_store_series_data_size_fetched_bytes", "data_type") + data.SendSumOfSummariesWithLabels(out, u.seriesBlocksQueried, "thanos_bucket_store_series_blocks_queried") + + data.SendSumOfHistograms(out, u.seriesGetAllDuration, "thanos_bucket_store_series_get_all_duration_seconds") + data.SendSumOfHistograms(out, u.seriesMergeDuration, "thanos_bucket_store_series_merge_duration_seconds") + data.SendSumOfSummaries(out, u.resultSeriesCount, 
"thanos_bucket_store_series_result_series") + data.SendSumOfHistograms(out, u.chunkSizeBytes, "thanos_bucket_store_sent_chunk_size_bytes") } diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index cbdac5d16c4..fe64eeb485d 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -68,7 +68,26 @@ func (d MetricFamiliesPerUser) SendSumOfGauges(out chan<- prometheus.Metric, des out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, result) } -func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, desc *prometheus.Desc, summaryName string, labelNames ...string) { +func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, desc *prometheus.Desc, summaryName string) { + var ( + sampleCount uint64 + sampleSum float64 + quantiles map[float64]float64 + ) + + for _, userMetrics := range d { // for each user + for _, m := range userMetrics[summaryName].GetMetric() { + summary := m.GetSummary() + sampleCount += summary.GetSampleCount() + sampleSum += summary.GetSampleSum() + quantiles = mergeSummaryQuantiles(quantiles, summary.GetQuantile()) + } + } + + out <- prometheus.MustNewConstSummary(desc, sampleCount, sampleSum, quantiles) +} + +func (d MetricFamiliesPerUser) SendSumOfSummariesWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, summaryName string, labelNames ...string) { type summaryResult struct { sampleCount uint64 sampleSum float64 @@ -78,25 +97,23 @@ func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, result := map[string]summaryResult{} - for _, perMetric := range d { // for each user - mf := perMetric[summaryName] - for _, m := range mf.GetMetric() { - lbls, include := getLabelValues(m, labelNames) - if !include { - continue - } + for _, userMetrics := range d { // for each user + metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[summaryName], labelNames) - key := getLabelsString(lbls) - r := result[key] - if 
r.labelValues == nil { - r.labelValues = lbls - } - summary := m.GetSummary() - r.sampleCount += summary.GetSampleCount() - r.sampleSum += summary.GetSampleSum() - r.quantiles = mergeSummaryQuantiles(r.quantiles, summary.GetQuantile()) + for key, mwl := range metricsPerLabelValue { + for _, m := range mwl.metrics { + r := result[key] + if r.labelValues == nil { + r.labelValues = mwl.labelValues + } + + summary := m.GetSummary() + r.sampleCount += summary.GetSampleCount() + r.sampleSum += summary.GetSampleSum() + r.quantiles = mergeSummaryQuantiles(r.quantiles, summary.GetQuantile()) - result[key] = r + result[key] = r + } } } @@ -105,6 +122,25 @@ func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, } } +func (d MetricFamiliesPerUser) SendSumOfHistograms(out chan<- prometheus.Metric, desc *prometheus.Desc, histogramName string) { + var ( + sampleCount uint64 + sampleSum float64 + buckets map[float64]uint64 + ) + + for _, userMetrics := range d { // for each user + for _, m := range userMetrics[histogramName].GetMetric() { + histo := m.GetHistogram() + sampleCount += histo.GetSampleCount() + sampleSum += histo.GetSampleSum() + buckets = mergeHistogramBuckets(buckets, histo.GetBucket()) + } + } + + out <- prometheus.MustNewConstHistogram(desc, sampleCount, sampleSum, buckets) +} + func mergeSummaryQuantiles(quantiles map[float64]float64, summaryQuantiles []*dto.Quantile) map[float64]float64 { if len(summaryQuantiles) == 0 { return quantiles @@ -122,11 +158,49 @@ func mergeSummaryQuantiles(quantiles map[float64]float64, summaryQuantiles []*dt return out } -func getLabelValues(m *dto.Metric, labelNames []string) ([]string, bool) { - if len(labelNames) == 0 { - return nil, true +func mergeHistogramBuckets(buckets map[float64]uint64, histogramBuckets []*dto.Bucket) map[float64]uint64 { + if len(histogramBuckets) == 0 { + return buckets + } + + out := buckets + if out == nil { + out = map[float64]uint64{} + } + + for _, q := range 
histogramBuckets { + // we assume that all histograms have same buckets + out[q.GetUpperBound()] += q.GetCumulativeCount() } + return out +} + +type metricsWithLabels struct { + labelValues []string + metrics []*dto.Metric +} + +func getMetricsWithLabelNames(mf *dto.MetricFamily, labelNames []string) map[string]metricsWithLabels { + result := map[string]metricsWithLabels{} + for _, m := range mf.GetMetric() { + lbls, include := getLabelValues(m, labelNames) + if !include { + continue + } + + key := getLabelsString(lbls) + r := result[key] + if r.labelValues == nil { + r.labelValues = lbls + } + r.metrics = append(r.metrics, m) + result[key] = r + } + return result +} + +func getLabelValues(m *dto.Metric, labelNames []string) ([]string, bool) { all := map[string]string{} for _, lp := range m.GetLabel() { all[lp.GetName()] = lp.GetValue() @@ -145,10 +219,6 @@ func getLabelValues(m *dto.Metric, labelNames []string) ([]string, bool) { } func getLabelsString(labelValues []string) string { - if len(labelValues) == 0 { - return "" - } - buf := bytes.Buffer{} for _, v := range labelValues { buf.WriteString(v) From db6808d1b7c8913f5c8729113836f2c55d49396f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 14:17:34 +0100 Subject: [PATCH 08/27] Extracted TSDB bucket store metrics into separate type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block_store.go | 173 +++-------------------------- pkg/querier/block_store_metrics.go | 167 ++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 159 deletions(-) create mode 100644 pkg/querier/block_store_metrics.go diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index a3722997b91..dac9d425f55 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -10,7 +10,6 @@ import ( "github.com/cortexproject/cortex/pkg/ingester" 
"github.com/cortexproject/cortex/pkg/storage/tsdb" - "github.com/cortexproject/cortex/pkg/util" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" @@ -26,32 +25,13 @@ import ( // UserStore is a multi-tenant version of Thanos BucketStore type UserStore struct { - logger log.Logger - cfg tsdb.Config - bucket objstore.BucketReader - stores map[string]*store.BucketStore - client storepb.StoreClient - logLevel logging.Level - - // Maps userID -> registry - regsMu sync.Mutex - regs map[string]*prometheus.Registry - - // exported metrics - blockLoads *prometheus.Desc - blockLoadFailures *prometheus.Desc - blockDrops *prometheus.Desc - blockDropFailures *prometheus.Desc - blocksLoaded *prometheus.Desc - seriesDataTouched *prometheus.Desc - seriesDataFetched *prometheus.Desc - seriesDataSizeTouched *prometheus.Desc - seriesDataSizeFetched *prometheus.Desc - seriesBlocksQueried *prometheus.Desc - seriesGetAllDuration *prometheus.Desc - seriesMergeDuration *prometheus.Desc - resultSeriesCount *prometheus.Desc - chunkSizeBytes *prometheus.Desc + logger log.Logger + cfg tsdb.Config + bucket objstore.BucketReader + stores map[string]*store.BucketStore + client storepb.StoreClient + logLevel logging.Level + tsdbMetrics *tsdbBucketStoreMetrics } // NewUserStore returns a new UserStore @@ -62,70 +42,12 @@ func NewUserStore(cfg tsdb.Config, logLevel logging.Level, logger log.Logger) (* } u := &UserStore{ - logger: logger, - cfg: cfg, - bucket: bkt, - stores: map[string]*store.BucketStore{}, - logLevel: logLevel, - regs: map[string]*prometheus.Registry{}, - - blockLoads: prometheus.NewDesc( - "cortex_bucket_store_block_loads_total", - "TSDB: Total number of remote block loading attempts.", - nil, nil), - blockLoadFailures: prometheus.NewDesc( - "cortex_bucket_store_block_load_failures_total", - "TSDB: Total number of failed remote block loading attempts.", - nil, nil), - blockDrops: prometheus.NewDesc( - 
"cortex_bucket_store_block_drops_total", - "TSDB: Total number of local blocks that were dropped.", - nil, nil), - blockDropFailures: prometheus.NewDesc( - "cortex_bucket_store_block_drop_failures_total", - "TSDB: Total number of local blocks that failed to be dropped.", - nil, nil), - blocksLoaded: prometheus.NewDesc( - "cortex_bucket_store_blocks_loaded", - "TSDB: Number of currently loaded blocks.", - nil, nil), - seriesDataTouched: prometheus.NewDesc( - "cortex_bucket_store_series_data_touched", - "TSDB: How many items of a data type in a block were touched for a single series request.", - []string{"data_type"}, nil), - seriesDataFetched: prometheus.NewDesc( - "cortex_bucket_store_series_data_fetched", - "TSDB: How many items of a data type in a block were fetched for a single series request.", - []string{"data_type"}, nil), - seriesDataSizeTouched: prometheus.NewDesc( - "cortex_bucket_store_series_data_size_touched_bytes", - "TSDB: Size of all items of a data type in a block were touched for a single series request.", - []string{"data_type"}, nil), - seriesDataSizeFetched: prometheus.NewDesc( - "cortex_bucket_store_series_data_size_fetched_bytes", - "TSDB: Size of all items of a data type in a block were fetched for a single series request.", - []string{"data_type"}, nil), - seriesBlocksQueried: prometheus.NewDesc( - "cortex_bucket_store_series_blocks_queried", - "TSDB: Number of blocks in a bucket store that were touched to satisfy a query.", - nil, nil), - - seriesGetAllDuration: prometheus.NewDesc( - "thanos_bucket_store_series_get_all_duration_seconds", - "TSDB: Time it takes until all per-block prepares and preloads for a query are finished.", - nil, nil), - seriesMergeDuration: prometheus.NewDesc( - "thanos_bucket_store_series_merge_duration_seconds", - "TSDB: Time it takes to merge sub-results from all queried blocks into a single result.", - nil, nil), - resultSeriesCount: prometheus.NewDesc( - "thanos_bucket_store_series_result_series", - "Number of 
series observed in the final result of a query.", - nil, nil), - chunkSizeBytes: prometheus.NewDesc( - "thanos_bucket_store_sent_chunk_size_bytes", - "TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", - nil, nil), + logger: logger, + cfg: cfg, + bucket: bkt, + stores: map[string]*store.BucketStore{}, + logLevel: logLevel, + tsdbMetrics: newTSDBBucketStoreMetrics(), } serv := grpc.NewServer() @@ -229,10 +151,7 @@ func (u *UserStore) syncUserStores(ctx context.Context, f func(context.Context, } u.stores[user] = bs - - u.regsMu.Lock() - u.regs[user] = reg - u.regsMu.Unlock() + u.tsdbMetrics.addUserRegistry(user, reg) } wg.Add(1) @@ -330,67 +249,3 @@ func (u *UserStore) LabelValues(ctx context.Context, req *storepb.LabelValuesReq return store.LabelValues(ctx, req) } - -func (u *UserStore) registries() map[string]*prometheus.Registry { - regs := map[string]*prometheus.Registry{} - - u.regsMu.Lock() - defer u.regsMu.Unlock() - for uid, r := range u.regs { - regs[uid] = r - } - - return regs -} - -func (u *UserStore) Describe(out chan<- *prometheus.Desc) { - out <- u.blockLoads - out <- u.blockLoadFailures - out <- u.blockDrops - out <- u.blockDropFailures - out <- u.blocksLoaded - out <- u.seriesDataTouched - out <- u.seriesDataFetched - out <- u.seriesDataSizeTouched - out <- u.seriesDataSizeFetched - out <- u.seriesBlocksQueried - out <- u.seriesGetAllDuration - out <- u.seriesMergeDuration - out <- u.resultSeriesCount - out <- u.chunkSizeBytes -} - -func (u *UserStore) Collect(out chan<- prometheus.Metric) { - regs := u.registries() - data := util.NewMetricFamiliersPerUser() - - for userID, r := range regs { - m, err := r.Gather() - if err == nil { - err = data.AddGatheredDataForUser(userID, m) - } - - if err != nil { - level.Warn(util.Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) - continue - } - } - - data.SendSumOfCounters(out, u.blockLoads, 
"thanos_bucket_store_block_loads_total") - data.SendSumOfCounters(out, u.blockLoadFailures, "thanos_bucket_store_block_load_failures_total") - data.SendSumOfCounters(out, u.blockDrops, "thanos_bucket_store_block_drops_total") - data.SendSumOfCounters(out, u.blockDropFailures, "thanos_bucket_store_block_drop_failures_total") - - data.SendSumOfGauges(out, u.blocksLoaded, "thanos_bucket_store_blocks_loaded") - - data.SendSumOfSummariesWithLabels(out, u.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type") - data.SendSumOfSummariesWithLabels(out, u.seriesDataFetched, "thanos_bucket_store_series_data_fetched", "data_type") - data.SendSumOfSummariesWithLabels(out, u.seriesDataSizeTouched, "thanos_bucket_store_series_data_size_touched_bytes", "data_type") - data.SendSumOfSummariesWithLabels(out, u.seriesDataSizeFetched, "thanos_bucket_store_series_data_size_fetched_bytes", "data_type") - data.SendSumOfSummariesWithLabels(out, u.seriesBlocksQueried, "thanos_bucket_store_series_blocks_queried") - - data.SendSumOfHistograms(out, u.seriesGetAllDuration, "thanos_bucket_store_series_get_all_duration_seconds") - data.SendSumOfHistograms(out, u.seriesMergeDuration, "thanos_bucket_store_series_merge_duration_seconds") - data.SendSumOfSummaries(out, u.resultSeriesCount, "thanos_bucket_store_series_result_series") - data.SendSumOfHistograms(out, u.chunkSizeBytes, "thanos_bucket_store_sent_chunk_size_bytes") -} diff --git a/pkg/querier/block_store_metrics.go b/pkg/querier/block_store_metrics.go new file mode 100644 index 00000000000..f6f5cb1b7c2 --- /dev/null +++ b/pkg/querier/block_store_metrics.go @@ -0,0 +1,167 @@ +package querier + +import ( + "sync" + + "github.com/cortexproject/cortex/pkg/util" + "github.com/go-kit/kit/log/level" + "github.com/prometheus/client_golang/prometheus" +) + +// This struct aggregates metrics exported by Thanos Bucket Store +// and re-exports those aggregates as Cortex metrics. 
+type tsdbBucketStoreMetrics struct { + // Maps userID -> registry + regsMu sync.Mutex + regs map[string]*prometheus.Registry + + // exported metrics + blockLoads *prometheus.Desc + blockLoadFailures *prometheus.Desc + blockDrops *prometheus.Desc + blockDropFailures *prometheus.Desc + blocksLoaded *prometheus.Desc + seriesDataTouched *prometheus.Desc + seriesDataFetched *prometheus.Desc + seriesDataSizeTouched *prometheus.Desc + seriesDataSizeFetched *prometheus.Desc + seriesBlocksQueried *prometheus.Desc + seriesGetAllDuration *prometheus.Desc + seriesMergeDuration *prometheus.Desc + resultSeriesCount *prometheus.Desc + chunkSizeBytes *prometheus.Desc +} + +func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics { + return &tsdbBucketStoreMetrics{ + regs: map[string]*prometheus.Registry{}, + + blockLoads: prometheus.NewDesc( + "cortex_bucket_store_block_loads_total", + "TSDB: Total number of remote block loading attempts.", + nil, nil), + blockLoadFailures: prometheus.NewDesc( + "cortex_bucket_store_block_load_failures_total", + "TSDB: Total number of failed remote block loading attempts.", + nil, nil), + blockDrops: prometheus.NewDesc( + "cortex_bucket_store_block_drops_total", + "TSDB: Total number of local blocks that were dropped.", + nil, nil), + blockDropFailures: prometheus.NewDesc( + "cortex_bucket_store_block_drop_failures_total", + "TSDB: Total number of local blocks that failed to be dropped.", + nil, nil), + blocksLoaded: prometheus.NewDesc( + "cortex_bucket_store_blocks_loaded", + "TSDB: Number of currently loaded blocks.", + nil, nil), + seriesDataTouched: prometheus.NewDesc( + "cortex_bucket_store_series_data_touched", + "TSDB: How many items of a data type in a block were touched for a single series request.", + []string{"data_type"}, nil), + seriesDataFetched: prometheus.NewDesc( + "cortex_bucket_store_series_data_fetched", + "TSDB: How many items of a data type in a block were fetched for a single series request.", + []string{"data_type"}, nil), 
+ seriesDataSizeTouched: prometheus.NewDesc( + "cortex_bucket_store_series_data_size_touched_bytes", + "TSDB: Size of all items of a data type in a block were touched for a single series request.", + []string{"data_type"}, nil), + seriesDataSizeFetched: prometheus.NewDesc( + "cortex_bucket_store_series_data_size_fetched_bytes", + "TSDB: Size of all items of a data type in a block were fetched for a single series request.", + []string{"data_type"}, nil), + seriesBlocksQueried: prometheus.NewDesc( + "cortex_bucket_store_series_blocks_queried", + "TSDB: Number of blocks in a bucket store that were touched to satisfy a query.", + nil, nil), + + seriesGetAllDuration: prometheus.NewDesc( + "cortex_bucket_store_series_get_all_duration_seconds", + "TSDB: Time it takes until all per-block prepares and preloads for a query are finished.", + nil, nil), + seriesMergeDuration: prometheus.NewDesc( + "cortex_bucket_store_series_merge_duration_seconds", + "TSDB: Time it takes to merge sub-results from all queried blocks into a single result.", + nil, nil), + resultSeriesCount: prometheus.NewDesc( + "cortex_bucket_store_series_result_series", + "Number of series observed in the final result of a query.", + nil, nil), + chunkSizeBytes: prometheus.NewDesc( + "cortex_bucket_store_sent_chunk_size_bytes", + "TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", + nil, nil), + } +} + +func (m *tsdbBucketStoreMetrics) addUserRegistry(user string, reg *prometheus.Registry) { + m.regsMu.Lock() + m.regs[user] = reg + m.regsMu.Unlock() +} + +func (m *tsdbBucketStoreMetrics) registries() map[string]*prometheus.Registry { + regs := map[string]*prometheus.Registry{} + + m.regsMu.Lock() + defer m.regsMu.Unlock() + for uid, r := range m.regs { + regs[uid] = r + } + + return regs +} + +func (m *tsdbBucketStoreMetrics) Describe(out chan<- *prometheus.Desc) { + out <- m.blockLoads + out <- m.blockLoadFailures + out <- m.blockDrops + out 
<- m.blockDropFailures + out <- m.blocksLoaded + out <- m.seriesDataTouched + out <- m.seriesDataFetched + out <- m.seriesDataSizeTouched + out <- m.seriesDataSizeFetched + out <- m.seriesBlocksQueried + out <- m.seriesGetAllDuration + out <- m.seriesMergeDuration + out <- m.resultSeriesCount + out <- m.chunkSizeBytes +} + +func (m *tsdbBucketStoreMetrics) Collect(out chan<- prometheus.Metric) { + regs := m.registries() + data := util.NewMetricFamiliersPerUser() + + for userID, r := range regs { + m, err := r.Gather() + if err == nil { + err = data.AddGatheredDataForUser(userID, m) + } + + if err != nil { + level.Warn(util.Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) + continue + } + } + + data.SendSumOfCounters(out, m.blockLoads, "thanos_bucket_store_block_loads_total") + data.SendSumOfCounters(out, m.blockLoadFailures, "thanos_bucket_store_block_load_failures_total") + data.SendSumOfCounters(out, m.blockDrops, "thanos_bucket_store_block_drops_total") + data.SendSumOfCounters(out, m.blockDropFailures, "thanos_bucket_store_block_drop_failures_total") + + data.SendSumOfGauges(out, m.blocksLoaded, "thanos_bucket_store_blocks_loaded") + + data.SendSumOfSummariesWithLabels(out, m.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type") + data.SendSumOfSummariesWithLabels(out, m.seriesDataFetched, "thanos_bucket_store_series_data_fetched", "data_type") + data.SendSumOfSummariesWithLabels(out, m.seriesDataSizeTouched, "thanos_bucket_store_series_data_size_touched_bytes", "data_type") + data.SendSumOfSummariesWithLabels(out, m.seriesDataSizeFetched, "thanos_bucket_store_series_data_size_fetched_bytes", "data_type") + data.SendSumOfSummariesWithLabels(out, m.seriesBlocksQueried, "thanos_bucket_store_series_blocks_queried") + + data.SendSumOfHistograms(out, m.seriesGetAllDuration, "thanos_bucket_store_series_get_all_duration_seconds") + data.SendSumOfHistograms(out, m.seriesMergeDuration, 
"thanos_bucket_store_series_merge_duration_seconds") + data.SendSumOfSummaries(out, m.resultSeriesCount, "thanos_bucket_store_series_result_series") + data.SendSumOfHistograms(out, m.chunkSizeBytes, "thanos_bucket_store_sent_chunk_size_bytes") +} From 88a118ddf85c19f06cc8c852c793d1e916134929 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 14:58:41 +0100 Subject: [PATCH 09/27] Added test for bucket_store_metrics_test. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/ingester/metrics_test.go | 13 +- pkg/querier/block.go | 2 +- pkg/querier/bucket_store_metrics_test.go | 317 +++++++++++++++++++++++ 3 files changed, 320 insertions(+), 12 deletions(-) create mode 100644 pkg/querier/bucket_store_metrics_test.go diff --git a/pkg/ingester/metrics_test.go b/pkg/ingester/metrics_test.go index 1cb450dd21b..749cbe4ca48 100644 --- a/pkg/ingester/metrics_test.go +++ b/pkg/ingester/metrics_test.go @@ -10,7 +10,7 @@ import ( ) func TestTSDBMetrics(t *testing.T) { - mainReg := prometheus.NewRegistry() + mainReg := prometheus.NewPedanticRegistry() tsdbMetrics := newTSDBMetrics(mainReg) @@ -18,15 +18,6 @@ func TestTSDBMetrics(t *testing.T) { tsdbMetrics.setRegistryForUser("user2", populateTSDBMetrics(85787)) tsdbMetrics.setRegistryForUser("user3", populateTSDBMetrics(999)) - metricNames := []string{ - "cortex_ingester_shipper_dir_syncs_total", - "cortex_ingester_shipper_dir_sync_failures_total", - "cortex_ingester_shipper_uploads_total", - "cortex_ingester_shipper_upload_failures_total", - "cortex_ingester_memory_series_created_total", - "cortex_ingester_memory_series_removed_total", - } - err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` # HELP cortex_ingester_shipper_dir_syncs_total TSDB: Total dir sync attempts # TYPE cortex_ingester_shipper_dir_syncs_total counter @@ -61,7 +52,7 @@ func TestTSDBMetrics(t *testing.T) { 
cortex_ingester_memory_series_removed_total{user="user1"} 74070 cortex_ingester_memory_series_removed_total{user="user2"} 514722 cortex_ingester_memory_series_removed_total{user="user3"} 5994 - `), metricNames...) + `)) require.NoError(t, err) } diff --git a/pkg/querier/block.go b/pkg/querier/block.go index f6c7641bd73..d9843d95e3d 100644 --- a/pkg/querier/block.go +++ b/pkg/querier/block.go @@ -46,7 +46,7 @@ func NewBlockQuerier(cfg tsdb.Config, logLevel logging.Level, r prometheus.Regis return nil, err } b.us = us - r.MustRegister(us) + r.MustRegister(us.tsdbMetrics) level.Info(util.Logger).Log("msg", "synchronizing TSDB blocks for all users") if err := us.InitialSync(context.Background()); err != nil { diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go new file mode 100644 index 00000000000..02cd5cf6db7 --- /dev/null +++ b/pkg/querier/bucket_store_metrics_test.go @@ -0,0 +1,317 @@ +package querier + +import ( + "bytes" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" +) + +func TestTsdbBucketStoreMetrics(t *testing.T) { + mainReg := prometheus.NewPedanticRegistry() + + tsdbMetrics := newTSDBBucketStoreMetrics() + mainReg.MustRegister(tsdbMetrics) + + tsdbMetrics.addUserRegistry("user1", populateTSDBBucketStore(5328)) + tsdbMetrics.addUserRegistry("user2", populateTSDBBucketStore(6908)) + tsdbMetrics.addUserRegistry("user3", populateTSDBBucketStore(10283)) + + //noinspection ALL + err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` + # HELP cortex_bucket_store_blocks_loaded TSDB: Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 22519 + + # HELP cortex_bucket_store_block_loads_total TSDB: Total number of remote block loading attempts. 
+ # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 45038 + + # HELP cortex_bucket_store_block_load_failures_total TSDB: Total number of failed remote block loading attempts. + # TYPE cortex_bucket_store_block_load_failures_total counter + cortex_bucket_store_block_load_failures_total 67557 + + # HELP cortex_bucket_store_block_drops_total TSDB: Total number of local blocks that were dropped. + # TYPE cortex_bucket_store_block_drops_total counter + cortex_bucket_store_block_drops_total 90076 + + # HELP cortex_bucket_store_block_drop_failures_total TSDB: Total number of local blocks that failed to be dropped. + # TYPE cortex_bucket_store_block_drop_failures_total counter + cortex_bucket_store_block_drop_failures_total 112595 + + # HELP cortex_bucket_store_sent_chunk_size_bytes TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier. + # TYPE cortex_bucket_store_sent_chunk_size_bytes histogram + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="32"} 0 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="256"} 0 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="512"} 0 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="1024"} 0 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="32768"} 0 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="262144"} 4 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="524288"} 6 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="1.048576e+06"} 6 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="3.3554432e+07"} 6 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="2.68435456e+08"} 6 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="5.36870912e+08"} 6 + cortex_bucket_store_sent_chunk_size_bytes_bucket{le="+Inf"} 6 + cortex_bucket_store_sent_chunk_size_bytes_sum 1.328621e+06 + cortex_bucket_store_sent_chunk_size_bytes_count 6 + + # HELP cortex_bucket_store_series_blocks_queried TSDB: Number of 
blocks in a bucket store that were touched to satisfy a query. + # TYPE cortex_bucket_store_series_blocks_queried summary + cortex_bucket_store_series_blocks_queried_sum 1.283583e+06 + cortex_bucket_store_series_blocks_queried_count 9 + + # HELP cortex_bucket_store_series_data_fetched TSDB: How many items of a data type in a block were fetched for a single series request. + # TYPE cortex_bucket_store_series_data_fetched summary + cortex_bucket_store_series_data_fetched_sum{data_type="fetched-a"} 202671 + cortex_bucket_store_series_data_fetched_count{data_type="fetched-a"} 3 + cortex_bucket_store_series_data_fetched_sum{data_type="fetched-b"} 225190 + cortex_bucket_store_series_data_fetched_count{data_type="fetched-b"} 3 + cortex_bucket_store_series_data_fetched_sum{data_type="fetched-c"} 247709 + cortex_bucket_store_series_data_fetched_count{data_type="fetched-c"} 3 + + # HELP cortex_bucket_store_series_data_size_fetched_bytes TSDB: Size of all items of a data type in a block were fetched for a single series request. + # TYPE cortex_bucket_store_series_data_size_fetched_bytes summary + cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-a"} 337785 + cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-a"} 3 + cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-b"} 360304 + cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-b"} 3 + cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-c"} 382823 + cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-c"} 3 + + # HELP cortex_bucket_store_series_data_size_touched_bytes TSDB: Size of all items of a data type in a block were touched for a single series request. 
+ # TYPE cortex_bucket_store_series_data_size_touched_bytes summary + cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-a"} 270228 + cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-a"} 3 + cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-b"} 292747 + cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-b"} 3 + cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-c"} 315266 + cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-c"} 3 + + # HELP cortex_bucket_store_series_data_touched TSDB: How many items of a data type in a block were touched for a single series request. + # TYPE cortex_bucket_store_series_data_touched summary + cortex_bucket_store_series_data_touched_sum{data_type="touched-a"} 135114 + cortex_bucket_store_series_data_touched_count{data_type="touched-a"} 3 + cortex_bucket_store_series_data_touched_sum{data_type="touched-b"} 157633 + cortex_bucket_store_series_data_touched_count{data_type="touched-b"} 3 + cortex_bucket_store_series_data_touched_sum{data_type="touched-c"} 180152 + cortex_bucket_store_series_data_touched_count{data_type="touched-c"} 3 + + # HELP cortex_bucket_store_series_get_all_duration_seconds TSDB: Time it takes until all per-block prepares and preloads for a query are finished. 
+ # TYPE cortex_bucket_store_series_get_all_duration_seconds histogram + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.001"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.01"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.1"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.3"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.6"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="1"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="3"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="6"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="9"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="20"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="30"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="60"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="90"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="120"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="+Inf"} 9 + cortex_bucket_store_series_get_all_duration_seconds_sum 1.486254e+06 + cortex_bucket_store_series_get_all_duration_seconds_count 9 + + # HELP cortex_bucket_store_series_merge_duration_seconds TSDB: Time it takes to merge sub-results from all queried blocks into a single result. 
+ # TYPE cortex_bucket_store_series_merge_duration_seconds histogram + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.001"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.01"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.1"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.3"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.6"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="1"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="3"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="6"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="9"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="20"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="30"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="60"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="90"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="120"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="+Inf"} 9 + cortex_bucket_store_series_merge_duration_seconds_sum 1.688925e+06 + cortex_bucket_store_series_merge_duration_seconds_count 9 + + # HELP cortex_bucket_store_series_result_series Number of series observed in the final result of a query. 
+ # TYPE cortex_bucket_store_series_result_series summary + cortex_bucket_store_series_result_series_sum 1.238545e+06 + cortex_bucket_store_series_result_series_count 6 +`)) + require.NoError(t, err) +} + +func populateTSDBBucketStore(base float64) *prometheus.Registry { + reg := prometheus.NewRegistry() + m := newBucketStoreMetrics(reg) + + m.blocksLoaded.Add(1 * base) + m.blockLoads.Add(2 * base) + m.blockLoadFailures.Add(3 * base) + m.blockDrops.Add(4 * base) + m.blockDropFailures.Add(5 * base) + m.seriesDataTouched.WithLabelValues("touched-a").Observe(6 * base) + m.seriesDataTouched.WithLabelValues("touched-b").Observe(7 * base) + m.seriesDataTouched.WithLabelValues("touched-c").Observe(8 * base) + + m.seriesDataFetched.WithLabelValues("fetched-a").Observe(9 * base) + m.seriesDataFetched.WithLabelValues("fetched-b").Observe(10 * base) + m.seriesDataFetched.WithLabelValues("fetched-c").Observe(11 * base) + + m.seriesDataSizeTouched.WithLabelValues("size-touched-a").Observe(12 * base) + m.seriesDataSizeTouched.WithLabelValues("size-touched-b").Observe(13 * base) + m.seriesDataSizeTouched.WithLabelValues("size-touched-c").Observe(14 * base) + + m.seriesDataSizeFetched.WithLabelValues("size-fetched-a").Observe(15 * base) + m.seriesDataSizeFetched.WithLabelValues("size-fetched-b").Observe(16 * base) + m.seriesDataSizeFetched.WithLabelValues("size-fetched-c").Observe(17 * base) + + m.seriesBlocksQueried.Observe(18 * base) + m.seriesBlocksQueried.Observe(19 * base) + m.seriesBlocksQueried.Observe(20 * base) + + m.seriesGetAllDuration.Observe(21 * base) + m.seriesGetAllDuration.Observe(22 * base) + m.seriesGetAllDuration.Observe(23 * base) + + m.seriesMergeDuration.Observe(24 * base) + m.seriesMergeDuration.Observe(25 * base) + m.seriesMergeDuration.Observe(26 * base) + + m.resultSeriesCount.Observe(27 * base) + m.resultSeriesCount.Observe(28 * base) + + m.chunkSizeBytes.Observe(29 * base) + m.chunkSizeBytes.Observe(30 * base) + + m.queriesDropped.Add(31 * base) + 
m.queriesLimit.Add(32 * base) + return reg +} + +// copied from Thanos, pkg/store/bucket.go +type bucketStoreMetrics struct { + blocksLoaded prometheus.Gauge + blockLoads prometheus.Counter + blockLoadFailures prometheus.Counter + blockDrops prometheus.Counter + blockDropFailures prometheus.Counter + seriesDataTouched *prometheus.SummaryVec + seriesDataFetched *prometheus.SummaryVec + seriesDataSizeTouched *prometheus.SummaryVec + seriesDataSizeFetched *prometheus.SummaryVec + seriesBlocksQueried prometheus.Summary + seriesGetAllDuration prometheus.Histogram + seriesMergeDuration prometheus.Histogram + resultSeriesCount prometheus.Summary + chunkSizeBytes prometheus.Histogram + queriesDropped prometheus.Counter + queriesLimit prometheus.Gauge +} + +func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { + var m bucketStoreMetrics + + m.blockLoads = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_block_loads_total", + Help: "Total number of remote block loading attempts.", + }) + m.blockLoadFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_block_load_failures_total", + Help: "Total number of failed remote block loading attempts.", + }) + m.blockDrops = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_block_drops_total", + Help: "Total number of local blocks that were dropped.", + }) + m.blockDropFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_block_drop_failures_total", + Help: "Total number of local blocks that failed to be dropped.", + }) + m.blocksLoaded = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_bucket_store_blocks_loaded", + Help: "Number of currently loaded blocks.", + }) + + m.seriesDataTouched = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_touched", + Help: "How many items of a data type in a block were touched for a single series request.", + }, 
[]string{"data_type"}) + m.seriesDataFetched = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_fetched", + Help: "How many items of a data type in a block were fetched for a single series request.", + }, []string{"data_type"}) + + m.seriesDataSizeTouched = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_size_touched_bytes", + Help: "Size of all items of a data type in a block were touched for a single series request.", + }, []string{"data_type"}) + m.seriesDataSizeFetched = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_size_fetched_bytes", + Help: "Size of all items of a data type in a block were fetched for a single series request.", + }, []string{"data_type"}) + + m.seriesBlocksQueried = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_blocks_queried", + Help: "Number of blocks in a bucket store that were touched to satisfy a query.", + }) + m.seriesGetAllDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_series_get_all_duration_seconds", + Help: "Time it takes until all per-block prepares and preloads for a query are finished.", + Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, + }) + m.seriesMergeDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_series_merge_duration_seconds", + Help: "Time it takes to merge sub-results from all queried blocks into a single result.", + Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, + }) + m.resultSeriesCount = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_result_series", + Help: "Number of series observed in the final result of a query.", + }) + + m.chunkSizeBytes = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_sent_chunk_size_bytes", + Help: "Size in bytes 
of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", + Buckets: []float64{ + 32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024, + }, + }) + + m.queriesDropped = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_queries_dropped_total", + Help: "Number of queries that were dropped due to the sample limit.", + }) + m.queriesLimit = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_bucket_store_queries_concurrent_max", + Help: "Number of maximum concurrent queries.", + }) + + if reg != nil { + reg.MustRegister( + m.blockLoads, + m.blockLoadFailures, + m.blockDrops, + m.blockDropFailures, + m.blocksLoaded, + m.seriesDataTouched, + m.seriesDataFetched, + m.seriesDataSizeTouched, + m.seriesDataSizeFetched, + m.seriesBlocksQueried, + m.seriesGetAllDuration, + m.seriesMergeDuration, + m.resultSeriesCount, + m.chunkSizeBytes, + m.queriesDropped, + m.queriesLimit, + ) + } + return &m +} From 23efddb046c8bab1c5b96725f9d4f8ce71b47efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 17 Jan 2020 15:08:24 +0100 Subject: [PATCH 10/27] Added test for bucket_store_metrics_test. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3359b590514..e3183509ae9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ instructions below to upgrade your Postgres. * [ENHANCEMENT] Experimental TSDB: Open existing TSDB on startup to prevent ingester from becoming ready before it can accept writes. 
#1917 * `--experimental.tsdb.max-tsdb-opening-concurrency-on-startup` * [ENHANCEMENT] Experimental TSDB: Added `cortex_ingester_shipper_dir_syncs_total`, `cortex_ingester_shipper_dir_sync_failures_total`, `cortex_ingester_shipper_uploads_total` and `cortex_ingester_shipper_upload_failures_total` metrics from TSDB shipper component. #1983 +* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store (too many to list). #1996 * [BUGFIX] Fixed unnecessary CAS operations done by the HA tracker when the jitter is enabled. #1861 * [BUGFIX] Fixed #1904 ingesters getting stuck in a LEAVING state after coming up from an ungraceful exit. #1921 * [BUGFIX] Reduce memory usage when ingester Push() errors. #1922 From 821b95bed608a358c638f1d2cca261282211c097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 20 Jan 2020 10:35:37 +0100 Subject: [PATCH 11/27] Gather and report metrics from Thanos' storecache.InMemoryIndexCache. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block_store.go | 5 +- pkg/querier/block_store_metrics.go | 69 ++++++++++- pkg/querier/bucket_store_metrics_test.go | 142 +++++++++++++++++++++++ pkg/util/metrics_helper.go | 58 +++++++++ 4 files changed, 271 insertions(+), 3 deletions(-) diff --git a/pkg/querier/block_store.go b/pkg/querier/block_store.go index dac9d425f55..2e5942703a5 100644 --- a/pkg/querier/block_store.go +++ b/pkg/querier/block_store.go @@ -118,16 +118,17 @@ func (u *UserStore) syncUserStores(ctx context.Context, f func(context.Context, Bucket: bkt, } + reg := prometheus.NewRegistry() + indexCacheSizeBytes := u.cfg.BucketStore.IndexCacheSizeBytes maxItemSizeBytes := indexCacheSizeBytes / 2 - indexCache, err := storecache.NewInMemoryIndexCache(u.logger, nil, storecache.Opts{ + indexCache, err := storecache.NewInMemoryIndexCache(u.logger, reg, storecache.Opts{ MaxSizeBytes: 
indexCacheSizeBytes, MaxItemSizeBytes: maxItemSizeBytes, }) if err != nil { return err } - reg := prometheus.NewRegistry() bs, err = store.NewBucketStore( u.logger, reg, diff --git a/pkg/querier/block_store_metrics.go b/pkg/querier/block_store_metrics.go index f6f5cb1b7c2..18fa657a216 100644 --- a/pkg/querier/block_store_metrics.go +++ b/pkg/querier/block_store_metrics.go @@ -15,7 +15,7 @@ type tsdbBucketStoreMetrics struct { regsMu sync.Mutex regs map[string]*prometheus.Registry - // exported metrics + // exported metrics, gathered from Thanos BucketStore blockLoads *prometheus.Desc blockLoadFailures *prometheus.Desc blockDrops *prometheus.Desc @@ -30,6 +30,20 @@ type tsdbBucketStoreMetrics struct { seriesMergeDuration *prometheus.Desc resultSeriesCount *prometheus.Desc chunkSizeBytes *prometheus.Desc + + // Metrics gathered from Thanos storecache.InMemoryIndexCache + cacheItemsEvicted *prometheus.Desc + cacheItemsAdded *prometheus.Desc + cacheRequests *prometheus.Desc + cacheItemsOverflow *prometheus.Desc + cacheHits *prometheus.Desc + cacheItemsCurrentCount *prometheus.Desc + cacheItemsCurrentSize *prometheus.Desc + cacheItemsTotalCurrentSize *prometheus.Desc + + // Ignored: + // thanos_store_index_cache_max_size_bytes + // thanos_store_index_cache_max_item_size_bytes } func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics { @@ -93,6 +107,40 @@ func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics { "cortex_bucket_store_sent_chunk_size_bytes", "TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", nil, nil), + + // Cache + cacheItemsEvicted: prometheus.NewDesc( + "cortex_store_index_cache_items_evicted_total", + "TSDB: Total number of items that were evicted from the index cache.", + []string{"item_type"}, nil), + cacheItemsAdded: prometheus.NewDesc( + "cortex_store_index_cache_items_added_total", + "TSDB: Total number of items that were added to the index cache.", + []string{"item_type"}, 
nil), + cacheRequests: prometheus.NewDesc( + "cortex_store_index_cache_requests_total", + "TSDB: Total number of requests to the cache.", + []string{"item_type"}, nil), + cacheItemsOverflow: prometheus.NewDesc( + "cortex_store_index_cache_items_overflowed_total", + "TSDB: Total number of items that could not be added to the cache due to being too big.", + []string{"item_type"}, nil), + cacheHits: prometheus.NewDesc( + "cortex_store_index_cache_hits_total", + "TSDB: Total number of requests to the cache that were a hit.", + []string{"item_type"}, nil), + cacheItemsCurrentCount: prometheus.NewDesc( + "cortex_store_index_cache_items", + "TSDB: Current number of items in the index cache.", + []string{"item_type"}, nil), + cacheItemsCurrentSize: prometheus.NewDesc( + "cortex_store_index_cache_items_size_bytes", + "TSDB: Current byte size of items in the index cache.", + []string{"item_type"}, nil), + cacheItemsTotalCurrentSize: prometheus.NewDesc( + "cortex_store_index_cache_total_size_bytes", + "TSDB: Current byte size of items (both value and key) in the index cache.", + []string{"item_type"}, nil), } } @@ -129,6 +177,15 @@ func (m *tsdbBucketStoreMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.seriesMergeDuration out <- m.resultSeriesCount out <- m.chunkSizeBytes + + out <- m.cacheItemsEvicted + out <- m.cacheItemsAdded + out <- m.cacheRequests + out <- m.cacheItemsOverflow + out <- m.cacheHits + out <- m.cacheItemsCurrentCount + out <- m.cacheItemsCurrentSize + out <- m.cacheItemsTotalCurrentSize } func (m *tsdbBucketStoreMetrics) Collect(out chan<- prometheus.Metric) { @@ -164,4 +221,14 @@ func (m *tsdbBucketStoreMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfHistograms(out, m.seriesMergeDuration, "thanos_bucket_store_series_merge_duration_seconds") data.SendSumOfSummaries(out, m.resultSeriesCount, "thanos_bucket_store_series_result_series") data.SendSumOfHistograms(out, m.chunkSizeBytes, "thanos_bucket_store_sent_chunk_size_bytes") + + 
data.SendSumOfCountersWithLabels(out, m.cacheItemsEvicted, "thanos_store_index_cache_items_evicted_total", "item_type") + data.SendSumOfCountersWithLabels(out, m.cacheItemsAdded, "thanos_store_index_cache_items_added_total", "item_type") + data.SendSumOfCountersWithLabels(out, m.cacheRequests, "thanos_store_index_cache_requests_total", "item_type") + data.SendSumOfCountersWithLabels(out, m.cacheItemsOverflow, "thanos_store_index_cache_items_overflowed_total", "item_type") + data.SendSumOfCountersWithLabels(out, m.cacheHits, "thanos_store_index_cache_hits_total", "item_type") + + data.SendSumOfGaugesWithLabels(out, m.cacheItemsCurrentCount, "thanos_store_index_cache_items", "item_type") + data.SendSumOfGaugesWithLabels(out, m.cacheItemsCurrentSize, "thanos_store_index_cache_items_size_bytes", "item_type") + data.SendSumOfGaugesWithLabels(out, m.cacheItemsTotalCurrentSize, "thanos_store_index_cache_total_size_bytes", "item_type") } diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go index 02cd5cf6db7..b79607ad081 100644 --- a/pkg/querier/bucket_store_metrics_test.go +++ b/pkg/querier/bucket_store_metrics_test.go @@ -143,6 +143,47 @@ func TestTsdbBucketStoreMetrics(t *testing.T) { # TYPE cortex_bucket_store_series_result_series summary cortex_bucket_store_series_result_series_sum 1.238545e+06 cortex_bucket_store_series_result_series_count 6 + + # HELP cortex_store_index_cache_items_evicted_total TSDB: Total number of items that were evicted from the index cache. + # TYPE cortex_store_index_cache_items_evicted_total counter + cortex_store_index_cache_items_evicted_total{item_type="Postings"} 1125950 + cortex_store_index_cache_items_evicted_total{item_type="Series"} 1148469 + + # HELP cortex_store_index_cache_requests_total TSDB: Total number of requests to the cache. 
+ # TYPE cortex_store_index_cache_requests_total counter + cortex_store_index_cache_requests_total{item_type="Postings"} 1170988 + cortex_store_index_cache_requests_total{item_type="Series"} 1193507 + + # HELP cortex_store_index_cache_hits_total TSDB: Total number of requests to the cache that were a hit. + # TYPE cortex_store_index_cache_hits_total counter + cortex_store_index_cache_hits_total{item_type="Postings"} 1216026 + cortex_store_index_cache_hits_total{item_type="Series"} 1238545 + + # HELP cortex_store_index_cache_items_added_total TSDB: Total number of items that were added to the index cache. + # TYPE cortex_store_index_cache_items_added_total counter + cortex_store_index_cache_items_added_total{item_type="Postings"} 1261064 + cortex_store_index_cache_items_added_total{item_type="Series"} 1283583 + + # HELP cortex_store_index_cache_items TSDB: Current number of items in the index cache. + # TYPE cortex_store_index_cache_items gauge + cortex_store_index_cache_items{item_type="Postings"} 1306102 + cortex_store_index_cache_items{item_type="Series"} 1328621 + + # HELP cortex_store_index_cache_items_size_bytes TSDB: Current byte size of items in the index cache. + # TYPE cortex_store_index_cache_items_size_bytes gauge + cortex_store_index_cache_items_size_bytes{item_type="Postings"} 1351140 + cortex_store_index_cache_items_size_bytes{item_type="Series"} 1373659 + + # HELP cortex_store_index_cache_total_size_bytes TSDB: Current byte size of items (both value and key) in the index cache. + # TYPE cortex_store_index_cache_total_size_bytes gauge + cortex_store_index_cache_total_size_bytes{item_type="Postings"} 1396178 + cortex_store_index_cache_total_size_bytes{item_type="Series"} 1418697 + + # HELP cortex_store_index_cache_items_overflowed_total TSDB: Total number of items that could not be added to the cache due to being too big. 
+ # TYPE cortex_store_index_cache_items_overflowed_total counter + cortex_store_index_cache_items_overflowed_total{item_type="Postings"} 1441216 + cortex_store_index_cache_items_overflowed_total{item_type="Series"} 1463735 + `)) require.NoError(t, err) } @@ -192,6 +233,25 @@ func populateTSDBBucketStore(base float64) *prometheus.Registry { m.queriesDropped.Add(31 * base) m.queriesLimit.Add(32 * base) + + c := newIndexStoreCacheMetrics(reg) + + c.evicted.WithLabelValues(cacheTypePostings).Add(base * 50) + c.evicted.WithLabelValues(cacheTypeSeries).Add(base * 51) + c.requests.WithLabelValues(cacheTypePostings).Add(base * 52) + c.requests.WithLabelValues(cacheTypeSeries).Add(base * 53) + c.hits.WithLabelValues(cacheTypePostings).Add(base * 54) + c.hits.WithLabelValues(cacheTypeSeries).Add(base * 55) + c.added.WithLabelValues(cacheTypePostings).Add(base * 56) + c.added.WithLabelValues(cacheTypeSeries).Add(base * 57) + c.current.WithLabelValues(cacheTypePostings).Set(base * 58) + c.current.WithLabelValues(cacheTypeSeries).Set(base * 59) + c.currentSize.WithLabelValues(cacheTypePostings).Set(base * 60) + c.currentSize.WithLabelValues(cacheTypeSeries).Set(base * 61) + c.totalCurrentSize.WithLabelValues(cacheTypePostings).Set(base * 62) + c.totalCurrentSize.WithLabelValues(cacheTypeSeries).Set(base * 63) + c.overflow.WithLabelValues(cacheTypePostings).Add(base * 64) + c.overflow.WithLabelValues(cacheTypeSeries).Add(base * 65) return reg } @@ -215,6 +275,23 @@ type bucketStoreMetrics struct { queriesLimit prometheus.Gauge } +// Copied from Thanos, pkg/store/cache/inmemory.go, InMemoryIndexCache struct +type indexStoreCacheMetrics struct { + evicted *prometheus.CounterVec + requests *prometheus.CounterVec + hits *prometheus.CounterVec + added *prometheus.CounterVec + current *prometheus.GaugeVec + currentSize *prometheus.GaugeVec + totalCurrentSize *prometheus.GaugeVec + overflow *prometheus.CounterVec +} + +const ( + cacheTypePostings string = "Postings" + cacheTypeSeries 
string = "Series" +) + func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { var m bucketStoreMetrics @@ -315,3 +392,68 @@ func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { } return &m } + +func newIndexStoreCacheMetrics(reg prometheus.Registerer) *indexStoreCacheMetrics { + c := indexStoreCacheMetrics{} + c.evicted = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_store_index_cache_items_evicted_total", + Help: "Total number of items that were evicted from the index cache.", + }, []string{"item_type"}) + c.evicted.WithLabelValues(cacheTypePostings) + c.evicted.WithLabelValues(cacheTypeSeries) + + c.added = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_store_index_cache_items_added_total", + Help: "Total number of items that were added to the index cache.", + }, []string{"item_type"}) + c.added.WithLabelValues(cacheTypePostings) + c.added.WithLabelValues(cacheTypeSeries) + + c.requests = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_store_index_cache_requests_total", + Help: "Total number of requests to the cache.", + }, []string{"item_type"}) + c.requests.WithLabelValues(cacheTypePostings) + c.requests.WithLabelValues(cacheTypeSeries) + + c.overflow = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_store_index_cache_items_overflowed_total", + Help: "Total number of items that could not be added to the cache due to being too big.", + }, []string{"item_type"}) + c.overflow.WithLabelValues(cacheTypePostings) + c.overflow.WithLabelValues(cacheTypeSeries) + + c.hits = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_store_index_cache_hits_total", + Help: "Total number of requests to the cache that were a hit.", + }, []string{"item_type"}) + c.hits.WithLabelValues(cacheTypePostings) + c.hits.WithLabelValues(cacheTypeSeries) + + c.current = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "thanos_store_index_cache_items", + Help: 
"Current number of items in the index cache.", + }, []string{"item_type"}) + c.current.WithLabelValues(cacheTypePostings) + c.current.WithLabelValues(cacheTypeSeries) + + c.currentSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "thanos_store_index_cache_items_size_bytes", + Help: "Current byte size of items in the index cache.", + }, []string{"item_type"}) + c.currentSize.WithLabelValues(cacheTypePostings) + c.currentSize.WithLabelValues(cacheTypeSeries) + + c.totalCurrentSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "thanos_store_index_cache_total_size_bytes", + Help: "Current byte size of items (both value and key) in the index cache.", + }, []string{"item_type"}) + c.totalCurrentSize.WithLabelValues(cacheTypePostings) + c.totalCurrentSize.WithLabelValues(cacheTypeSeries) + + if reg != nil { + reg.MustRegister(c.requests, c.hits, c.added, c.evicted, c.current, c.currentSize, c.totalCurrentSize, c.overflow) + } + + return &c +} diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index fe64eeb485d..64abbcec231 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -52,6 +52,35 @@ func (d MetricFamiliesPerUser) SendSumOfCounters(out chan<- prometheus.Metric, d out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, result) } +func (d MetricFamiliesPerUser) SendSumOfCountersWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string, labelNames ...string) { + type counterResult struct { + value float64 + labelValues []string + } + + result := map[string]counterResult{} + + for _, userMetrics := range d { // for each user + metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[counter], labelNames) + + for key, mlv := range metricsPerLabelValue { + for _, m := range mlv.metrics { + r := result[key] + if r.labelValues == nil { + r.labelValues = mlv.labelValues + } + + r.value += m.GetCounter().GetValue() + result[key] = r + } + } + } + + for _, cr := range result { + 
out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, cr.value, cr.labelValues...) + } +} + func (d MetricFamiliesPerUser) SendSumOfCountersPerUser(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string) { for user, perMetric := range d { v := sum(perMetric[counter], counterValue) @@ -68,6 +97,35 @@ func (d MetricFamiliesPerUser) SendSumOfGauges(out chan<- prometheus.Metric, des out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, result) } +func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string, labelNames ...string) { + type gaugeResult struct { + value float64 + labelValues []string + } + + result := map[string]gaugeResult{} + + for _, userMetrics := range d { // for each user + metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[counter], labelNames) + + for key, mlv := range metricsPerLabelValue { + for _, m := range mlv.metrics { + r := result[key] + if r.labelValues == nil { + r.labelValues = mlv.labelValues + } + + r.value += m.GetGauge().GetValue() + result[key] = r + } + } + } + + for _, cr := range result { + out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, cr.value, cr.labelValues...) 
+ } +} + func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, desc *prometheus.Desc, summaryName string) { var ( sampleCount uint64 From d146805d9ea2ae11f571f6b12afd887dcb4133a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 20 Jan 2020 10:52:16 +0100 Subject: [PATCH 12/27] Extracted common code that builds MetricFamiliesPerUser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/ingester/metrics.go | 15 +-------------- pkg/querier/block_store_metrics.go | 16 +--------------- pkg/util/metrics_helper.go | 17 +++++++++++++++-- 3 files changed, 17 insertions(+), 31 deletions(-) diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 7d6d71c8a21..b57633c727a 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -4,7 +4,6 @@ import ( "sync" "github.com/cortexproject/cortex/pkg/util" - "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" ) @@ -166,19 +165,7 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) { } func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { - regs := sm.registries() - data := util.NewMetricFamiliersPerUser() - - for userID, r := range regs { - m, err := r.Gather() - if err == nil { - err = data.AddGatheredDataForUser(userID, m) - } - if err != nil { - level.Warn(util.Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) - continue - } - } + data := util.BuildMetricFamiliesPerUserFromUserRegistries(sm.registries()) // OK, we have it all. Let's build results. 
data.SendSumOfCounters(out, sm.dirSyncs, "thanos_shipper_dir_syncs_total") diff --git a/pkg/querier/block_store_metrics.go b/pkg/querier/block_store_metrics.go index 18fa657a216..27a9c8f6ed7 100644 --- a/pkg/querier/block_store_metrics.go +++ b/pkg/querier/block_store_metrics.go @@ -4,7 +4,6 @@ import ( "sync" "github.com/cortexproject/cortex/pkg/util" - "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" ) @@ -189,20 +188,7 @@ func (m *tsdbBucketStoreMetrics) Describe(out chan<- *prometheus.Desc) { } func (m *tsdbBucketStoreMetrics) Collect(out chan<- prometheus.Metric) { - regs := m.registries() - data := util.NewMetricFamiliersPerUser() - - for userID, r := range regs { - m, err := r.Gather() - if err == nil { - err = data.AddGatheredDataForUser(userID, m) - } - - if err != nil { - level.Warn(util.Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) - continue - } - } + data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries()) data.SendSumOfCounters(out, m.blockLoads, "thanos_bucket_store_block_loads_total") data.SendSumOfCounters(out, m.blockLoadFailures, "thanos_bucket_store_block_load_failures_total") diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index 64abbcec231..aca8eda98d2 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" + "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" ) @@ -15,8 +16,20 @@ import ( // Value = slice of gathered values with the same metric name. 
type MetricFamiliesPerUser map[string]map[string]*dto.MetricFamily -func NewMetricFamiliersPerUser() MetricFamiliesPerUser { - return MetricFamiliesPerUser{} +func BuildMetricFamiliesPerUserFromUserRegistries(regs map[string]*prometheus.Registry) MetricFamiliesPerUser { + data := MetricFamiliesPerUser{} + for userID, r := range regs { + m, err := r.Gather() + if err == nil { + err = data.AddGatheredDataForUser(userID, m) + } + + if err != nil { + level.Warn(Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) + continue + } + } + return data } // AddGatheredDataForUser adds user-specific output of Gatherer.Gather method. From a9274b276be3a6bc131742c4a2d0662cebd81cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 20 Jan 2020 10:52:54 +0100 Subject: [PATCH 13/27] Added benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We may want to optimize this in the future. Signed-off-by: Peter Štibraný --- pkg/querier/bucket_store_metrics_test.go | 34 ++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go index b79607ad081..cb4bdafe0b7 100644 --- a/pkg/querier/bucket_store_metrics_test.go +++ b/pkg/querier/bucket_store_metrics_test.go @@ -2,6 +2,7 @@ package querier import ( "bytes" + "fmt" "testing" "github.com/prometheus/client_golang/prometheus" @@ -188,6 +189,39 @@ func TestTsdbBucketStoreMetrics(t *testing.T) { require.NoError(t, err) } +func BenchmarkMetricsCollections10(b *testing.B) { + benchmarkMetricsCollection(b, 10) +} + +func BenchmarkMetricsCollections100(b *testing.B) { + benchmarkMetricsCollection(b, 100) +} + +func BenchmarkMetricsCollections1000(b *testing.B) { + benchmarkMetricsCollection(b, 1000) +} + +func BenchmarkMetricsCollections10000(b *testing.B) { + benchmarkMetricsCollection(b, 10000) +} + +func benchmarkMetricsCollection(b 
*testing.B, users int) { + mainReg := prometheus.NewRegistry() + + tsdbMetrics := newTSDBBucketStoreMetrics() + mainReg.MustRegister(tsdbMetrics) + + base := 123456.0 + for i := 0; i < users; i++ { + tsdbMetrics.addUserRegistry(fmt.Sprintf("user-%d", i), populateTSDBBucketStore(base*float64(i))) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + mainReg.Gather() + } +} + func populateTSDBBucketStore(base float64) *prometheus.Registry { reg := prometheus.NewRegistry() m := newBucketStoreMetrics(reg) From 2ec4fa165e490f51fb5c9e54f3fafd35a57333d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 20 Jan 2020 10:55:46 +0100 Subject: [PATCH 14/27] Updated CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3183509ae9..3fcd6ac228d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,8 @@ instructions below to upgrade your Postgres. * [ENHANCEMENT] Experimental TSDB: Open existing TSDB on startup to prevent ingester from becoming ready before it can accept writes. #1917 * `--experimental.tsdb.max-tsdb-opening-concurrency-on-startup` * [ENHANCEMENT] Experimental TSDB: Added `cortex_ingester_shipper_dir_syncs_total`, `cortex_ingester_shipper_dir_sync_failures_total`, `cortex_ingester_shipper_uploads_total` and `cortex_ingester_shipper_upload_failures_total` metrics from TSDB shipper component. #1983 -* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store (too many to list). #1996 +* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store (many metrics to list, but all have `cortex_bucket_store_` prefix). 
#1996 +* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos in memory index cache (many metrics to list, but all have `cortex_store_index_cache_` prefix). #1996 * [BUGFIX] Fixed unnecessary CAS operations done by the HA tracker when the jitter is enabled. #1861 * [BUGFIX] Fixed #1904 ingesters getting stuck in a LEAVING state after coming up from an ungraceful exit. #1921 * [BUGFIX] Reduce memory usage when ingester Push() errors. #1922 From 9b446031ebbd32f1718833e995384d65c29409b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:39:37 +0100 Subject: [PATCH 15/27] Added tests to sum and getMetricsWithLabelNames functions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/util/metrics_helper_test.go | 73 +++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 pkg/util/metrics_helper_test.go diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go new file mode 100644 index 00000000000..745ce94a1a0 --- /dev/null +++ b/pkg/util/metrics_helper_test.go @@ -0,0 +1,73 @@ +package util + +import ( + "testing" + + "github.com/gogo/protobuf/proto" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" +) + +func TestSum(t *testing.T) { + require.Equal(t, float64(0), sum(nil, counterValue)) + require.Equal(t, float64(0), sum(&dto.MetricFamily{Metric: nil}, counterValue)) + require.Equal(t, float64(0), sum(&dto.MetricFamily{Metric: []*dto.Metric{{Counter: &dto.Counter{}}}}, counterValue)) + require.Equal(t, 12345.6789, sum(&dto.MetricFamily{Metric: []*dto.Metric{{Counter: &dto.Counter{Value: proto.Float64(12345.6789)}}}}, counterValue)) + require.Equal(t, 20235.80235, sum(&dto.MetricFamily{Metric: []*dto.Metric{ + {Counter: &dto.Counter{Value: proto.Float64(12345.6789)}}, + {Counter: &dto.Counter{Value: proto.Float64(7890.12345)}}, + }}, 
counterValue)) + // using 'counterValue' as function only sums counters + require.Equal(t, float64(0), sum(&dto.MetricFamily{Metric: []*dto.Metric{ + {Gauge: &dto.Gauge{Value: proto.Float64(12345.6789)}}, + {Gauge: &dto.Gauge{Value: proto.Float64(7890.12345)}}, + }}, counterValue)) +} + +func TestCounterValue(t *testing.T) { + require.Equal(t, float64(0), counterValue(nil)) + require.Equal(t, float64(0), counterValue(&dto.Metric{})) + require.Equal(t, float64(0), counterValue(&dto.Metric{Counter: &dto.Counter{}})) + require.Equal(t, float64(543857.12837), counterValue(&dto.Metric{Counter: &dto.Counter{Value: proto.Float64(543857.12837)}})) +} + +func TestGetMetricsWithLabelNames(t *testing.T) { + labels := []string{"a", "b"} + + require.Equal(t, map[string]metricsWithLabels{}, getMetricsWithLabelNames(nil, labels)) + require.Equal(t, map[string]metricsWithLabels{}, getMetricsWithLabelNames(&dto.MetricFamily{}, labels)) + + m1 := &dto.Metric{Label: makeLabels("a", "5"), Counter: &dto.Counter{Value: proto.Float64(1)}} + m2 := &dto.Metric{Label: makeLabels("a", "10", "b", "20"), Counter: &dto.Counter{Value: proto.Float64(1.5)}} + m3 := &dto.Metric{Label: makeLabels("a", "10", "b", "20", "c", "1"), Counter: &dto.Counter{Value: proto.Float64(2)}} + m4 := &dto.Metric{Label: makeLabels("a", "10", "b", "20", "c", "2"), Counter: &dto.Counter{Value: proto.Float64(3)}} + m5 := &dto.Metric{Label: makeLabels("a", "11", "b", "21"), Counter: &dto.Counter{Value: proto.Float64(4)}} + m6 := &dto.Metric{Label: makeLabels("ignored", "123", "a", "12", "b", "22", "c", "30"), Counter: &dto.Counter{Value: proto.Float64(4)}} + + out := getMetricsWithLabelNames(&dto.MetricFamily{Metric: []*dto.Metric{m1, m2, m3, m4, m5, m6}}, labels) + + require.Equal(t, map[string]metricsWithLabels{ + getLabelsString([]string{"10", "20"}): { + labelValues: []string{"10", "20"}, + metrics: []*dto.Metric{m2, m3, m4}}, + getLabelsString([]string{"11", "21"}): { + labelValues: []string{"11", "21"}, + metrics: 
[]*dto.Metric{m5}}, + getLabelsString([]string{"12", "22"}): { + labelValues: []string{"12", "22"}, + metrics: []*dto.Metric{m6}}, + }, out) +} + +func makeLabels(namesAndValues ...string) []*dto.LabelPair { + out := []*dto.LabelPair(nil) + + for i := 0; i+1 < len(namesAndValues); i = i + 2 { + out = append(out, &dto.LabelPair{ + Name: proto.String(namesAndValues[i]), + Value: proto.String(namesAndValues[i+1]), + }) + } + + return out +} From eee9e51d6f5c4d2824fb2f8c4b15af7aab249908 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:40:52 +0100 Subject: [PATCH 16/27] Fixes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed sum for nil input. Extracted common code for sum of counters/gauges with labels. Fixed gauge output (was reported as counter) Signed-off-by: Peter Štibraný --- pkg/querier/bucket_store_metrics_test.go | 6 +-- pkg/util/metrics_helper.go | 61 +++++++++--------------- 2 files changed, 26 insertions(+), 41 deletions(-) diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go index cb4bdafe0b7..39a5a70f210 100644 --- a/pkg/querier/bucket_store_metrics_test.go +++ b/pkg/querier/bucket_store_metrics_test.go @@ -166,17 +166,17 @@ func TestTsdbBucketStoreMetrics(t *testing.T) { cortex_store_index_cache_items_added_total{item_type="Series"} 1283583 # HELP cortex_store_index_cache_items TSDB: Current number of items in the index cache. - # TYPE cortex_store_index_cache_items counter + # TYPE cortex_store_index_cache_items gauge cortex_store_index_cache_items{item_type="Postings"} 1306102 cortex_store_index_cache_items{item_type="Series"} 1328621 # HELP cortex_store_index_cache_items_size_bytes TSDB: Current byte size of items in the index cache. 
- # TYPE cortex_store_index_cache_items_size_bytes counter + # TYPE cortex_store_index_cache_items_size_bytes gauge cortex_store_index_cache_items_size_bytes{item_type="Postings"} 1351140 cortex_store_index_cache_items_size_bytes{item_type="Series"} 1373659 # HELP cortex_store_index_cache_total_size_bytes TSDB: Current byte size of items (both value and key) in the index cache. - # TYPE cortex_store_index_cache_total_size_bytes counter + # TYPE cortex_store_index_cache_total_size_bytes gauge cortex_store_index_cache_total_size_bytes{item_type="Postings"} 1396178 cortex_store_index_cache_total_size_bytes{item_type="Series"} 1418697 diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index aca8eda98d2..cb0ac469682 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -66,29 +66,7 @@ func (d MetricFamiliesPerUser) SendSumOfCounters(out chan<- prometheus.Metric, d } func (d MetricFamiliesPerUser) SendSumOfCountersWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string, labelNames ...string) { - type counterResult struct { - value float64 - labelValues []string - } - - result := map[string]counterResult{} - - for _, userMetrics := range d { // for each user - metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[counter], labelNames) - - for key, mlv := range metricsPerLabelValue { - for _, m := range mlv.metrics { - r := result[key] - if r.labelValues == nil { - r.labelValues = mlv.labelValues - } - - r.value += m.GetCounter().GetValue() - result[key] = r - } - } - } - + result := d.sumOfSingleValuesWithLabels(counter, counterValue, labelNames) for _, cr := range result { out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, cr.value, cr.labelValues...) 
} @@ -110,16 +88,23 @@ func (d MetricFamiliesPerUser) SendSumOfGauges(out chan<- prometheus.Metric, des out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, result) } -func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string, labelNames ...string) { - type gaugeResult struct { - value float64 - labelValues []string +func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, gauge string, labelNames ...string) { + result := d.sumOfSingleValuesWithLabels(gauge, gaugeValue, labelNames) + for _, cr := range result { + out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, cr.value, cr.labelValues...) } +} + +type singleResult struct { + value float64 + labelValues []string +} - result := map[string]gaugeResult{} +func (d MetricFamiliesPerUser) sumOfSingleValuesWithLabels(metric string, fn func(*dto.Metric) float64, labelNames []string) map[string]singleResult { + result := map[string]singleResult{} - for _, userMetrics := range d { // for each user - metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[counter], labelNames) + for _, userMetrics := range d { + metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[metric], labelNames) for key, mlv := range metricsPerLabelValue { for _, m := range mlv.metrics { @@ -128,15 +113,13 @@ func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.M r.labelValues = mlv.labelValues } - r.value += m.GetGauge().GetValue() + r.value += fn(m) result[key] = r } } } - for _, cr := range result { - out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, cr.value, cr.labelValues...) 
- } + return result } func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, desc *prometheus.Desc, summaryName string) { @@ -146,7 +129,7 @@ func (d MetricFamiliesPerUser) SendSumOfSummaries(out chan<- prometheus.Metric, quantiles map[float64]float64 ) - for _, userMetrics := range d { // for each user + for _, userMetrics := range d { for _, m := range userMetrics[summaryName].GetMetric() { summary := m.GetSummary() sampleCount += summary.GetSampleCount() @@ -168,7 +151,7 @@ func (d MetricFamiliesPerUser) SendSumOfSummariesWithLabels(out chan<- prometheu result := map[string]summaryResult{} - for _, userMetrics := range d { // for each user + for _, userMetrics := range d { metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[summaryName], labelNames) for key, mwl := range metricsPerLabelValue { @@ -200,7 +183,7 @@ func (d MetricFamiliesPerUser) SendSumOfHistograms(out chan<- prometheus.Metric, buckets map[float64]uint64 ) - for _, userMetrics := range d { // for each user + for _, userMetrics := range d { for _, m := range userMetrics[histogramName].GetMetric() { histo := m.GetHistogram() sampleCount += histo.GetSampleCount() @@ -298,9 +281,11 @@ func getLabelsString(labelValues []string) string { return buf.String() } +// sum returns sum of values from all metrics from same metric family (= series with the same metric name, but different labels) +// Supplied function extracts value. func sum(mf *dto.MetricFamily, fn func(*dto.Metric) float64) float64 { result := float64(0) - for _, m := range mf.Metric { + for _, m := range mf.GetMetric() { result += fn(m) } return result From ece88df83d5c1478c23f2edadb7af242477d9646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:42:36 +0100 Subject: [PATCH 17/27] Replaced cortex_bucket_store prefix with cortex_querier_bucket_store. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- pkg/querier/block_store_metrics.go | 28 +-- pkg/querier/bucket_store_metrics_test.go | 244 +++++++++++------------ 3 files changed, 137 insertions(+), 137 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fcd6ac228d..6bd0f111e00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ instructions below to upgrade your Postgres. * [ENHANCEMENT] Experimental TSDB: Open existing TSDB on startup to prevent ingester from becoming ready before it can accept writes. #1917 * `--experimental.tsdb.max-tsdb-opening-concurrency-on-startup` * [ENHANCEMENT] Experimental TSDB: Added `cortex_ingester_shipper_dir_syncs_total`, `cortex_ingester_shipper_dir_sync_failures_total`, `cortex_ingester_shipper_uploads_total` and `cortex_ingester_shipper_upload_failures_total` metrics from TSDB shipper component. #1983 -* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store (many metrics to list, but all have `cortex_bucket_store_` prefix). #1996 +* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store (many metrics to list, but all have `cortex_querier_bucket_store_` prefix). #1996 * [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos in memory index cache (many metrics to list, but all have `cortex_store_index_cache_` prefix). #1996 * [BUGFIX] Fixed unnecessary CAS operations done by the HA tracker when the jitter is enabled. #1861 * [BUGFIX] Fixed #1904 ingesters getting stuck in a LEAVING state after coming up from an ungraceful exit. 
#1921 diff --git a/pkg/querier/block_store_metrics.go b/pkg/querier/block_store_metrics.go index 27a9c8f6ed7..6626d0dbb30 100644 --- a/pkg/querier/block_store_metrics.go +++ b/pkg/querier/block_store_metrics.go @@ -50,60 +50,60 @@ func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics { regs: map[string]*prometheus.Registry{}, blockLoads: prometheus.NewDesc( - "cortex_bucket_store_block_loads_total", + "cortex_querier_bucket_store_block_loads_total", "TSDB: Total number of remote block loading attempts.", nil, nil), blockLoadFailures: prometheus.NewDesc( - "cortex_bucket_store_block_load_failures_total", + "cortex_querier_bucket_store_block_load_failures_total", "TSDB: Total number of failed remote block loading attempts.", nil, nil), blockDrops: prometheus.NewDesc( - "cortex_bucket_store_block_drops_total", + "cortex_querier_bucket_store_block_drops_total", "TSDB: Total number of local blocks that were dropped.", nil, nil), blockDropFailures: prometheus.NewDesc( - "cortex_bucket_store_block_drop_failures_total", + "cortex_querier_bucket_store_block_drop_failures_total", "TSDB: Total number of local blocks that failed to be dropped.", nil, nil), blocksLoaded: prometheus.NewDesc( - "cortex_bucket_store_blocks_loaded", + "cortex_querier_bucket_store_blocks_loaded", "TSDB: Number of currently loaded blocks.", nil, nil), seriesDataTouched: prometheus.NewDesc( - "cortex_bucket_store_series_data_touched", + "cortex_querier_bucket_store_series_data_touched", "TSDB: How many items of a data type in a block were touched for a single series request.", []string{"data_type"}, nil), seriesDataFetched: prometheus.NewDesc( - "cortex_bucket_store_series_data_fetched", + "cortex_querier_bucket_store_series_data_fetched", "TSDB: How many items of a data type in a block were fetched for a single series request.", []string{"data_type"}, nil), seriesDataSizeTouched: prometheus.NewDesc( - "cortex_bucket_store_series_data_size_touched_bytes", + 
"cortex_querier_bucket_store_series_data_size_touched_bytes", "TSDB: Size of all items of a data type in a block were touched for a single series request.", []string{"data_type"}, nil), seriesDataSizeFetched: prometheus.NewDesc( - "cortex_bucket_store_series_data_size_fetched_bytes", + "cortex_querier_bucket_store_series_data_size_fetched_bytes", "TSDB: Size of all items of a data type in a block were fetched for a single series request.", []string{"data_type"}, nil), seriesBlocksQueried: prometheus.NewDesc( - "cortex_bucket_store_series_blocks_queried", + "cortex_querier_bucket_store_series_blocks_queried", "TSDB: Number of blocks in a bucket store that were touched to satisfy a query.", nil, nil), seriesGetAllDuration: prometheus.NewDesc( - "cortex_bucket_store_series_get_all_duration_seconds", + "cortex_querier_bucket_store_series_get_all_duration_seconds", "TSDB: Time it takes until all per-block prepares and preloads for a query are finished.", nil, nil), seriesMergeDuration: prometheus.NewDesc( - "cortex_bucket_store_series_merge_duration_seconds", + "cortex_querier_bucket_store_series_merge_duration_seconds", "TSDB: Time it takes to merge sub-results from all queried blocks into a single result.", nil, nil), resultSeriesCount: prometheus.NewDesc( - "cortex_bucket_store_series_result_series", + "cortex_querier_bucket_store_series_result_series", "Number of series observed in the final result of a query.", nil, nil), chunkSizeBytes: prometheus.NewDesc( - "cortex_bucket_store_sent_chunk_size_bytes", + "cortex_querier_bucket_store_sent_chunk_size_bytes", "TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", nil, nil), diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go index 39a5a70f210..1ad55badd16 100644 --- a/pkg/querier/bucket_store_metrics_test.go +++ b/pkg/querier/bucket_store_metrics_test.go @@ -22,128 +22,128 @@ func TestTsdbBucketStoreMetrics(t 
*testing.T) { //noinspection ALL err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` - # HELP cortex_bucket_store_blocks_loaded TSDB: Number of currently loaded blocks. - # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 22519 - - # HELP cortex_bucket_store_block_loads_total TSDB: Total number of remote block loading attempts. - # TYPE cortex_bucket_store_block_loads_total counter - cortex_bucket_store_block_loads_total 45038 - - # HELP cortex_bucket_store_block_load_failures_total TSDB: Total number of failed remote block loading attempts. - # TYPE cortex_bucket_store_block_load_failures_total counter - cortex_bucket_store_block_load_failures_total 67557 - - # HELP cortex_bucket_store_block_drops_total TSDB: Total number of local blocks that were dropped. - # TYPE cortex_bucket_store_block_drops_total counter - cortex_bucket_store_block_drops_total 90076 - - # HELP cortex_bucket_store_block_drop_failures_total TSDB: Total number of local blocks that failed to be dropped. - # TYPE cortex_bucket_store_block_drop_failures_total counter - cortex_bucket_store_block_drop_failures_total 112595 - - # HELP cortex_bucket_store_sent_chunk_size_bytes TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier. 
- # TYPE cortex_bucket_store_sent_chunk_size_bytes histogram - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="32"} 0 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="256"} 0 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="512"} 0 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="1024"} 0 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="32768"} 0 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="262144"} 4 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="524288"} 6 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="1.048576e+06"} 6 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="3.3554432e+07"} 6 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="2.68435456e+08"} 6 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="5.36870912e+08"} 6 - cortex_bucket_store_sent_chunk_size_bytes_bucket{le="+Inf"} 6 - cortex_bucket_store_sent_chunk_size_bytes_sum 1.328621e+06 - cortex_bucket_store_sent_chunk_size_bytes_count 6 - - # HELP cortex_bucket_store_series_blocks_queried TSDB: Number of blocks in a bucket store that were touched to satisfy a query. - # TYPE cortex_bucket_store_series_blocks_queried summary - cortex_bucket_store_series_blocks_queried_sum 1.283583e+06 - cortex_bucket_store_series_blocks_queried_count 9 - - # HELP cortex_bucket_store_series_data_fetched TSDB: How many items of a data type in a block were fetched for a single series request. 
- # TYPE cortex_bucket_store_series_data_fetched summary - cortex_bucket_store_series_data_fetched_sum{data_type="fetched-a"} 202671 - cortex_bucket_store_series_data_fetched_count{data_type="fetched-a"} 3 - cortex_bucket_store_series_data_fetched_sum{data_type="fetched-b"} 225190 - cortex_bucket_store_series_data_fetched_count{data_type="fetched-b"} 3 - cortex_bucket_store_series_data_fetched_sum{data_type="fetched-c"} 247709 - cortex_bucket_store_series_data_fetched_count{data_type="fetched-c"} 3 - - # HELP cortex_bucket_store_series_data_size_fetched_bytes TSDB: Size of all items of a data type in a block were fetched for a single series request. - # TYPE cortex_bucket_store_series_data_size_fetched_bytes summary - cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-a"} 337785 - cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-a"} 3 - cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-b"} 360304 - cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-b"} 3 - cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-c"} 382823 - cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-c"} 3 - - # HELP cortex_bucket_store_series_data_size_touched_bytes TSDB: Size of all items of a data type in a block were touched for a single series request. 
- # TYPE cortex_bucket_store_series_data_size_touched_bytes summary - cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-a"} 270228 - cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-a"} 3 - cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-b"} 292747 - cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-b"} 3 - cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-c"} 315266 - cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-c"} 3 - - # HELP cortex_bucket_store_series_data_touched TSDB: How many items of a data type in a block were touched for a single series request. - # TYPE cortex_bucket_store_series_data_touched summary - cortex_bucket_store_series_data_touched_sum{data_type="touched-a"} 135114 - cortex_bucket_store_series_data_touched_count{data_type="touched-a"} 3 - cortex_bucket_store_series_data_touched_sum{data_type="touched-b"} 157633 - cortex_bucket_store_series_data_touched_count{data_type="touched-b"} 3 - cortex_bucket_store_series_data_touched_sum{data_type="touched-c"} 180152 - cortex_bucket_store_series_data_touched_count{data_type="touched-c"} 3 - - # HELP cortex_bucket_store_series_get_all_duration_seconds TSDB: Time it takes until all per-block prepares and preloads for a query are finished. 
- # TYPE cortex_bucket_store_series_get_all_duration_seconds histogram - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.001"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.01"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.1"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.3"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.6"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="1"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="3"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="6"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="9"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="20"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="30"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="60"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="90"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="120"} 0 - cortex_bucket_store_series_get_all_duration_seconds_bucket{le="+Inf"} 9 - cortex_bucket_store_series_get_all_duration_seconds_sum 1.486254e+06 - cortex_bucket_store_series_get_all_duration_seconds_count 9 - - # HELP cortex_bucket_store_series_merge_duration_seconds TSDB: Time it takes to merge sub-results from all queried blocks into a single result. 
- # TYPE cortex_bucket_store_series_merge_duration_seconds histogram - cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.001"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.01"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.1"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.3"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.6"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="1"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="3"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="6"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="9"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="20"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="30"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="60"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="90"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="120"} 0 - cortex_bucket_store_series_merge_duration_seconds_bucket{le="+Inf"} 9 - cortex_bucket_store_series_merge_duration_seconds_sum 1.688925e+06 - cortex_bucket_store_series_merge_duration_seconds_count 9 - - # HELP cortex_bucket_store_series_result_series Number of series observed in the final result of a query. - # TYPE cortex_bucket_store_series_result_series summary - cortex_bucket_store_series_result_series_sum 1.238545e+06 - cortex_bucket_store_series_result_series_count 6 + # HELP cortex_querier_bucket_store_blocks_loaded TSDB: Number of currently loaded blocks. + # TYPE cortex_querier_bucket_store_blocks_loaded gauge + cortex_querier_bucket_store_blocks_loaded 22519 + + # HELP cortex_querier_bucket_store_block_loads_total TSDB: Total number of remote block loading attempts. 
+ # TYPE cortex_querier_bucket_store_block_loads_total counter + cortex_querier_bucket_store_block_loads_total 45038 + + # HELP cortex_querier_bucket_store_block_load_failures_total TSDB: Total number of failed remote block loading attempts. + # TYPE cortex_querier_bucket_store_block_load_failures_total counter + cortex_querier_bucket_store_block_load_failures_total 67557 + + # HELP cortex_querier_bucket_store_block_drops_total TSDB: Total number of local blocks that were dropped. + # TYPE cortex_querier_bucket_store_block_drops_total counter + cortex_querier_bucket_store_block_drops_total 90076 + + # HELP cortex_querier_bucket_store_block_drop_failures_total TSDB: Total number of local blocks that failed to be dropped. + # TYPE cortex_querier_bucket_store_block_drop_failures_total counter + cortex_querier_bucket_store_block_drop_failures_total 112595 + + # HELP cortex_querier_bucket_store_sent_chunk_size_bytes TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier. 
+ # TYPE cortex_querier_bucket_store_sent_chunk_size_bytes histogram + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="32"} 0 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="256"} 0 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="512"} 0 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="1024"} 0 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="32768"} 0 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="262144"} 4 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="524288"} 6 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="1.048576e+06"} 6 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="3.3554432e+07"} 6 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="2.68435456e+08"} 6 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="5.36870912e+08"} 6 + cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="+Inf"} 6 + cortex_querier_bucket_store_sent_chunk_size_bytes_sum 1.328621e+06 + cortex_querier_bucket_store_sent_chunk_size_bytes_count 6 + + # HELP cortex_querier_bucket_store_series_blocks_queried TSDB: Number of blocks in a bucket store that were touched to satisfy a query. + # TYPE cortex_querier_bucket_store_series_blocks_queried summary + cortex_querier_bucket_store_series_blocks_queried_sum 1.283583e+06 + cortex_querier_bucket_store_series_blocks_queried_count 9 + + # HELP cortex_querier_bucket_store_series_data_fetched TSDB: How many items of a data type in a block were fetched for a single series request. 
+ # TYPE cortex_querier_bucket_store_series_data_fetched summary + cortex_querier_bucket_store_series_data_fetched_sum{data_type="fetched-a"} 202671 + cortex_querier_bucket_store_series_data_fetched_count{data_type="fetched-a"} 3 + cortex_querier_bucket_store_series_data_fetched_sum{data_type="fetched-b"} 225190 + cortex_querier_bucket_store_series_data_fetched_count{data_type="fetched-b"} 3 + cortex_querier_bucket_store_series_data_fetched_sum{data_type="fetched-c"} 247709 + cortex_querier_bucket_store_series_data_fetched_count{data_type="fetched-c"} 3 + + # HELP cortex_querier_bucket_store_series_data_size_fetched_bytes TSDB: Size of all items of a data type in a block were fetched for a single series request. + # TYPE cortex_querier_bucket_store_series_data_size_fetched_bytes summary + cortex_querier_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-a"} 337785 + cortex_querier_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-a"} 3 + cortex_querier_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-b"} 360304 + cortex_querier_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-b"} 3 + cortex_querier_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-c"} 382823 + cortex_querier_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-c"} 3 + + # HELP cortex_querier_bucket_store_series_data_size_touched_bytes TSDB: Size of all items of a data type in a block were touched for a single series request. 
+ # TYPE cortex_querier_bucket_store_series_data_size_touched_bytes summary + cortex_querier_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-a"} 270228 + cortex_querier_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-a"} 3 + cortex_querier_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-b"} 292747 + cortex_querier_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-b"} 3 + cortex_querier_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-c"} 315266 + cortex_querier_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-c"} 3 + + # HELP cortex_querier_bucket_store_series_data_touched TSDB: How many items of a data type in a block were touched for a single series request. + # TYPE cortex_querier_bucket_store_series_data_touched summary + cortex_querier_bucket_store_series_data_touched_sum{data_type="touched-a"} 135114 + cortex_querier_bucket_store_series_data_touched_count{data_type="touched-a"} 3 + cortex_querier_bucket_store_series_data_touched_sum{data_type="touched-b"} 157633 + cortex_querier_bucket_store_series_data_touched_count{data_type="touched-b"} 3 + cortex_querier_bucket_store_series_data_touched_sum{data_type="touched-c"} 180152 + cortex_querier_bucket_store_series_data_touched_count{data_type="touched-c"} 3 + + # HELP cortex_querier_bucket_store_series_get_all_duration_seconds TSDB: Time it takes until all per-block prepares and preloads for a query are finished. 
+ # TYPE cortex_querier_bucket_store_series_get_all_duration_seconds histogram + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="0.001"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="0.01"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="0.1"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="0.3"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="0.6"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="1"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="3"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="6"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="9"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="20"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="30"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="60"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="90"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="120"} 0 + cortex_querier_bucket_store_series_get_all_duration_seconds_bucket{le="+Inf"} 9 + cortex_querier_bucket_store_series_get_all_duration_seconds_sum 1.486254e+06 + cortex_querier_bucket_store_series_get_all_duration_seconds_count 9 + + # HELP cortex_querier_bucket_store_series_merge_duration_seconds TSDB: Time it takes to merge sub-results from all queried blocks into a single result. 
+ # TYPE cortex_querier_bucket_store_series_merge_duration_seconds histogram + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="0.001"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="0.01"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="0.1"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="0.3"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="0.6"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="1"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="3"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="6"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="9"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="20"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="30"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="60"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="90"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="120"} 0 + cortex_querier_bucket_store_series_merge_duration_seconds_bucket{le="+Inf"} 9 + cortex_querier_bucket_store_series_merge_duration_seconds_sum 1.688925e+06 + cortex_querier_bucket_store_series_merge_duration_seconds_count 9 + + # HELP cortex_querier_bucket_store_series_result_series Number of series observed in the final result of a query. + # TYPE cortex_querier_bucket_store_series_result_series summary + cortex_querier_bucket_store_series_result_series_sum 1.238545e+06 + cortex_querier_bucket_store_series_result_series_count 6 # HELP cortex_store_index_cache_items_evicted_total TSDB: Total number of items that were evicted from the index cache. 
# TYPE cortex_store_index_cache_items_evicted_total counter From 2ff02407f4d06a432a43cdbc60b7427edc6b2254 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:44:07 +0100 Subject: [PATCH 18/27] Removed uninteresting cortex_querier_bucket_store_sent_chunk_size_bytes metric. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block_store_metrics.go | 7 ------- pkg/querier/bucket_store_metrics_test.go | 17 ----------------- 2 files changed, 24 deletions(-) diff --git a/pkg/querier/block_store_metrics.go b/pkg/querier/block_store_metrics.go index 6626d0dbb30..e8c68cb58dd 100644 --- a/pkg/querier/block_store_metrics.go +++ b/pkg/querier/block_store_metrics.go @@ -28,7 +28,6 @@ type tsdbBucketStoreMetrics struct { seriesGetAllDuration *prometheus.Desc seriesMergeDuration *prometheus.Desc resultSeriesCount *prometheus.Desc - chunkSizeBytes *prometheus.Desc // Metrics gathered from Thanos storecache.InMemoryIndexCache cacheItemsEvicted *prometheus.Desc @@ -102,10 +101,6 @@ func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics { "cortex_querier_bucket_store_series_result_series", "Number of series observed in the final result of a query.", nil, nil), - chunkSizeBytes: prometheus.NewDesc( - "cortex_querier_bucket_store_sent_chunk_size_bytes", - "TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", - nil, nil), // Cache cacheItemsEvicted: prometheus.NewDesc( @@ -175,7 +170,6 @@ func (m *tsdbBucketStoreMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.seriesGetAllDuration out <- m.seriesMergeDuration out <- m.resultSeriesCount - out <- m.chunkSizeBytes out <- m.cacheItemsEvicted out <- m.cacheItemsAdded @@ -206,7 +200,6 @@ func (m *tsdbBucketStoreMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfHistograms(out, m.seriesGetAllDuration, 
"thanos_bucket_store_series_get_all_duration_seconds") data.SendSumOfHistograms(out, m.seriesMergeDuration, "thanos_bucket_store_series_merge_duration_seconds") data.SendSumOfSummaries(out, m.resultSeriesCount, "thanos_bucket_store_series_result_series") - data.SendSumOfHistograms(out, m.chunkSizeBytes, "thanos_bucket_store_sent_chunk_size_bytes") data.SendSumOfCountersWithLabels(out, m.cacheItemsEvicted, "thanos_store_index_cache_items_evicted_total", "item_type") data.SendSumOfCountersWithLabels(out, m.cacheItemsAdded, "thanos_store_index_cache_items_added_total", "item_type") diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go index 1ad55badd16..afec36e757b 100644 --- a/pkg/querier/bucket_store_metrics_test.go +++ b/pkg/querier/bucket_store_metrics_test.go @@ -42,23 +42,6 @@ func TestTsdbBucketStoreMetrics(t *testing.T) { # TYPE cortex_querier_bucket_store_block_drop_failures_total counter cortex_querier_bucket_store_block_drop_failures_total 112595 - # HELP cortex_querier_bucket_store_sent_chunk_size_bytes TSDB: Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier. 
- # TYPE cortex_querier_bucket_store_sent_chunk_size_bytes histogram - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="32"} 0 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="256"} 0 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="512"} 0 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="1024"} 0 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="32768"} 0 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="262144"} 4 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="524288"} 6 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="1.048576e+06"} 6 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="3.3554432e+07"} 6 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="2.68435456e+08"} 6 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="5.36870912e+08"} 6 - cortex_querier_bucket_store_sent_chunk_size_bytes_bucket{le="+Inf"} 6 - cortex_querier_bucket_store_sent_chunk_size_bytes_sum 1.328621e+06 - cortex_querier_bucket_store_sent_chunk_size_bytes_count 6 - # HELP cortex_querier_bucket_store_series_blocks_queried TSDB: Number of blocks in a bucket store that were touched to satisfy a query. 
# TYPE cortex_querier_bucket_store_series_blocks_queried summary cortex_querier_bucket_store_series_blocks_queried_sum 1.283583e+06 From 92050ab5b0f0df0434f3caffb9269620df152077 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:44:54 +0100 Subject: [PATCH 19/27] Replaced cortex_store_index_cache prefix with cortex_querier_blocks_index_cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- pkg/querier/block_store_metrics.go | 16 ++--- pkg/querier/bucket_store_metrics_test.go | 78 ++++++++++++------------ 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bd0f111e00..b7897f32beb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,7 +34,7 @@ instructions below to upgrade your Postgres. * `--experimental.tsdb.max-tsdb-opening-concurrency-on-startup` * [ENHANCEMENT] Experimental TSDB: Added `cortex_ingester_shipper_dir_syncs_total`, `cortex_ingester_shipper_dir_sync_failures_total`, `cortex_ingester_shipper_uploads_total` and `cortex_ingester_shipper_upload_failures_total` metrics from TSDB shipper component. #1983 * [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store (many metrics to list, but all have `cortex_querier_bucket_store_` prefix). #1996 -* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos in memory index cache (many metrics to list, but all have `cortex_store_index_cache_` prefix). #1996 +* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos in memory index cache (many metrics to list, but all have `cortex_querier_blocks_index_cache_` prefix). #1996 * [BUGFIX] Fixed unnecessary CAS operations done by the HA tracker when the jitter is enabled. #1861 * [BUGFIX] Fixed #1904 ingesters getting stuck in a LEAVING state after coming up from an ungraceful exit. 
#1921 * [BUGFIX] Reduce memory usage when ingester Push() errors. #1922 diff --git a/pkg/querier/block_store_metrics.go b/pkg/querier/block_store_metrics.go index e8c68cb58dd..6b2c5733ab0 100644 --- a/pkg/querier/block_store_metrics.go +++ b/pkg/querier/block_store_metrics.go @@ -104,35 +104,35 @@ func newTSDBBucketStoreMetrics() *tsdbBucketStoreMetrics { // Cache cacheItemsEvicted: prometheus.NewDesc( - "cortex_store_index_cache_items_evicted_total", + "cortex_querier_blocks_index_cache_items_evicted_total", "TSDB: Total number of items that were evicted from the index cache.", []string{"item_type"}, nil), cacheItemsAdded: prometheus.NewDesc( - "cortex_store_index_cache_items_added_total", + "cortex_querier_blocks_index_cache_items_added_total", "TSDB: Total number of items that were added to the index cache.", []string{"item_type"}, nil), cacheRequests: prometheus.NewDesc( - "cortex_store_index_cache_requests_total", + "cortex_querier_blocks_index_cache_requests_total", "TSDB: Total number of requests to the cache.", []string{"item_type"}, nil), cacheItemsOverflow: prometheus.NewDesc( - "cortex_store_index_cache_items_overflowed_total", + "cortex_querier_blocks_index_cache_items_overflowed_total", "TSDB: Total number of items that could not be added to the cache due to being too big.", []string{"item_type"}, nil), cacheHits: prometheus.NewDesc( - "cortex_store_index_cache_hits_total", + "cortex_querier_blocks_index_cache_hits_total", "TSDB: Total number of requests to the cache that were a hit.", []string{"item_type"}, nil), cacheItemsCurrentCount: prometheus.NewDesc( - "cortex_store_index_cache_items", + "cortex_querier_blocks_index_cache_items", "TSDB: Current number of items in the index cache.", []string{"item_type"}, nil), cacheItemsCurrentSize: prometheus.NewDesc( - "cortex_store_index_cache_items_size_bytes", + "cortex_querier_blocks_index_cache_items_size_bytes", "TSDB: Current byte size of items in the index cache.", []string{"item_type"}, nil), 
cacheItemsTotalCurrentSize: prometheus.NewDesc( - "cortex_store_index_cache_total_size_bytes", + "cortex_querier_blocks_index_cache_total_size_bytes", "TSDB: Current byte size of items (both value and key) in the index cache.", []string{"item_type"}, nil), } diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go index afec36e757b..69f50d2e7cd 100644 --- a/pkg/querier/bucket_store_metrics_test.go +++ b/pkg/querier/bucket_store_metrics_test.go @@ -128,45 +128,45 @@ func TestTsdbBucketStoreMetrics(t *testing.T) { cortex_querier_bucket_store_series_result_series_sum 1.238545e+06 cortex_querier_bucket_store_series_result_series_count 6 - # HELP cortex_store_index_cache_items_evicted_total TSDB: Total number of items that were evicted from the index cache. - # TYPE cortex_store_index_cache_items_evicted_total counter - cortex_store_index_cache_items_evicted_total{item_type="Postings"} 1125950 - cortex_store_index_cache_items_evicted_total{item_type="Series"} 1148469 - - # HELP cortex_store_index_cache_requests_total TSDB: Total number of requests to the cache. - # TYPE cortex_store_index_cache_requests_total counter - cortex_store_index_cache_requests_total{item_type="Postings"} 1170988 - cortex_store_index_cache_requests_total{item_type="Series"} 1193507 - - # HELP cortex_store_index_cache_hits_total TSDB: Total number of requests to the cache that were a hit. - # TYPE cortex_store_index_cache_hits_total counter - cortex_store_index_cache_hits_total{item_type="Postings"} 1216026 - cortex_store_index_cache_hits_total{item_type="Series"} 1238545 - - # HELP cortex_store_index_cache_items_added_total TSDB: Total number of items that were added to the index cache. 
- # TYPE cortex_store_index_cache_items_added_total counter - cortex_store_index_cache_items_added_total{item_type="Postings"} 1261064 - cortex_store_index_cache_items_added_total{item_type="Series"} 1283583 - - # HELP cortex_store_index_cache_items TSDB: Current number of items in the index cache. - # TYPE cortex_store_index_cache_items gauge - cortex_store_index_cache_items{item_type="Postings"} 1306102 - cortex_store_index_cache_items{item_type="Series"} 1328621 - - # HELP cortex_store_index_cache_items_size_bytes TSDB: Current byte size of items in the index cache. - # TYPE cortex_store_index_cache_items_size_bytes gauge - cortex_store_index_cache_items_size_bytes{item_type="Postings"} 1351140 - cortex_store_index_cache_items_size_bytes{item_type="Series"} 1373659 - - # HELP cortex_store_index_cache_total_size_bytes TSDB: Current byte size of items (both value and key) in the index cache. - # TYPE cortex_store_index_cache_total_size_bytes gauge - cortex_store_index_cache_total_size_bytes{item_type="Postings"} 1396178 - cortex_store_index_cache_total_size_bytes{item_type="Series"} 1418697 - - # HELP cortex_store_index_cache_items_overflowed_total TSDB: Total number of items that could not be added to the cache due to being too big. - # TYPE cortex_store_index_cache_items_overflowed_total counter - cortex_store_index_cache_items_overflowed_total{item_type="Postings"} 1441216 - cortex_store_index_cache_items_overflowed_total{item_type="Series"} 1463735 + # HELP cortex_querier_blocks_index_cache_items_evicted_total TSDB: Total number of items that were evicted from the index cache. + # TYPE cortex_querier_blocks_index_cache_items_evicted_total counter + cortex_querier_blocks_index_cache_items_evicted_total{item_type="Postings"} 1125950 + cortex_querier_blocks_index_cache_items_evicted_total{item_type="Series"} 1148469 + + # HELP cortex_querier_blocks_index_cache_requests_total TSDB: Total number of requests to the cache. 
+ # TYPE cortex_querier_blocks_index_cache_requests_total counter + cortex_querier_blocks_index_cache_requests_total{item_type="Postings"} 1170988 + cortex_querier_blocks_index_cache_requests_total{item_type="Series"} 1193507 + + # HELP cortex_querier_blocks_index_cache_hits_total TSDB: Total number of requests to the cache that were a hit. + # TYPE cortex_querier_blocks_index_cache_hits_total counter + cortex_querier_blocks_index_cache_hits_total{item_type="Postings"} 1216026 + cortex_querier_blocks_index_cache_hits_total{item_type="Series"} 1238545 + + # HELP cortex_querier_blocks_index_cache_items_added_total TSDB: Total number of items that were added to the index cache. + # TYPE cortex_querier_blocks_index_cache_items_added_total counter + cortex_querier_blocks_index_cache_items_added_total{item_type="Postings"} 1261064 + cortex_querier_blocks_index_cache_items_added_total{item_type="Series"} 1283583 + + # HELP cortex_querier_blocks_index_cache_items TSDB: Current number of items in the index cache. + # TYPE cortex_querier_blocks_index_cache_items gauge + cortex_querier_blocks_index_cache_items{item_type="Postings"} 1306102 + cortex_querier_blocks_index_cache_items{item_type="Series"} 1328621 + + # HELP cortex_querier_blocks_index_cache_items_size_bytes TSDB: Current byte size of items in the index cache. + # TYPE cortex_querier_blocks_index_cache_items_size_bytes gauge + cortex_querier_blocks_index_cache_items_size_bytes{item_type="Postings"} 1351140 + cortex_querier_blocks_index_cache_items_size_bytes{item_type="Series"} 1373659 + + # HELP cortex_querier_blocks_index_cache_total_size_bytes TSDB: Current byte size of items (both value and key) in the index cache. 
+ # TYPE cortex_querier_blocks_index_cache_total_size_bytes gauge + cortex_querier_blocks_index_cache_total_size_bytes{item_type="Postings"} 1396178 + cortex_querier_blocks_index_cache_total_size_bytes{item_type="Series"} 1418697 + + # HELP cortex_querier_blocks_index_cache_items_overflowed_total TSDB: Total number of items that could not be added to the cache due to being too big. + # TYPE cortex_querier_blocks_index_cache_items_overflowed_total counter + cortex_querier_blocks_index_cache_items_overflowed_total{item_type="Postings"} 1441216 + cortex_querier_blocks_index_cache_items_overflowed_total{item_type="Series"} 1463735 `)) require.NoError(t, err) From abaf8cb6d18cfc88fbb124a8b5e0ef24d2ce0ac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:50:14 +0100 Subject: [PATCH 20/27] Group metrics registration, and register only into non-nil registry. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/querier/block.go b/pkg/querier/block.go index d9843d95e3d..55886e4a328 100644 --- a/pkg/querier/block.go +++ b/pkg/querier/block.go @@ -39,14 +39,16 @@ func NewBlockQuerier(cfg tsdb.Config, logLevel logging.Level, r prometheus.Regis }), } - r.MustRegister(b.syncTimes) - us, err := NewUserStore(cfg, logLevel, util.Logger) if err != nil { return nil, err } b.us = us - r.MustRegister(us.tsdbMetrics) + + if r != nil { + r.MustRegister(b.syncTimes) + r.MustRegister(us.tsdbMetrics) + } level.Info(util.Logger).Log("msg", "synchronizing TSDB blocks for all users") if err := us.InitialSync(context.Background()); err != nil { From 88572026bd72a981d841aba94e2ec2d2f173b643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:50:24 +0100 Subject: [PATCH 21/27] Make message generic. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/util/metrics_helper.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index cb0ac469682..4957c40dede 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -25,7 +25,7 @@ func BuildMetricFamiliesPerUserFromUserRegistries(regs map[string]*prometheus.Re } if err != nil { - level.Warn(Logger).Log("msg", "failed to gather metrics from TSDB shipper", "user", userID, "err", err) + level.Warn(Logger).Log("msg", "failed to gather metrics from registry", "user", userID, "err", err) continue } } From 6e803b800cae6210ed6a27da113ce7bdfde53986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:52:08 +0100 Subject: [PATCH 22/27] Ignore result and error and make lint happy. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/bucket_store_metrics_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/querier/bucket_store_metrics_test.go b/pkg/querier/bucket_store_metrics_test.go index 69f50d2e7cd..cdcfa1ece2c 100644 --- a/pkg/querier/bucket_store_metrics_test.go +++ b/pkg/querier/bucket_store_metrics_test.go @@ -201,7 +201,7 @@ func benchmarkMetricsCollection(b *testing.B, users int) { b.ResetTimer() for i := 0; i < b.N; i++ { - mainReg.Gather() + _, _ = mainReg.Gather() } } From d5f9693925a7075e127cd1ed9ce9df8a7d3cdfe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:57:17 +0100 Subject: [PATCH 23/27] Added test for getMetricsWithLabelNames with no labels. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/util/metrics_helper_test.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go index 745ce94a1a0..027f613b3b3 100644 --- a/pkg/util/metrics_helper_test.go +++ b/pkg/util/metrics_helper_test.go @@ -57,6 +57,15 @@ func TestGetMetricsWithLabelNames(t *testing.T) { labelValues: []string{"12", "22"}, metrics: []*dto.Metric{m6}}, }, out) + + // no labels -- returns all metrics in single key. this isn't very efficient, and there are other functions + // (without labels) to handle this better, but it still works. + out2 := getMetricsWithLabelNames(&dto.MetricFamily{Metric: []*dto.Metric{m1, m2, m3, m4, m5, m6}}, nil) + require.Equal(t, map[string]metricsWithLabels{ + getLabelsString(nil): { + labelValues: []string{}, + metrics: []*dto.Metric{m1, m2, m3, m4, m5, m6}}, + }, out2) } func makeLabels(namesAndValues ...string) []*dto.LabelPair { From 020ae23d4173a9b0f8664b8c949e7a33da58a1ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 08:58:01 +0100 Subject: [PATCH 24/27] Comment about missing m1 in test output. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/util/metrics_helper_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go index 027f613b3b3..ce2ddd9a10b 100644 --- a/pkg/util/metrics_helper_test.go +++ b/pkg/util/metrics_helper_test.go @@ -46,6 +46,7 @@ func TestGetMetricsWithLabelNames(t *testing.T) { out := getMetricsWithLabelNames(&dto.MetricFamily{Metric: []*dto.Metric{m1, m2, m3, m4, m5, m6}}, labels) + // m1 is not returned at all, as it doesn't habe both required labels. 
require.Equal(t, map[string]metricsWithLabels{ getLabelsString([]string{"10", "20"}): { labelValues: []string{"10", "20"}, From 02d54c40ebad43c3cbc53db7ae59c7f11ea31fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 09:13:41 +0100 Subject: [PATCH 25/27] Fixed duplicate entry in CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7897f32beb..0823a44944e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,8 +33,7 @@ instructions below to upgrade your Postgres. * [ENHANCEMENT] Experimental TSDB: Open existing TSDB on startup to prevent ingester from becoming ready before it can accept writes. #1917 * `--experimental.tsdb.max-tsdb-opening-concurrency-on-startup` * [ENHANCEMENT] Experimental TSDB: Added `cortex_ingester_shipper_dir_syncs_total`, `cortex_ingester_shipper_dir_sync_failures_total`, `cortex_ingester_shipper_uploads_total` and `cortex_ingester_shipper_upload_failures_total` metrics from TSDB shipper component. #1983 -* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store (many metrics to list, but all have `cortex_querier_bucket_store_` prefix). #1996 -* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos in memory index cache (many metrics to list, but all have `cortex_querier_blocks_index_cache_` prefix). #1996 +* [ENHANCEMENT] Experimental TSDB: Querier now exports aggregate metrics from Thanos bucket store and in memory index cache (many metrics to list, but all have `cortex_querier_bucket_store_` or `cortex_querier_blocks_index_cache_` prefix). #1996 * [BUGFIX] Fixed unnecessary CAS operations done by the HA tracker when the jitter is enabled. 
#1861 * [BUGFIX] Fixed #1904 ingesters getting stuck in a LEAVING state after coming up from an ungraceful exit. #1921 * [BUGFIX] Reduce memory usage when ingester Push() errors. #1922 From 7741f310deaedc1d168e79d1def577bd04c92551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 09:15:31 +0100 Subject: [PATCH 26/27] Use single call. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/querier/block.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/querier/block.go b/pkg/querier/block.go index 55886e4a328..7246b6f4eaf 100644 --- a/pkg/querier/block.go +++ b/pkg/querier/block.go @@ -46,8 +46,7 @@ func NewBlockQuerier(cfg tsdb.Config, logLevel logging.Level, r prometheus.Regis b.us = us if r != nil { - r.MustRegister(b.syncTimes) - r.MustRegister(us.tsdbMetrics) + r.MustRegister(b.syncTimes, us.tsdbMetrics) } level.Info(util.Logger).Log("msg", "synchronizing TSDB blocks for all users") From a9a95e0a482b12b695986864baa6ef103f6af1ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 21 Jan 2020 09:21:04 +0100 Subject: [PATCH 27/27] Typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/util/metrics_helper_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go index ce2ddd9a10b..2b4b3f6ab81 100644 --- a/pkg/util/metrics_helper_test.go +++ b/pkg/util/metrics_helper_test.go @@ -46,7 +46,7 @@ func TestGetMetricsWithLabelNames(t *testing.T) { out := getMetricsWithLabelNames(&dto.MetricFamily{Metric: []*dto.Metric{m1, m2, m3, m4, m5, m6}}, labels) - // m1 is not returned at all, as it doesn't habe both required labels. + // m1 is not returned at all, as it doesn't have both required labels. 
require.Equal(t, map[string]metricsWithLabels{ getLabelsString([]string{"10", "20"}): { labelValues: []string{"10", "20"},