@@ -9,6 +9,14 @@ import (
9
9
dto "github.com/prometheus/client_model/go"
10
10
)
11
11
12
// Names and help strings of the per-user series counters. They are declared
// once here because both the counter vectors in ingesterMetrics and the
// descriptors in tsdbMetrics are built from the same values.
const (
	// Series created, labelled by user.
	memSeriesCreatedTotalName = "cortex_ingester_memory_series_created_total"
	memSeriesCreatedTotalHelp = "The total number of series that were created per user."

	// Series removed, labelled by user.
	memSeriesRemovedTotalName = "cortex_ingester_memory_series_removed_total"
	memSeriesRemovedTotalHelp = "The total number of series that were removed per user."
)
19
+
12
20
type ingesterMetrics struct {
13
21
flushQueueLength prometheus.Gauge
14
22
ingestedSamples prometheus.Counter
@@ -24,7 +32,7 @@ type ingesterMetrics struct {
24
32
walReplayDuration prometheus.Gauge
25
33
}
26
34
27
- func newIngesterMetrics (r prometheus.Registerer ) * ingesterMetrics {
35
+ func newIngesterMetrics (r prometheus.Registerer , registerMetricsConflictingWithTSDB bool ) * ingesterMetrics {
28
36
m := & ingesterMetrics {
29
37
flushQueueLength : prometheus .NewGauge (prometheus.GaugeOpts {
30
38
Name : "cortex_ingester_flush_queue_length" ,
@@ -69,12 +77,12 @@ func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics {
69
77
Help : "The current number of users in memory." ,
70
78
}),
71
79
memSeriesCreatedTotal : prometheus .NewCounterVec (prometheus.CounterOpts {
72
- Name : "cortex_ingester_memory_series_created_total" ,
73
- Help : "The total number of series that were created per user." ,
80
+ Name : memSeriesCreatedTotalName ,
81
+ Help : memSeriesCreatedTotalHelp ,
74
82
}, []string {"user" }),
75
83
memSeriesRemovedTotal : prometheus .NewCounterVec (prometheus.CounterOpts {
76
- Name : "cortex_ingester_memory_series_removed_total" ,
77
- Help : "The total number of series that were removed per user." ,
84
+ Name : memSeriesRemovedTotalName ,
85
+ Help : memSeriesRemovedTotalHelp ,
78
86
}, []string {"user" }),
79
87
walReplayDuration : prometheus .NewGauge (prometheus.GaugeOpts {
80
88
Name : "cortex_ingester_wal_replay_duration_seconds" ,
@@ -93,29 +101,43 @@ func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics {
93
101
m .queriedChunks ,
94
102
m .memSeries ,
95
103
m .memUsers ,
96
- m .memSeriesCreatedTotal ,
97
- m .memSeriesRemovedTotal ,
98
104
m .walReplayDuration ,
99
105
)
106
+
107
+ if registerMetricsConflictingWithTSDB {
108
+ r .MustRegister (
109
+ m .memSeriesCreatedTotal ,
110
+ m .memSeriesRemovedTotal ,
111
+ )
112
+ }
100
113
}
101
114
102
115
return m
103
116
}
104
117
105
- // TSDB shipper metrics. We aggregate metrics from individual TSDB shippers into
106
- // a single set of counters, which are exposed as Cortex metrics.
107
- type shipperMetrics struct {
118
+ // TSDB metrics. Each tenant has its own registry, that TSDB code uses.
119
+ type tsdbMetrics struct {
120
+ // We aggregate metrics from individual TSDB registries into
121
+ // a single set of counters, which are exposed as Cortex metrics.
108
122
dirSyncs * prometheus.Desc // sum(thanos_shipper_dir_syncs_total)
109
123
dirSyncFailures * prometheus.Desc // sum(thanos_shipper_dir_sync_failures_total)
110
124
uploads * prometheus.Desc // sum(thanos_shipper_uploads_total)
111
125
uploadFailures * prometheus.Desc // sum(thanos_shipper_upload_failures_total)
112
126
127
+ // These two metrics replace metrics in ingesterMetrics, as we count them differently
128
+ memSeriesCreatedTotal * prometheus.Desc
129
+ memSeriesRemovedTotal * prometheus.Desc
130
+
131
+ // These maps drive the collection output. Key = original metric name to group.
132
+ sumCountersGlobally map [string ]* prometheus.Desc
133
+ sumCountersPerUser map [string ]* prometheus.Desc
134
+
113
135
regsMu sync.RWMutex // custom mutex for shipper registry, to avoid blocking main user state mutex on collection
114
- regs map [string ]* prometheus.Registry // One prometheus registry (used by shipper) per tenant
136
+ regs map [string ]* prometheus.Registry // One prometheus registry per tenant
115
137
}
116
138
117
- func newShipperMetrics (r prometheus.Registerer ) * shipperMetrics {
118
- m := & shipperMetrics {
139
+ func newTSDBMetrics (r prometheus.Registerer ) * tsdbMetrics {
140
+ m := & tsdbMetrics {
119
141
regs : make (map [string ]* prometheus.Registry ),
120
142
121
143
dirSyncs : prometheus .NewDesc (
@@ -134,6 +156,21 @@ func newShipperMetrics(r prometheus.Registerer) *shipperMetrics {
134
156
"cortex_ingester_shipper_upload_failures_total" ,
135
157
"TSDB: Total number of failed object uploads" ,
136
158
nil , nil ),
159
+
160
+ memSeriesCreatedTotal : prometheus .NewDesc (memSeriesCreatedTotalName , memSeriesCreatedTotalHelp , []string {"user" }, nil ),
161
+ memSeriesRemovedTotal : prometheus .NewDesc (memSeriesRemovedTotalName , memSeriesRemovedTotalHelp , []string {"user" }, nil ),
162
+ }
163
+
164
+ m .sumCountersGlobally = map [string ]* prometheus.Desc {
165
+ "thanos_shipper_dir_syncs_total" : m .dirSyncs ,
166
+ "thanos_shipper_dir_sync_failures_total" : m .dirSyncFailures ,
167
+ "thanos_shipper_uploads_total" : m .uploads ,
168
+ "thanos_shipper_upload_failures_total" : m .uploadFailures ,
169
+ }
170
+
171
+ m .sumCountersPerUser = map [string ]* prometheus.Desc {
172
+ "prometheus_tsdb_head_series_created_total" : m .memSeriesCreatedTotal ,
173
+ "prometheus_tsdb_head_series_removed_total" : m .memSeriesRemovedTotal ,
137
174
}
138
175
139
176
if r != nil {
@@ -142,51 +179,58 @@ func newShipperMetrics(r prometheus.Registerer) *shipperMetrics {
142
179
return m
143
180
}
144
181
145
- func (sm * shipperMetrics ) Describe (out chan <- * prometheus.Desc ) {
182
+ func (sm * tsdbMetrics ) Describe (out chan <- * prometheus.Desc ) {
146
183
out <- sm .dirSyncs
147
184
out <- sm .dirSyncFailures
148
185
out <- sm .uploads
149
186
out <- sm .uploadFailures
187
+ out <- sm .memSeriesCreatedTotal
188
+ out <- sm .memSeriesRemovedTotal
150
189
}
151
190
152
- func (sm * shipperMetrics ) Collect (out chan <- prometheus.Metric ) {
153
- gathered := make (map [string ][]* dto.MetricFamily )
191
+ func (sm * tsdbMetrics ) Collect (out chan <- prometheus.Metric ) {
192
+ regs := sm .registries ()
193
+ data := gatheredMetricsPerUser {}
154
194
155
- regs := sm .shipperRegistries ()
156
195
for userID , r := range regs {
157
196
m , err := r .Gather ()
158
197
if err != nil {
159
198
level .Warn (util .Logger ).Log ("msg" , "failed to gather metrics from TSDB shipper" , "user" , userID , "err" , err )
160
199
continue
161
200
}
162
201
163
- addToGatheredMap ( gathered , m )
202
+ data . addGatheredDataForUser ( userID , m )
164
203
}
165
204
166
205
// OK, we have it all. Let's build results.
167
- out <- prometheus .MustNewConstMetric (sm .dirSyncs , prometheus .CounterValue , sumCounters (gathered ["thanos_shipper_dir_syncs_total" ]))
168
- out <- prometheus .MustNewConstMetric (sm .dirSyncFailures , prometheus .CounterValue , sumCounters (gathered ["thanos_shipper_dir_sync_failures_total" ]))
169
- out <- prometheus .MustNewConstMetric (sm .uploads , prometheus .CounterValue , sumCounters (gathered ["thanos_shipper_uploads_total" ]))
170
- out <- prometheus .MustNewConstMetric (sm .uploadFailures , prometheus .CounterValue , sumCounters (gathered ["thanos_shipper_upload_failures_total" ]))
206
+ for metric , desc := range sm .sumCountersGlobally {
207
+ out <- prometheus .MustNewConstMetric (desc , prometheus .CounterValue , data .sumCountersAcrossAllUsers (metric ))
208
+ }
209
+
210
+ for metric , desc := range sm .sumCountersPerUser {
211
+ userValues := data .sumCountersPerUser (metric )
212
+ for user , val := range userValues {
213
+ out <- prometheus .MustNewConstMetric (desc , prometheus .CounterValue , val , user )
214
+ }
215
+ }
171
216
}
172
217
173
- func (sm * shipperMetrics ) shipperRegistries () []* prometheus.Registry {
218
+ // make a copy of the map, so that metrics can be gathered while the new registry is being added.
219
+ func (sm * tsdbMetrics ) registries () map [string ]* prometheus.Registry {
174
220
sm .regsMu .RLock ()
175
221
defer sm .regsMu .RUnlock ()
176
222
177
- regs := make ([ ]* prometheus.Registry , 0 , len (sm .regs ))
178
- for _ , r := range sm .regs {
179
- regs = append ( regs , r )
223
+ regs := make (map [ string ]* prometheus.Registry , len (sm .regs ))
224
+ for u , r := range sm .regs {
225
+ regs [ u ] = r
180
226
}
181
227
return regs
182
228
}
183
229
184
- func (sm * shipperMetrics ) newRegistryForUser (userID string ) prometheus.Registerer {
185
- reg := prometheus .NewRegistry ()
230
+ func (sm * tsdbMetrics ) setRegistryForUser (userID string , registry * prometheus.Registry ) {
186
231
sm .regsMu .Lock ()
187
- sm .regs [userID ] = reg
232
+ sm .regs [userID ] = registry
188
233
sm .regsMu .Unlock ()
189
- return reg
190
234
}
191
235
192
236
func sumCounters (mfs []* dto.MetricFamily ) float64 {
@@ -207,11 +251,37 @@ func sumCounters(mfs []*dto.MetricFamily) float64 {
207
251
return result
208
252
}
209
253
210
- func addToGatheredMap (all map [string ][]* dto.MetricFamily , mfs []* dto.MetricFamily ) {
211
- for _ , m := range mfs {
254
+ // first key = userID, second key = metric name. Value = slice of gathered values with the same metric name.
255
+ type gatheredMetricsPerUser map [string ]map [string ][]* dto.MetricFamily
256
+
257
+ func (d gatheredMetricsPerUser ) addGatheredDataForUser (userID string , metrics []* dto.MetricFamily ) {
258
+ // first, create new map which maps metric names to a slice of MetricFamily instances.
259
+ // That makes it easier to do searches later.
260
+ perMetricName := map [string ][]* dto.MetricFamily {}
261
+
262
+ for _ , m := range metrics {
212
263
if m .Name == nil {
213
264
continue
214
265
}
215
- all [* m .Name ] = append (all [* m .Name ], m )
266
+ perMetricName [* m .Name ] = append (perMetricName [* m .Name ], m )
267
+ }
268
+
269
+ d [userID ] = perMetricName
270
+ }
271
+
272
+ func (d gatheredMetricsPerUser ) sumCountersAcrossAllUsers (counter string ) float64 {
273
+ result := float64 (0 )
274
+ for _ , perMetric := range d {
275
+ result += sumCounters (perMetric [counter ])
216
276
}
277
+ return result
278
+ }
279
+
280
+ func (d gatheredMetricsPerUser ) sumCountersPerUser (counter string ) map [string ]float64 {
281
+ result := map [string ]float64 {}
282
+ for user , perMetric := range d {
283
+ v := sumCounters (perMetric [counter ])
284
+ result [user ] = v
285
+ }
286
+ return result
217
287
}
0 commit comments