Skip to content

Commit 03d4318

Browse files
authored
feat: instrument the query layer to track rate-limited queries (#3894)
* feat: instrument the query layer to track rate-limited queries
  Signed-off-by: Jacob Lisi <[email protected]>
* chore: update changelog
  Signed-off-by: Jacob Lisi <[email protected]>
* fix: fix goimports linting error
  Signed-off-by: Jacob Lisi <[email protected]>
* fix per PR comments
  Signed-off-by: Jacob Lisi <[email protected]>
* rename discarded_queries --> discarded_requests
  Signed-off-by: Jacob Lisi <[email protected]>
* fix lint
  Signed-off-by: Jacob Lisi <[email protected]>
1 parent e02797a commit 03d4318

File tree

6 files changed

+39
-11
lines changed

6 files changed

+39
-11
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030
* `-alertmanager.alertmanager-client.tls-server-name`
3131
* `-alertmanager.alertmanager-client.tls-insecure-skip-verify`
3232
* [FEATURE] Compactor: added blocks storage per-tenant retention support. This is configured via `-compactor.retention-period`, and can be overridden on a per-tenant basis. #3879
33+
* [ENHANCEMENT] Queries: Instrument queries that were discarded due to the configured `max_outstanding_requests_per_tenant`. #3894
34+
* `cortex_query_frontend_discarded_requests_total`
35+
* `cortex_query_scheduler_discarded_requests_total`
3336
* [ENHANCEMENT] Ruler: Add TLS and explicit basic authentication configuration options for the HTTP client the ruler uses to communicate with the alertmanager. #3752
3437
* `-ruler.alertmanager-client.basic-auth-username`: Configure the basic authentication username used by the client. Takes precedence over a URL configured username.
3538
* `-ruler.alertmanager-client.basic-auth-password`: Configure the basic authentication password used by the client. Takes precedence over a URL configured password.

pkg/frontend/v1/frontend.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,10 @@ type Frontend struct {
5757
activeUsers *util.ActiveUsersCleanupService
5858

5959
// Metrics.
60-
queueLength *prometheus.GaugeVec
61-
numClients prometheus.GaugeFunc
62-
queueDuration prometheus.Histogram
60+
queueLength *prometheus.GaugeVec
61+
discardedRequests *prometheus.CounterVec
62+
numClients prometheus.GaugeFunc
63+
queueDuration prometheus.Histogram
6364
}
6465

6566
type request struct {
@@ -83,14 +84,18 @@ func New(cfg Config, limits Limits, log log.Logger, registerer prometheus.Regist
8384
Name: "cortex_query_frontend_queue_length",
8485
Help: "Number of queries in the queue.",
8586
}, []string{"user"}),
87+
discardedRequests: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
88+
Name: "cortex_query_frontend_discarded_requests_total",
89+
Help: "Total number of query requests discarded.",
90+
}, []string{"user"}),
8691
queueDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
8792
Name: "cortex_query_frontend_queue_duration_seconds",
8893
Help: "Time spend by requests queued.",
8994
Buckets: prometheus.DefBuckets,
9095
}),
9196
}
9297

93-
f.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, f.queueLength)
98+
f.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, f.queueLength, f.discardedRequests)
9499
f.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(f.cleanupInactiveUserMetrics)
95100

96101
f.numClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
@@ -114,6 +119,7 @@ func (f *Frontend) stopping(_ error) error {
114119

115120
func (f *Frontend) cleanupInactiveUserMetrics(user string) {
116121
f.queueLength.DeleteLabelValues(user)
122+
f.discardedRequests.DeleteLabelValues(user)
117123
}
118124

119125
// RoundTripGRPC round trips a proto (instead of a HTTP request).

pkg/frontend/v1/frontend_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,11 @@ func TestFrontendCheckReady(t *testing.T) {
127127
} {
128128
t.Run(tt.name, func(t *testing.T) {
129129
f := &Frontend{
130-
log: log.NewNopLogger(),
131-
requestQueue: queue.NewRequestQueue(5, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"})),
130+
log: log.NewNopLogger(),
131+
requestQueue: queue.NewRequestQueue(5,
132+
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
133+
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}),
134+
),
132135
}
133136
for i := 0; i < tt.connectedClients; i++ {
134137
f.requestQueue.RegisterQuerierConnection("test")

pkg/scheduler/queue/queue.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,16 @@ type RequestQueue struct {
4747
queues *queues
4848
stopped bool
4949

50-
queueLength *prometheus.GaugeVec // Per user.
50+
queueLength *prometheus.GaugeVec // Per user.
51+
discardedRequests *prometheus.CounterVec // Per user.
5152
}
5253

53-
func NewRequestQueue(maxOutstandingPerTenant int, queueLength *prometheus.GaugeVec) *RequestQueue {
54+
func NewRequestQueue(maxOutstandingPerTenant int, queueLength *prometheus.GaugeVec, discardedRequests *prometheus.CounterVec) *RequestQueue {
5455
q := &RequestQueue{
5556
queues: newUserQueues(maxOutstandingPerTenant),
5657
connectedQuerierWorkers: atomic.NewInt32(0),
5758
queueLength: queueLength,
59+
discardedRequests: discardedRequests,
5860
}
5961

6062
q.cond = sync.NewCond(&q.mtx)
@@ -91,6 +93,7 @@ func (q *RequestQueue) EnqueueRequest(userID string, req Request, maxQueriers in
9193
}
9294
return nil
9395
default:
96+
q.discardedRequests.WithLabelValues(userID).Inc()
9497
return ErrTooManyRequests
9598
}
9699
}

pkg/scheduler/queue/queue_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@ func BenchmarkGetNextRequest(b *testing.B) {
1717
queues := make([]*RequestQueue, 0, b.N)
1818

1919
for n := 0; n < b.N; n++ {
20-
queue := NewRequestQueue(maxOutstandingPerTenant, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}))
20+
queue := NewRequestQueue(maxOutstandingPerTenant,
21+
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
22+
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}),
23+
)
2124
queues = append(queues, queue)
2225

2326
for ix := 0; ix < queriers; ix++ {
@@ -71,7 +74,10 @@ func BenchmarkQueueRequest(b *testing.B) {
7174
requests := make([]string, 0, numTenants)
7275

7376
for n := 0; n < b.N; n++ {
74-
q := NewRequestQueue(maxOutstandingPerTenant, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}))
77+
q := NewRequestQueue(maxOutstandingPerTenant,
78+
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
79+
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}),
80+
)
7581

7682
for ix := 0; ix < queriers; ix++ {
7783
q.RegisterQuerierConnection(fmt.Sprintf("querier-%d", ix))

pkg/scheduler/scheduler.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ type Scheduler struct {
5555

5656
// Metrics.
5757
queueLength *prometheus.GaugeVec
58+
discardedRequests *prometheus.CounterVec
5859
connectedQuerierClients prometheus.GaugeFunc
5960
connectedFrontendClients prometheus.GaugeFunc
6061
queueDuration prometheus.Histogram
@@ -100,7 +101,12 @@ func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer promethe
100101
Name: "cortex_query_scheduler_queue_length",
101102
Help: "Number of queries in the queue.",
102103
}, []string{"user"})
103-
s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, s.queueLength)
104+
105+
s.discardedRequests = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
106+
Name: "cortex_query_scheduler_discarded_requests_total",
107+
Help: "Total number of query requests discarded.",
108+
}, []string{"user"})
109+
s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, s.queueLength, s.discardedRequests)
104110

105111
s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
106112
Name: "cortex_query_scheduler_queue_duration_seconds",
@@ -471,6 +477,7 @@ func (s *Scheduler) stopping(_ error) error {
471477

472478
func (s *Scheduler) cleanupMetricsForInactiveUser(user string) {
473479
s.queueLength.DeleteLabelValues(user)
480+
s.discardedRequests.DeleteLabelValues(user)
474481
}
475482

476483
func (s *Scheduler) getConnectedFrontendClientsMetric() float64 {

0 commit comments

Comments
 (0)