Skip to content

Commit c0dbcb1

Browse files
authored
Fix query frontend v2 failed to cancel request (#5447)
* fix query frontend v2 failed to cancel request Signed-off-by: Ben Ye <[email protected]> * update changelog Signed-off-by: Ben Ye <[email protected]> * add metric Signed-off-by: Ben Ye <[email protected]> --------- Signed-off-by: Ben Ye <[email protected]>
1 parent 7b51a48 commit c0dbcb1

File tree

3 files changed

+14
-2
lines changed

3 files changed

+14
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
* [BUGFIX] Distributor: Fix potential data corruption in cases of timeout between distributors and ingesters. #5422
4949
* [BUGFIX] Store Gateway: Fix bug in store gateway ring comparison logic. #5426
5050
* [BUGFIX] Ring: Fix bug in consistency of Get func in a scaling zone-aware ring. #5429
51+
* [BUGFIX] Query Frontend: Fix failure to cancel the downstream request context in query frontend v2 mode (query scheduler enabled). #5447
5152

5253
## 1.15.1 2023-04-26
5354

pkg/frontend/v2/frontend.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/cortexproject/cortex/pkg/util/flagext"
2525
"github.com/cortexproject/cortex/pkg/util/grpcclient"
2626
"github.com/cortexproject/cortex/pkg/util/httpgrpcutil"
27+
util_log "github.com/cortexproject/cortex/pkg/util/log"
2728
"github.com/cortexproject/cortex/pkg/util/services"
2829
)
2930

@@ -70,6 +71,9 @@ type Frontend struct {
7071

7172
schedulerWorkers *frontendSchedulerWorkers
7273
requests *requestsInProgress
74+
75+
// Counts cancellation signals that could not be sent to the query scheduler.
76+
cancelFailedQueries prometheus.Counter
7377
}
7478

7579
type frontendRequest struct {
@@ -135,6 +139,11 @@ func NewFrontend(cfg Config, log log.Logger, reg prometheus.Registerer) (*Fronte
135139
return float64(f.schedulerWorkers.getWorkersCount())
136140
})
137141

142+
f.cancelFailedQueries = promauto.With(reg).NewCounter(prometheus.CounterOpts{
143+
Name: "cortex_query_frontend_cancel_failed_queries_total",
144+
Help: "Total number of queries that are failed to be canceled due to cancel channel full.",
145+
})
146+
138147
f.Service = services.NewIdleService(f.starting, f.stopping)
139148
return f, nil
140149
}
@@ -226,7 +235,9 @@ enqueueAgain:
226235
case cancelCh <- freq.queryID:
227236
// cancellation sent.
228237
default:
229-
// failed to cancel, ignore.
238+
// failed to cancel, log it.
239+
level.Warn(util_log.WithContext(ctx, f.log)).Log("msg", "failed to enqueue cancellation signal", "query_id", freq.queryID)
240+
f.cancelFailedQueries.Inc()
230241
}
231242
}
232243
return nil, ctx.Err()

pkg/frontend/v2/frontend_scheduler_worker.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ func newFrontendSchedulerWorker(conn *grpc.ClientConn, schedulerAddr string, fro
171171
schedulerAddr: schedulerAddr,
172172
frontendAddr: frontendAddr,
173173
requestCh: requestCh,
174-
cancelCh: make(chan uint64),
174+
cancelCh: make(chan uint64, 1000), // Use buffered channel to make sure we can always enqueue to cancel request context.
175175
}
176176
w.ctx, w.cancel = context.WithCancel(context.Background())
177177

0 commit comments

Comments
 (0)