Fix query frontend v2 failing to cancel requests (#5447)

yeya24 · web-flow · commit c0dbcb1c8971 · 2023-07-10T09:45:16.000-07:00
* fix query frontend v2 failing to cancel requests

Signed-off-by: Ben Ye <benye@amazon.com>

* update changelog

Signed-off-by: Ben Ye <benye@amazon.com>

* add metric

Signed-off-by: Ben Ye <benye@amazon.com>

---------

Signed-off-by: Ben Ye <benye@amazon.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -48,6 +48,7 @@
 * [BUGFIX] Distributor: Fix potential data corruption in cases of timeout between distributors and ingesters. #5422
 * [BUGFIX] Store Gateway: Fix bug in store gateway ring comparison logic. #5426
 * [BUGFIX] Ring: Fix bug in consistency of Get func in a scaling zone-aware ring. #5429
+* [BUGFIX] Query Frontend: Fix bug of failing to cancel downstream request context in query frontend v2 mode (query scheduler enabled). #5447
 
 ## 1.15.1 2023-04-26
 
diff --git a/pkg/frontend/v2/frontend.go b/pkg/frontend/v2/frontend.go
@@ -24,6 +24,7 @@ import (
 	"github.com/cortexproject/cortex/pkg/util/flagext"
 	"github.com/cortexproject/cortex/pkg/util/grpcclient"
 	"github.com/cortexproject/cortex/pkg/util/httpgrpcutil"
+	util_log "github.com/cortexproject/cortex/pkg/util/log"
 	"github.com/cortexproject/cortex/pkg/util/services"
 )
 
@@ -70,6 +71,9 @@ type Frontend struct {
 
 	schedulerWorkers *frontendSchedulerWorkers
 	requests         *requestsInProgress
+
+	// Counter for the number of cancellations that failed to be sent to the query scheduler.
+	cancelFailedQueries prometheus.Counter
 }
 
 type frontendRequest struct {
@@ -135,6 +139,11 @@ func NewFrontend(cfg Config, log log.Logger, reg prometheus.Registerer) (*Fronte
 		return float64(f.schedulerWorkers.getWorkersCount())
 	})
 
+	f.cancelFailedQueries = promauto.With(reg).NewCounter(prometheus.CounterOpts{
+		Name: "cortex_query_frontend_cancel_failed_queries_total",
+		Help: "Total number of queries that are failed to be canceled due to cancel channel full.",
+	})
+
 	f.Service = services.NewIdleService(f.starting, f.stopping)
 	return f, nil
 }
@@ -226,7 +235,9 @@ enqueueAgain:
 			case cancelCh <- freq.queryID:
 				// cancellation sent.
 			default:
-				// failed to cancel, ignore.
+				// failed to cancel, log it.
+				level.Warn(util_log.WithContext(ctx, f.log)).Log("msg", "failed to enqueue cancellation signal", "query_id", freq.queryID)
+				f.cancelFailedQueries.Inc()
 			}
 		}
 		return nil, ctx.Err()
diff --git a/pkg/frontend/v2/frontend_scheduler_worker.go b/pkg/frontend/v2/frontend_scheduler_worker.go
@@ -171,7 +171,7 @@ func newFrontendSchedulerWorker(conn *grpc.ClientConn, schedulerAddr string, fro
 		schedulerAddr: schedulerAddr,
 		frontendAddr:  frontendAddr,
 		requestCh:     requestCh,
-		cancelCh:      make(chan uint64),
+		cancelCh:      make(chan uint64, 1000), // Use buffered channel to make sure we can always enqueue to cancel request context.
 	}
 	w.ctx, w.cancel = context.WithCancel(context.Background())
 

Original file line number	Diff line number	Diff line change
`@@ -171,7 +171,7 @@ func newFrontendSchedulerWorker(conn *grpc.ClientConn, schedulerAddr string, fro`
`171`	`171`	`schedulerAddr: schedulerAddr,`
`172`	`172`	`frontendAddr: frontendAddr,`
`173`	`173`	`requestCh: requestCh,`
`174`		`- cancelCh: make(chan uint64),`
	`174`	`+ cancelCh: make(chan uint64, 1000), // Use buffered channel to make sure we can always enqueue to cancel request context.`
`175`	`175`	`}`
`176`	`176`	`w.ctx, w.cancel = context.WithCancel(context.Background())`
`177`	`177`