Skip to content

Commit fd31ae6

Browse files
haiyangz authored and smb49 committed
net: mana: Fix race of mana_hwc_post_rx_wqe and new hwc response
BugLink: https://bugs.launchpad.net/bugs/2080594

commit 8af174e upstream.

The mana_hwc_rx_event_handler() / mana_hwc_handle_resp() calls complete(&ctx->comp_event) before posting the wqe back. It's possible that other callers, like mana_create_txq(), start the next round of mana_hwc_send_request() before the posting of wqe. And if the HW is fast enough to respond, it can hit no_wqe error on the HW channel, then the response message is lost. The mana driver may fail to create queues and open, because of waiting for the HW response and timed out.

Sample dmesg:
[ 528.610840] mana 39d4:00:02.0: HWC: Request timed out!
[ 528.614452] mana 39d4:00:02.0: Failed to send mana message: -110, 0x0
[ 528.618326] mana 39d4:00:02.0 enP14804s2: Failed to create WQ object: -110

To fix it, move posting of rx wqe before complete(&ctx->comp_event).

Cc: [email protected]
Fixes: ca9c54d ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Haiyang Zhang <[email protected]>
Reviewed-by: Long Li <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
Signed-off-by: Koichiro Den <[email protected]>
Signed-off-by: Stefan Bader <[email protected]>
1 parent 666c488 commit fd31ae6

File tree

1 file changed

+34
-28
lines changed

1 file changed

+34
-28
lines changed

drivers/net/ethernet/microsoft/mana/hw_channel.c

Lines changed: 34 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -51,16 +51,41 @@ static int mana_hwc_verify_resp_msg(const struct hwc_caller_ctx *caller_ctx,
5151
return 0;
5252
}
5353

54+
static int mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq,
55+
struct hwc_work_request *req)
56+
{
57+
struct device *dev = hwc_rxq->hwc->dev;
58+
struct gdma_sge *sge;
59+
int err;
60+
61+
sge = &req->sge;
62+
sge->address = (u64)req->buf_sge_addr;
63+
sge->mem_key = hwc_rxq->msg_buf->gpa_mkey;
64+
sge->size = req->buf_len;
65+
66+
memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request));
67+
req->wqe_req.sgl = sge;
68+
req->wqe_req.num_sge = 1;
69+
req->wqe_req.client_data_unit = 0;
70+
71+
err = mana_gd_post_and_ring(hwc_rxq->gdma_wq, &req->wqe_req, NULL);
72+
if (err)
73+
dev_err(dev, "Failed to post WQE on HWC RQ: %d\n", err);
74+
return err;
75+
}
76+
5477
static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len,
55-
const struct gdma_resp_hdr *resp_msg)
78+
struct hwc_work_request *rx_req)
5679
{
80+
const struct gdma_resp_hdr *resp_msg = rx_req->buf_va;
5781
struct hwc_caller_ctx *ctx;
5882
int err;
5983

6084
if (!test_bit(resp_msg->response.hwc_msg_id,
6185
hwc->inflight_msg_res.map)) {
6286
dev_err(hwc->dev, "hwc_rx: invalid msg_id = %u\n",
6387
resp_msg->response.hwc_msg_id);
88+
mana_hwc_post_rx_wqe(hwc->rxq, rx_req);
6489
return;
6590
}
6691

@@ -74,30 +99,13 @@ static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len,
7499
memcpy(ctx->output_buf, resp_msg, resp_len);
75100
out:
76101
ctx->error = err;
77-
complete(&ctx->comp_event);
78-
}
79-
80-
static int mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq,
81-
struct hwc_work_request *req)
82-
{
83-
struct device *dev = hwc_rxq->hwc->dev;
84-
struct gdma_sge *sge;
85-
int err;
86-
87-
sge = &req->sge;
88-
sge->address = (u64)req->buf_sge_addr;
89-
sge->mem_key = hwc_rxq->msg_buf->gpa_mkey;
90-
sge->size = req->buf_len;
91102

92-
memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request));
93-
req->wqe_req.sgl = sge;
94-
req->wqe_req.num_sge = 1;
95-
req->wqe_req.client_data_unit = 0;
103+
/* Must post rx wqe before complete(), otherwise the next rx may
104+
* hit no_wqe error.
105+
*/
106+
mana_hwc_post_rx_wqe(hwc->rxq, rx_req);
96107

97-
err = mana_gd_post_and_ring(hwc_rxq->gdma_wq, &req->wqe_req, NULL);
98-
if (err)
99-
dev_err(dev, "Failed to post WQE on HWC RQ: %d\n", err);
100-
return err;
108+
complete(&ctx->comp_event);
101109
}
102110

103111
static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
@@ -208,14 +216,12 @@ static void mana_hwc_rx_event_handler(void *ctx, u32 gdma_rxq_id,
208216
return;
209217
}
210218

211-
mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, resp);
219+
mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, rx_req);
212220

213-
/* Do no longer use 'resp', because the buffer is posted to the HW
214-
* in the below mana_hwc_post_rx_wqe().
221+
/* Can no longer use 'resp', because the buffer is posted to the HW
222+
* in mana_hwc_handle_resp() above.
215223
*/
216224
resp = NULL;
217-
218-
mana_hwc_post_rx_wqe(hwc_rxq, rx_req);
219225
}
220226

221227
static void mana_hwc_tx_event_handler(void *ctx, u32 gdma_txq_id,

0 commit comments

Comments
 (0)