
Commit 661f385

BernardMetzler authored and jgunthorpe committed
RDMA/siw: Fix handling of zero-sized Read and Receive Queues.
During connection setup, the application may choose to zero-size inbound and outbound READ queues, as well as the Receive queue. This patch fixes the handling of zero-sized queues, but does not prevent them.

Kamal Heib says in an initial error report:

  When running the blktests over siw the following shift-out-of-bounds is
  reported, this is happening because the passed IRD or ORD from the ulp
  could be zero which will lead to unexpected behavior when calling
  roundup_pow_of_two(), fix that by blocking zero values of ORD or IRD.

  UBSAN: shift-out-of-bounds in ./include/linux/log2.h:57:13
  shift exponent 64 is too large for 64-bit type 'long unsigned int'
  CPU: 20 PID: 3957 Comm: kworker/u64:13 Tainted: G S 5.10.0-rc6 #2
  Hardware name: Dell Inc. PowerEdge R630/02C2CP, BIOS 2.1.5 04/11/2016
  Workqueue: iw_cm_wq cm_work_handler [iw_cm]
  Call Trace:
   dump_stack+0x99/0xcb
   ubsan_epilogue+0x5/0x40
   __ubsan_handle_shift_out_of_bounds.cold.11+0xb4/0xf3
   ? down_write+0x183/0x3d0
   siw_qp_modify.cold.8+0x2d/0x32 [siw]
   ? __local_bh_enable_ip+0xa5/0xf0
   siw_accept+0x906/0x1b60 [siw]
   ? xa_load+0x147/0x1f0
   ? siw_connect+0x17a0/0x17a0 [siw]
   ? lock_downgrade+0x700/0x700
   ? siw_get_base_qp+0x1c2/0x340 [siw]
   ? _raw_spin_unlock_irqrestore+0x39/0x40
   iw_cm_accept+0x1f4/0x430 [iw_cm]
   rdma_accept+0x3fa/0xb10 [rdma_cm]
   ? check_flush_dependency+0x410/0x410
   ? cma_rep_recv+0x570/0x570 [rdma_cm]
   nvmet_rdma_queue_connect+0x1a62/0x2680 [nvmet_rdma]
   ? nvmet_rdma_alloc_cmds+0xce0/0xce0 [nvmet_rdma]
   ? lock_release+0x56e/0xcc0
   ? lock_downgrade+0x700/0x700
   ? lock_downgrade+0x700/0x700
   ? __xa_alloc_cyclic+0xef/0x350
   ? __xa_alloc+0x2d0/0x2d0
   ? rdma_restrack_add+0xbe/0x2c0 [ib_core]
   ? __ww_mutex_die+0x190/0x190
   cma_cm_event_handler+0xf2/0x500 [rdma_cm]
   iw_conn_req_handler+0x910/0xcb0 [rdma_cm]
   ? _raw_spin_unlock_irqrestore+0x39/0x40
   ? trace_hardirqs_on+0x1c/0x150
   ? cma_ib_handler+0x8a0/0x8a0 [rdma_cm]
   ? __kasan_kmalloc.constprop.7+0xc1/0xd0
   cm_work_handler+0x121c/0x17a0 [iw_cm]
   ? iw_cm_reject+0x190/0x190 [iw_cm]
   ? trace_hardirqs_on+0x1c/0x150
   process_one_work+0x8fb/0x16c0
   ? pwq_dec_nr_in_flight+0x320/0x320
   worker_thread+0x87/0xb40
   ? __kthread_parkme+0xd1/0x1a0
   ? process_one_work+0x16c0/0x16c0
   kthread+0x35f/0x430
   ? kthread_mod_delayed_work+0x180/0x180
   ret_from_fork+0x22/0x30

Fixes: a531975 ("rdma/siw: main include file")
Fixes: f29dd55 ("rdma/siw: queue pair methods")
Fixes: 8b6a361 ("rdma/siw: receive path")
Fixes: b9be6f1 ("rdma/siw: transmit path")
Fixes: 303ae1c ("rdma/siw: application interface")
Link: https://lore.kernel.org/r/[email protected]
Reported-by: Kamal Heib <[email protected]>
Reported-by: Yi Zhang <[email protected]>
Reported-by: kernel test robot <[email protected]>
Signed-off-by: Bernard Metzler <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
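For context, a minimal user-space sketch of the arithmetic behind the UBSAN splat (illustrative only, not part of the patch; fls_long() and roundup_pow_of_two_sketch() below are simplified stand-ins for the kernel helpers in include/linux/log2.h). The non-constant kernel path is roughly 1UL << fls_long(n - 1); with n == 0 the argument wraps to ULONG_MAX, the shift count becomes 64 on a 64-bit machine, and that is the reported shift-out-of-bounds. The fix therefore skips rounding and allocation entirely when a queue size of zero was negotiated.

    #include <stdio.h>

    static unsigned long fls_long(unsigned long x)
    {
            return x ? 64 - __builtin_clzl(x) : 0;  /* highest set bit, 1-based */
    }

    static unsigned long roundup_pow_of_two_sketch(unsigned long n)
    {
            return 1UL << fls_long(n - 1);          /* shift count is 64 when n == 0 */
    }

    int main(void)
    {
            unsigned long ird = 0;                  /* zero-sized inbound READ queue */

            if (ird)                                /* guard first, as the patch does */
                    printf("IRD rounded to %lu\n", roundup_pow_of_two_sketch(ird));
            else
                    printf("IRD is 0: skip rounding and allocation\n");
            return 0;
    }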
1 parent aaf1226 commit 661f385


5 files changed, +177 -146 lines changed


drivers/infiniband/sw/siw/siw.h

Lines changed: 1 addition & 1 deletion
@@ -653,7 +653,7 @@ static inline struct siw_sqe *orq_get_free(struct siw_qp *qp)
 {
         struct siw_sqe *orq_e = orq_get_tail(qp);
 
-        if (orq_e && READ_ONCE(orq_e->flags) == 0)
+        if (READ_ONCE(orq_e->flags) == 0)
                 return orq_e;
 
         return NULL;

drivers/infiniband/sw/siw/siw_qp.c

Lines changed: 142 additions & 129 deletions
@@ -199,26 +199,26 @@ void siw_qp_llp_write_space(struct sock *sk)
 
 static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
 {
-        irq_size = roundup_pow_of_two(irq_size);
-        orq_size = roundup_pow_of_two(orq_size);
-
-        qp->attrs.irq_size = irq_size;
-        qp->attrs.orq_size = orq_size;
-
-        qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
-        if (!qp->irq) {
-                siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
-                qp->attrs.irq_size = 0;
-                return -ENOMEM;
+        if (irq_size) {
+                irq_size = roundup_pow_of_two(irq_size);
+                qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
+                if (!qp->irq) {
+                        qp->attrs.irq_size = 0;
+                        return -ENOMEM;
+                }
         }
-        qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
-        if (!qp->orq) {
-                siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
-                qp->attrs.orq_size = 0;
-                qp->attrs.irq_size = 0;
-                vfree(qp->irq);
-                return -ENOMEM;
+        if (orq_size) {
+                orq_size = roundup_pow_of_two(orq_size);
+                qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
+                if (!qp->orq) {
+                        qp->attrs.orq_size = 0;
+                        qp->attrs.irq_size = 0;
+                        vfree(qp->irq);
+                        return -ENOMEM;
+                }
         }
+        qp->attrs.irq_size = irq_size;
+        qp->attrs.orq_size = orq_size;
         siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
         return 0;
 }
@@ -288,13 +288,14 @@ int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
         if (ctrl & MPA_V2_RDMA_WRITE_RTR)
                 wqe->sqe.opcode = SIW_OP_WRITE;
         else if (ctrl & MPA_V2_RDMA_READ_RTR) {
-                struct siw_sqe *rreq;
+                struct siw_sqe *rreq = NULL;
 
                 wqe->sqe.opcode = SIW_OP_READ;
 
                 spin_lock(&qp->orq_lock);
 
-                rreq = orq_get_free(qp);
+                if (qp->attrs.orq_size)
+                        rreq = orq_get_free(qp);
                 if (rreq) {
                         siw_read_to_orq(rreq, &wqe->sqe);
                         qp->orq_put++;
@@ -877,135 +878,88 @@ void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
         rreq->num_sge = 1;
 }
 
-/*
- * Must be called with SQ locked.
- * To avoid complete SQ starvation by constant inbound READ requests,
- * the active IRQ will not be served after qp->irq_burst, if the
- * SQ has pending work.
- */
-int siw_activate_tx(struct siw_qp *qp)
+static int siw_activate_tx_from_sq(struct siw_qp *qp)
 {
-        struct siw_sqe *irqe, *sqe;
+        struct siw_sqe *sqe;
         struct siw_wqe *wqe = tx_wqe(qp);
         int rv = 1;
 
-        irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
-
-        if (irqe->flags & SIW_WQE_VALID) {
-                sqe = sq_get_next(qp);
-
-                /*
-                 * Avoid local WQE processing starvation in case
-                 * of constant inbound READ request stream
-                 */
-                if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
-                        qp->irq_burst = 0;
-                        goto skip_irq;
-                }
-                memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
-                wqe->wr_status = SIW_WR_QUEUED;
-
-                /* start READ RESPONSE */
-                wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
-                wqe->sqe.flags = 0;
-                if (irqe->num_sge) {
-                        wqe->sqe.num_sge = 1;
-                        wqe->sqe.sge[0].length = irqe->sge[0].length;
-                        wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
-                        wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
-                } else {
-                        wqe->sqe.num_sge = 0;
-                }
-
-                /* Retain original RREQ's message sequence number for
-                 * potential error reporting cases.
-                 */
-                wqe->sqe.sge[1].length = irqe->sge[1].length;
-
-                wqe->sqe.rkey = irqe->rkey;
-                wqe->sqe.raddr = irqe->raddr;
+        sqe = sq_get_next(qp);
+        if (!sqe)
+                return 0;
 
-                wqe->processed = 0;
-                qp->irq_get++;
+        memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+        wqe->wr_status = SIW_WR_QUEUED;
 
-                /* mark current IRQ entry free */
-                smp_store_mb(irqe->flags, 0);
+        /* First copy SQE to kernel private memory */
+        memcpy(&wqe->sqe, sqe, sizeof(*sqe));
 
+        if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
+                rv = -EINVAL;
                 goto out;
         }
-        sqe = sq_get_next(qp);
-        if (sqe) {
-skip_irq:
-                memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
-                wqe->wr_status = SIW_WR_QUEUED;
-
-                /* First copy SQE to kernel private memory */
-                memcpy(&wqe->sqe, sqe, sizeof(*sqe));
-
-                if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
+        if (wqe->sqe.flags & SIW_WQE_INLINE) {
+                if (wqe->sqe.opcode != SIW_OP_SEND &&
+                    wqe->sqe.opcode != SIW_OP_WRITE) {
                         rv = -EINVAL;
                         goto out;
                 }
-                if (wqe->sqe.flags & SIW_WQE_INLINE) {
-                        if (wqe->sqe.opcode != SIW_OP_SEND &&
-                            wqe->sqe.opcode != SIW_OP_WRITE) {
-                                rv = -EINVAL;
-                                goto out;
-                        }
-                        if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
-                                rv = -EINVAL;
-                                goto out;
-                        }
-                        wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1];
-                        wqe->sqe.sge[0].lkey = 0;
-                        wqe->sqe.num_sge = 1;
+                if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
+                        rv = -EINVAL;
+                        goto out;
                 }
-                if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
-                        /* A READ cannot be fenced */
-                        if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
-                                     wqe->sqe.opcode ==
-                                             SIW_OP_READ_LOCAL_INV)) {
-                                siw_dbg_qp(qp, "cannot fence read\n");
-                                rv = -EINVAL;
-                                goto out;
-                        }
-                        spin_lock(&qp->orq_lock);
+                wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1];
+                wqe->sqe.sge[0].lkey = 0;
+                wqe->sqe.num_sge = 1;
+        }
+        if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
+                /* A READ cannot be fenced */
+                if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
+                             wqe->sqe.opcode ==
+                                     SIW_OP_READ_LOCAL_INV)) {
+                        siw_dbg_qp(qp, "cannot fence read\n");
+                        rv = -EINVAL;
+                        goto out;
+                }
+                spin_lock(&qp->orq_lock);
 
-                        if (!siw_orq_empty(qp)) {
-                                qp->tx_ctx.orq_fence = 1;
-                                rv = 0;
-                        }
-                        spin_unlock(&qp->orq_lock);
+                if (qp->attrs.orq_size && !siw_orq_empty(qp)) {
+                        qp->tx_ctx.orq_fence = 1;
+                        rv = 0;
+                }
+                spin_unlock(&qp->orq_lock);
 
-                } else if (wqe->sqe.opcode == SIW_OP_READ ||
-                           wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
-                        struct siw_sqe *rreq;
+        } else if (wqe->sqe.opcode == SIW_OP_READ ||
+                   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+                struct siw_sqe *rreq;
 
-                        wqe->sqe.num_sge = 1;
+                if (unlikely(!qp->attrs.orq_size)) {
+                        /* We negotiated not to send READ req's */
+                        rv = -EINVAL;
+                        goto out;
+                }
+                wqe->sqe.num_sge = 1;
 
-                        spin_lock(&qp->orq_lock);
+                spin_lock(&qp->orq_lock);
 
-                        rreq = orq_get_free(qp);
-                        if (rreq) {
-                                /*
-                                 * Make an immediate copy in ORQ to be ready
-                                 * to process loopback READ reply
-                                 */
-                                siw_read_to_orq(rreq, &wqe->sqe);
-                                qp->orq_put++;
-                        } else {
-                                qp->tx_ctx.orq_fence = 1;
-                                rv = 0;
-                        }
-                        spin_unlock(&qp->orq_lock);
+                rreq = orq_get_free(qp);
+                if (rreq) {
+                        /*
+                         * Make an immediate copy in ORQ to be ready
+                         * to process loopback READ reply
+                         */
+                        siw_read_to_orq(rreq, &wqe->sqe);
+                        qp->orq_put++;
+                } else {
+                        qp->tx_ctx.orq_fence = 1;
+                        rv = 0;
                 }
-
-                /* Clear SQE, can be re-used by application */
-                smp_store_mb(sqe->flags, 0);
-                qp->sq_get++;
-        } else {
-                rv = 0;
+                spin_unlock(&qp->orq_lock);
         }
+
+        /* Clear SQE, can be re-used by application */
+        smp_store_mb(sqe->flags, 0);
+        qp->sq_get++;
 out:
         if (unlikely(rv < 0)) {
                 siw_dbg_qp(qp, "error %d\n", rv);
@@ -1014,6 +968,65 @@ int siw_activate_tx(struct siw_qp *qp)
         return rv;
 }
 
+/*
+ * Must be called with SQ locked.
+ * To avoid complete SQ starvation by constant inbound READ requests,
+ * the active IRQ will not be served after qp->irq_burst, if the
+ * SQ has pending work.
+ */
+int siw_activate_tx(struct siw_qp *qp)
+{
+        struct siw_sqe *irqe;
+        struct siw_wqe *wqe = tx_wqe(qp);
+
+        if (!qp->attrs.irq_size)
+                return siw_activate_tx_from_sq(qp);
+
+        irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
+
+        if (!(irqe->flags & SIW_WQE_VALID))
+                return siw_activate_tx_from_sq(qp);
+
+        /*
+         * Avoid local WQE processing starvation in case
+         * of constant inbound READ request stream
+         */
+        if (sq_get_next(qp) && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
+                qp->irq_burst = 0;
+                return siw_activate_tx_from_sq(qp);
+        }
+        memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+        wqe->wr_status = SIW_WR_QUEUED;
+
+        /* start READ RESPONSE */
+        wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
+        wqe->sqe.flags = 0;
+        if (irqe->num_sge) {
+                wqe->sqe.num_sge = 1;
+                wqe->sqe.sge[0].length = irqe->sge[0].length;
+                wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
+                wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
+        } else {
+                wqe->sqe.num_sge = 0;
+        }
+
+        /* Retain original RREQ's message sequence number for
+         * potential error reporting cases.
+         */
+        wqe->sqe.sge[1].length = irqe->sge[1].length;
+
+        wqe->sqe.rkey = irqe->rkey;
+        wqe->sqe.raddr = irqe->raddr;
+
+        wqe->processed = 0;
+        qp->irq_get++;
+
+        /* mark current IRQ entry free */
+        smp_store_mb(irqe->flags, 0);
+
+        return 1;
+}
+
 /*
  * Check if current CQ state qualifies for calling CQ completion
  * handler. Must be called with CQ lock held.
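A side note on the restructuring above: siw_activate_tx() now checks qp->attrs.irq_size before indexing the IRQ ring, since qp->irq_get % qp->attrs.irq_size would divide by zero for a zero-sized IRQ. A stand-alone sketch of that guard pattern (illustrative only; the ring type and names are simplified stand-ins, not the siw structures):

    #include <stdio.h>

    struct ring {
            unsigned int size;      /* 0 means the queue was negotiated away */
            unsigned int get;
            int *slots;
    };

    /* Return the next slot, or NULL for a zero-sized ring. */
    static int *ring_next(struct ring *r)
    {
            if (!r->size)           /* guard first: r->get % 0 is undefined */
                    return NULL;
            return &r->slots[r->get++ % r->size];
    }

    int main(void)
    {
            int storage[4] = { 0 };
            struct ring sized = { .size = 4, .get = 0, .slots = storage };
            struct ring empty = { .size = 0, .get = 0, .slots = NULL };

            printf("sized ring slot: %p\n", (void *)ring_next(&sized));
            printf("zero-sized ring: %p\n", (void *)ring_next(&empty));
            return 0;
    }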

drivers/infiniband/sw/siw/siw_qp_rx.c

Lines changed: 18 additions & 8 deletions
@@ -680,6 +680,10 @@ static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
         }
         spin_lock_irqsave(&qp->sq_lock, flags);
 
+        if (unlikely(!qp->attrs.irq_size)) {
+                run_sq = 0;
+                goto error_irq;
+        }
         if (tx_work->wr_status == SIW_WR_IDLE) {
                 /*
                  * immediately schedule READ response w/o
@@ -712,8 +716,9 @@ static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
                 /* RRESP now valid as current TX wqe or placed into IRQ */
                 smp_store_mb(resp->flags, SIW_WQE_VALID);
         } else {
-                pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
-                        qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
+error_irq:
+                pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
+                        qp_id(qp), qp->attrs.irq_size);
 
                 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
                                    RDMAP_ETYPE_REMOTE_OPERATION,
@@ -740,6 +745,9 @@ static int siw_orqe_start_rx(struct siw_qp *qp)
         struct siw_sqe *orqe;
         struct siw_wqe *wqe = NULL;
 
+        if (unlikely(!qp->attrs.orq_size))
+                return -EPROTO;
+
         /* make sure ORQ indices are current */
         smp_mb();
 
@@ -796,8 +804,8 @@ int siw_proc_rresp(struct siw_qp *qp)
          */
         rv = siw_orqe_start_rx(qp);
         if (rv) {
-                pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
-                        qp_id(qp), qp->orq_get % qp->attrs.orq_size);
+                pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
+                        qp_id(qp), qp->attrs.orq_size);
                 goto error_term;
         }
         rv = siw_rresp_check_ntoh(srx, frx);
@@ -1290,11 +1298,13 @@ static int siw_rdmap_complete(struct siw_qp *qp, int error)
                                       wc_status);
                 siw_wqe_put_mem(wqe, SIW_OP_READ);
 
-                if (!error)
+                if (!error) {
                         rv = siw_check_tx_fence(qp);
-                else
-                        /* Disable current ORQ eleement */
-                        WRITE_ONCE(orq_get_current(qp)->flags, 0);
+                } else {
+                        /* Disable current ORQ element */
+                        if (qp->attrs.orq_size)
+                                WRITE_ONCE(orq_get_current(qp)->flags, 0);
+                }
                 break;
 
         case RDMAP_RDMA_READ_REQ:
